From 2ea9c95839ddf439873d652a27e88e6ed0535013 Mon Sep 17 00:00:00 2001 From: SheetJS Date: Fri, 18 Aug 2017 14:10:18 -0400 Subject: [PATCH] HTML and CSV parsing - blank cells are omitted or stubbed (fixes #770 h/t @doxma) - bare equals signs are interpreted as strings - CSV / TSV determination based on frequency (fixes #732 h/t @nknapp) - HTML DOM Element and CSV tests - XLS do not leak XF and index (fixes #782 h/t @the-spyke) - cellText and cellNF apply to CSV (fixes #781 h/t @the-spyke) --- .travis.yml | 2 + CHANGELOG.md | 4 ++ README.md | 12 +++++- bits/40_harb.js | 33 +++++++++++---- bits/61_fcommon.js | 6 +++ bits/76_xls.js | 8 +++- bits/79_html.js | 19 ++++++--- docbits/80_parseopts.md | 4 +- docbits/85_filetype.md | 8 ++++ misc/docs/README.md | 12 +++++- package.json | 1 + test.js | 93 ++++++++++++++++++++++++++++------------- tests/core.js | 93 ++++++++++++++++++++++++++++------------- types/write.ts | 2 +- xlsx.flow.js | 66 ++++++++++++++++++++++------- xlsx.js | 66 ++++++++++++++++++++++------- 16 files changed, 314 insertions(+), 115 deletions(-) diff --git a/.travis.yml b/.travis.yml index aaf734d..278255a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,6 +32,8 @@ before_install: - "npm install blanket" - "npm install xlsjs" - "npm install coveralls mocha-lcov-reporter" +# note: jsdom 11.x expects node >= 6 but is missing engines.node + - "npm install jsdom@11.x" before_script: - "make init" - "cd test_files; make all; cd -" diff --git a/CHANGELOG.md b/CHANGELOG.md index 332de5f..8c7d18a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ This log is intended to keep track of backwards-incompatible changes, including but not limited to API changes and file location changes. Minor behavioral changes may not be included if they are not expected to break existing code. +## Unreleased (2017-08-??) 
+ +* XLS cell ixfe/XF removed + ## 0.11.0 (2017-07-31) * Strip `require` statements from minified version diff --git a/README.md b/README.md index e6d9ca2..09f07ad 100644 --- a/README.md +++ b/README.md @@ -1385,7 +1385,7 @@ The exported `read` and `readFile` functions accept an options argument: | Option Name | Default | Description | | :---------- | ------: | :--------------------------------------------------- | | type | | Input data encoding (see Input Type below) | -| raw | | If true, plaintext parsing will not parse values ** | +| raw | false | If true, plaintext parsing will not parse values ** | | cellFormula | true | Save formulae to the .f field | | cellHTML | true | Parse rich text and save HTML to the `.h` field | | cellNF | false | Save number format string to the `.z` field | @@ -1473,8 +1473,8 @@ Plaintext format guessing follows the priority order: | XML | starts with `<` | | RTF | starts with `{\rt` | | DSV | starts with `/sep=.$/`, separator is the specified character | +| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 | | TSV | one of the first 1024 characters is a tab char `"\t"` | -| CSV | one of the first 1024 characters is a comma char `","` | | PRN | (default) | - HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div` @@ -1964,6 +1964,14 @@ writer proactively generates cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and codepage 1200. +Notes: + +- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic + Link files. Unlike Excel, if the file does not have a valid SYLK header, it + will be proactively reinterpreted as CSV. There are some files with semicolon + delimiter that align with a valid SYLK file. For the broadest compatibility, + all cells with the value of `ID` are automatically wrapped in double-quotes. 
+ ### Other Workbook Formats diff --git a/bits/40_harb.js b/bits/40_harb.js index 01e83aa..abb14cf 100644 --- a/bits/40_harb.js +++ b/bits/40_harb.js @@ -512,6 +512,16 @@ var PRN = (function() { return arr; } + function guess_sep(str) { /* guess the field separator ("," or "\t") by counting characters that appear outside double-quoted runs of str */ + var cnt = [], instr = false, end = 0, cc = 0; + for(;end < str.length;++end) { + if((cc=str.charCodeAt(end)) == 0x22) instr = !instr; + else if(!instr) cnt[cc] = (cnt[cc]||0)+1; + } + if(!cnt[0x09] || cnt[0x2C] > cnt[0x09]) return ","; /* no unquoted tabs, or unquoted commas outnumber tabs: CSV */ + return "\t"; /* tabs present and at least as frequent as commas: TSV */ + } + function dsv_to_sheet_str(str/*:string*/, opts)/*:Worksheet*/ { var o = opts || {}; var sep = ""; @@ -519,9 +529,8 @@ var PRN = (function() { var ws/*:Worksheet*/ = o.dense ? ([]/*:any*/) : ({}/*:any*/); var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:0}}/*:any*/); - /* known sep */ if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); } - else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t"; + else sep = guess_sep(str.substr(0,1024)); var R = 0, C = 0, v = 0; var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0; str = str.replace(/\r\n/mg, "\n"); @@ -529,24 +538,30 @@ var PRN = (function() { function finish_cell() { var s = str.slice(start, end); var cell = ({}/*:any*/); - if(o.raw) { cell.t = 's'; cell.v = s; } - else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); } + if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); + if(s.length == 0) cell.t = 'z'; + else if(o.raw) { cell.t = 's'; cell.v = s; } + else if(s.charCodeAt(0) == 0x3D) { + if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); } + else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); } + else { cell.t = 's'; cell.v = s; } } else if(s == "TRUE") { cell.t = 'b'; cell.v = true; } else if(s == "FALSE") { cell.t = 'b'; cell.v = false; } - else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; 
} + else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; } else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) { cell.z = o.dateNF || SSF._table[14]; var k = 0; if(_re && s.match(_re)){ s=dateNF_fix(s, o.dateNF, (s.match(_re)||[])); k=1; } if(o.cellDates) { cell.t = 'd'; cell.v = parseDate(s, k); } else { cell.t = 'n'; cell.v = datenum(parseDate(s, k)); } - cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v); + if(o.cellText !== false) cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v); + if(!o.cellNF) delete cell.z; } else { cell.t = 's'; - if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); cell.v = s; } - if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; } + if(cell.t == 'z'){} + else if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; } else ws[encode_cell({c:C,r:R})] = cell; start = end+1; if(range.e.c < C) range.e.c = C; @@ -579,7 +594,7 @@ var PRN = (function() { case 'array': str = cc2str(d); break; default: throw new Error("Unrecognized type " + opts.type); } - if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str); + if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str.slice(3)); return prn_to_sheet_str(str, opts); } diff --git a/bits/61_fcommon.js b/bits/61_fcommon.js index a23bb9a..1590c40 100644 --- a/bits/61_fcommon.js +++ b/bits/61_fcommon.js @@ -40,3 +40,9 @@ function shift_formula_xlsx(f/*:string*/, range/*:string*/, cell/*:string*/)/*:s var delta = {r:c.r - s.r, c:c.c - s.c}; return shift_formula_str(f, delta); } + +/* TODO: parse formula */ +function fuzzyfmla(f/*:string*/)/*:boolean*/ { + if(f.length == 1) return false; + return true; +} diff --git a/bits/76_xls.js b/bits/76_xls.js index 60f26ed..32287b5 100644 --- a/bits/76_xls.js +++ b/bits/76_xls.js @@ -131,6 +131,7 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ { 
if(file_depth > 1) return; if(!cell_valid) return; if(options.cellStyles && line.XF && line.XF.data) process_cell_style(cell, line, options); + delete line.ixfe; delete line.XF; lastcell = cell; last_cell = encode_cell(cell); if(range.s) { @@ -240,8 +241,11 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ { case 'FileSharing': break; //TODO case 'CodePage': /* overrides based on test cases */ - if(val === 0x5212) val = 1200; - else if(val === 0x8001) val = 1252; + switch(val) { + case 0x5212: val = 1200; break; + case 0x8000: val = 10000; break; + case 0x8001: val = 1252; break; + } opts.codepage = val; set_cp(val); break; diff --git a/bits/79_html.js b/bits/79_html.js index 8ff4bf6..ef1f835 100644 --- a/bits/79_html.js +++ b/bits/79_html.js @@ -37,13 +37,19 @@ var HTML_ = (function() { if(range.e.c < C) range.e.c = C; if(opts.dense) { if(!ws[R]) ws[R] = []; - if(opts.raw) ws[R][C] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[R][C] = {t:'s', v:m}; + else if(m === 'TRUE') ws[R][C] = {t:'b', v:true}; + else if(m === 'FALSE') ws[R][C] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)}; else ws[R][C] = {t:'s', v:m}; } else { var coord/*:string*/ = encode_cell({r:R, c:C}); /* TODO: value parsing */ - if(opts.raw) ws[coord] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[coord] = {t:'s', v:m}; + else if(m === 'TRUE') ws[coord] = {t:'b', v:true}; + else if(m === 'FALSE') ws[coord] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)}; else ws[coord] = {t:'s', v:m}; } @@ -126,7 +132,7 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ { var row = rows[R]; var elts = row.children; for(_C = C = 0; _C < elts.length; ++_C) { - var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent; + var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || ""; for(midx = 0; midx < merges.length; ++midx) { var m = merges[midx]; if(m.s.c == 
C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; } @@ -135,8 +141,11 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ { CS = +elt.getAttribute("colspan") || 1; if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}}); var o/*:Cell*/ = {t:'s', v:v}; - if(v != null && v.length) { - if(opts.raw) o = {t:'s', v:v}; + if(v != null) { + if(v.length == 0) o.t = 'z'; + else if(opts.raw){} + else if(v === 'TRUE') o = {t:'b', v:true}; + else if(v === 'FALSE') o = {t:'b', v:false}; else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)}; else if(!isNaN(fuzzydate(v).getDate())) { o = ({t:'d', v:parseDate(v)}/*:any*/); diff --git a/docbits/80_parseopts.md b/docbits/80_parseopts.md index c4c4b1c..0dce235 100644 --- a/docbits/80_parseopts.md +++ b/docbits/80_parseopts.md @@ -5,7 +5,7 @@ The exported `read` and `readFile` functions accept an options argument: | Option Name | Default | Description | | :---------- | ------: | :--------------------------------------------------- | | type | | Input data encoding (see Input Type below) | -| raw | | If true, plaintext parsing will not parse values ** | +| raw | false | If true, plaintext parsing will not parse values ** | | cellFormula | true | Save formulae to the .f field | | cellHTML | true | Parse rich text and save HTML to the `.h` field | | cellNF | false | Save number format string to the `.z` field | @@ -93,8 +93,8 @@ Plaintext format guessing follows the priority order: | XML | starts with `<` | | RTF | starts with `{\rt` | | DSV | starts with `/sep=.$/`, separator is the specified character | +| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 | | TSV | one of the first 1024 characters is a tab char `"\t"` | -| CSV | one of the first 1024 characters is a comma char `","` | | PRN | (default) | - HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div` diff --git a/docbits/85_filetype.md 
b/docbits/85_filetype.md index cbffded..1772a8c 100644 --- a/docbits/85_filetype.md +++ b/docbits/85_filetype.md @@ -113,6 +113,14 @@ writer proactively generates cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and codepage 1200. +Notes: + +- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic + Link files. Unlike Excel, if the file does not have a valid SYLK header, it + will be proactively reinterpreted as CSV. There are some files with semicolon + delimiter that align with a valid SYLK file. For the broadest compatibility, + all cells with the value of `ID` are automatically wrapped in double-quotes. + ### Other Workbook Formats diff --git a/misc/docs/README.md b/misc/docs/README.md index 901bd87..ea0fe00 100644 --- a/misc/docs/README.md +++ b/misc/docs/README.md @@ -1274,7 +1274,7 @@ The exported `read` and `readFile` functions accept an options argument: | Option Name | Default | Description | | :---------- | ------: | :--------------------------------------------------- | | type | | Input data encoding (see Input Type below) | -| raw | | If true, plaintext parsing will not parse values ** | +| raw | false | If true, plaintext parsing will not parse values ** | | cellFormula | true | Save formulae to the .f field | | cellHTML | true | Parse rich text and save HTML to the `.h` field | | cellNF | false | Save number format string to the `.z` field | @@ -1360,8 +1360,8 @@ Plaintext format guessing follows the priority order: | XML | starts with `<` | | RTF | starts with `{\rt` | | DSV | starts with `/sep=.$/`, separator is the specified character | +| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 | | TSV | one of the first 1024 characters is a tab char `"\t"` | -| CSV | one of the first 1024 characters is a comma char `","` | | PRN | (default) | - HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div` @@ -1809,6 +1809,14 @@ writer proactively generates 
cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and codepage 1200. +Notes: + +- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic + Link files. Unlike Excel, if the file does not have a valid SYLK header, it + will be proactively reinterpreted as CSV. There are some files with semicolon + delimiter that align with a valid SYLK file. For the broadest compatibility, + all cells with the value of `ID` are automatically wrapped in double-quotes. + ### Other Workbook Formats diff --git a/package.json b/package.json index c7c952e..c771ecd 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,7 @@ "@sheetjs/uglify-js":"~2.7.3", "@types/node":"^8.0.7", "@types/commander":"^2.9.0", + "jsdom": "~11.1.0", "dtslint": "^0.1.2", "typescript": "2.2.0" }, diff --git a/test.js b/test.js index 780aa7b..57934de 100644 --- a/test.js +++ b/test.js @@ -1723,6 +1723,39 @@ describe('json output', function() { }); }); + +var codes = [["あ 1", "\u00E3\u0081\u0082 1"]] +var plaintext_val = [ + ["A1", 'n', -0.08, "-0.08"], + ["B1", 'n', 4001, "4,001"], + ["C1", 's', "あ 1", "あ 1"], + ["A2", 'n', 41.08, "$41.08"], + ["B2", 'n', 0.11, "11%"], + ["B3", 'b', true, "TRUE"], + ["C3", 'b', false, "FALSE"], + ["A3"]]; +function plaintext_test(wb, raw, sn) { + var sheet = wb.Sheets[sn || wb.SheetNames[0]]; + plaintext_val.forEach(function(x) { + var cell = get_cell(sheet, x[0]); + if(x.length == 1) { if(cell) { assert.equal(cell.t, 'z'); assert(!cell.v); } return; } + assert.equal(cell.v, x[2+!!raw]); assert.equal(cell.t, raw ? 's' : x[1]); + }); +} +function make_html_str(idx) { return ["", + "", + "", + "", +"
-0.084,001", codes[0][idx], "
$41.0811%
TRUEFALSE
" ].join(""); } +function make_csv_str(idx) { return [ '\u00EF\u00BB\u00BF' + + '-0.08,"4,001",' + codes[0][idx] + '', + '$41.08,11%', + ',TRUE,FALSE' +].join("\n"); } +var html_bstr = make_html_str(1), html_str = make_html_str(0); +var csv_bstr = make_csv_str(1); + + describe('csv', function() { describe('input', function(){ var b = "1,2,3,\nTRUE,FALSE,,sheetjs\nfoo,bar,2/19/14,0.3\n,,,\nbaz,,qux,\n"; @@ -1769,6 +1802,17 @@ describe('csv', function() { assert.equal(cell.v.getMonth(), 2); assert.equal(cell.w, "2/3/14"); }); + it('should interpret values by default', function() { plaintext_test(X.read(csv_bstr, {type:"binary"}), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.read(csv_bstr, {type:"binary", raw:true}), true); }); + it('should handle formulae', function() { + var bb = '=,=1+1,="100"'; + var sheet = X.read(bb, {type:"binary"}).Sheets.Sheet1; + assert.equal(get_cell(sheet, "A1").t, 's'); + assert.equal(get_cell(sheet, "A1").v, '='); + assert.equal(get_cell(sheet, "B1").f, '1+1'); + assert.equal(get_cell(sheet, "C1").t, 's'); + assert.equal(get_cell(sheet, "C1").v, '100'); + }); }); describe('output', function(){ var data, ws; @@ -1845,37 +1889,26 @@ describe('csv', function() { }); }); +var JSDOM = null; +var domtest = browser || (function(){try{return !!(JSDOM=require('jsdom').JSDOM);}catch(e){return 0;}})(); + +function get_dom_element(html) { + if(browser) { + var domelt = document.createElement('div'); + domelt.innerHTML = html; + return domelt; + } + return new JSDOM(html).window.document.body.children[0]; +} + describe('HTML', function() { - describe('input', function(){ - var b = "
-0.084,001\u00e3\u0081\u0082 1
$41.0811%
"; - it('should generate numbers by default', function() { - var sheet = X.read(b, {type:"binary"}).Sheets.Sheet1; - var cell = get_cell(sheet, "A1"); - assert.equal(cell.v, -0.08); - assert.equal(cell.t, 'n'); - cell = get_cell(sheet, "B1"); - assert.equal(cell.v, 4001); - cell = get_cell(sheet, "C1"); - assert.equal(cell.v, "あ 1"); - cell = get_cell(sheet, "A2"); - assert.equal(cell.v, 41.08); - cell = get_cell(sheet, "B2"); - assert.equal(cell.v, .11); - }); - it('should generate strings if raw option is passed', function() { - var sheet = X.read(b, {type:"binary", raw:true}).Sheets.Sheet1; - var cell = get_cell(sheet, "A1"); - assert.equal(cell.v, "-0.08"); - assert.equal(cell.t, 's'); - cell = get_cell(sheet, "B1"); - assert.equal(cell.v, "4,001"); - cell = get_cell(sheet, "C1"); - assert.equal(cell.v, "あ 1"); - cell = get_cell(sheet, "A2"); - assert.equal(cell.v, "$41.08"); - cell = get_cell(sheet, "B2"); - assert.equal(cell.v, "11%"); - }); + describe('input string', function(){ + it('should interpret values by default', function() { plaintext_test(X.read(html_bstr, {type:"binary"}), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.read(html_bstr, {type:"binary", raw:true}), true); }); + }); + (domtest ? 
describe : describe.skip)('input DOM', function() { + it('should interpret values by default', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str)), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str), {raw:true}), true); }); }); }); diff --git a/tests/core.js b/tests/core.js index c4aec3e..d5a7079 100644 --- a/tests/core.js +++ b/tests/core.js @@ -1723,6 +1723,39 @@ describe('json output', function() { }); }); + +var codes = [["あ 1", "\u00E3\u0081\u0082 1"]] +var plaintext_val = [ + ["A1", 'n', -0.08, "-0.08"], + ["B1", 'n', 4001, "4,001"], + ["C1", 's', "あ 1", "あ 1"], + ["A2", 'n', 41.08, "$41.08"], + ["B2", 'n', 0.11, "11%"], + ["B3", 'b', true, "TRUE"], + ["C3", 'b', false, "FALSE"], + ["A3"]]; +function plaintext_test(wb, raw, sn) { + var sheet = wb.Sheets[sn || wb.SheetNames[0]]; + plaintext_val.forEach(function(x) { + var cell = get_cell(sheet, x[0]); + if(x.length == 1) { if(cell) { assert.equal(cell.t, 'z'); assert(!cell.v); } return; } + assert.equal(cell.v, x[2+!!raw]); assert.equal(cell.t, raw ? 's' : x[1]); + }); +} +function make_html_str(idx) { return ["", + "", + "", + "", +"
-0.084,001", codes[0][idx], "
$41.0811%
TRUEFALSE
" ].join(""); } +function make_csv_str(idx) { return [ '\u00EF\u00BB\u00BF' + + '-0.08,"4,001",' + codes[0][idx] + '', + '$41.08,11%', + ',TRUE,FALSE' +].join("\n"); } +var html_bstr = make_html_str(1), html_str = make_html_str(0); +var csv_bstr = make_csv_str(1); + + describe('csv', function() { describe('input', function(){ var b = "1,2,3,\nTRUE,FALSE,,sheetjs\nfoo,bar,2/19/14,0.3\n,,,\nbaz,,qux,\n"; @@ -1769,6 +1802,17 @@ describe('csv', function() { assert.equal(cell.v.getMonth(), 2); assert.equal(cell.w, "2/3/14"); }); + it('should interpret values by default', function() { plaintext_test(X.read(csv_bstr, {type:"binary"}), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.read(csv_bstr, {type:"binary", raw:true}), true); }); + it('should handle formulae', function() { + var bb = '=,=1+1,="100"'; + var sheet = X.read(bb, {type:"binary"}).Sheets.Sheet1; + assert.equal(get_cell(sheet, "A1").t, 's'); + assert.equal(get_cell(sheet, "A1").v, '='); + assert.equal(get_cell(sheet, "B1").f, '1+1'); + assert.equal(get_cell(sheet, "C1").t, 's'); + assert.equal(get_cell(sheet, "C1").v, '100'); + }); }); describe('output', function(){ var data, ws; @@ -1845,37 +1889,26 @@ describe('csv', function() { }); }); +var JSDOM = null; +var domtest = browser || (function(){try{return !!(JSDOM=require('jsdom').JSDOM);}catch(e){return 0;}})(); + +function get_dom_element(html) { + if(browser) { + var domelt = document.createElement('div'); + domelt.innerHTML = html; + return domelt; + } + return new JSDOM(html).window.document.body.children[0]; +} + describe('HTML', function() { - describe('input', function(){ - var b = "
-0.084,001\u00e3\u0081\u0082 1
$41.0811%
"; - it('should generate numbers by default', function() { - var sheet = X.read(b, {type:"binary"}).Sheets.Sheet1; - var cell = get_cell(sheet, "A1"); - assert.equal(cell.v, -0.08); - assert.equal(cell.t, 'n'); - cell = get_cell(sheet, "B1"); - assert.equal(cell.v, 4001); - cell = get_cell(sheet, "C1"); - assert.equal(cell.v, "あ 1"); - cell = get_cell(sheet, "A2"); - assert.equal(cell.v, 41.08); - cell = get_cell(sheet, "B2"); - assert.equal(cell.v, .11); - }); - it('should generate strings if raw option is passed', function() { - var sheet = X.read(b, {type:"binary", raw:true}).Sheets.Sheet1; - var cell = get_cell(sheet, "A1"); - assert.equal(cell.v, "-0.08"); - assert.equal(cell.t, 's'); - cell = get_cell(sheet, "B1"); - assert.equal(cell.v, "4,001"); - cell = get_cell(sheet, "C1"); - assert.equal(cell.v, "あ 1"); - cell = get_cell(sheet, "A2"); - assert.equal(cell.v, "$41.08"); - cell = get_cell(sheet, "B2"); - assert.equal(cell.v, "11%"); - }); + describe('input string', function(){ + it('should interpret values by default', function() { plaintext_test(X.read(html_bstr, {type:"binary"}), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.read(html_bstr, {type:"binary", raw:true}), true); }); + }); + (domtest ? 
describe : describe.skip)('input DOM', function() { + it('should interpret values by default', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str)), false); }); + it('should generate strings if raw option is passed', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str), {raw:true}), true); }); }); }); diff --git a/types/write.ts b/types/write.ts index ae83389..540f7ef 100644 --- a/types/write.ts +++ b/types/write.ts @@ -80,7 +80,7 @@ ws['!rows'] = wsrows; /* TEST: hyperlink note: Excel does not automatically style hyperlinks */ (ws['A3']).l = { Target: "http://sheetjs.com", Tooltip: "Visit us " }; -XLSX.utils.cell_set_hyperlink(ws['A3'], "http://sheetjs.com", "Visit us " ); +XLSX.utils.cell_set_hyperlink(ws['A3'], "http://sheetjs.com", "Visit us "); /* TEST: built-in format */ (ws['B1']).z = "0%"; // Format Code 9 diff --git a/xlsx.flow.js b/xlsx.flow.js index 038b89b..68aa9ad 100644 --- a/xlsx.flow.js +++ b/xlsx.flow.js @@ -5786,6 +5786,16 @@ var PRN = (function() { return arr; } + function guess_sep(str) { /* guess the field separator ("," or "\t") by counting characters that appear outside double-quoted runs of str */ + var cnt = [], instr = false, end = 0, cc = 0; + for(;end < str.length;++end) { + if((cc=str.charCodeAt(end)) == 0x22) instr = !instr; + else if(!instr) cnt[cc] = (cnt[cc]||0)+1; + } + if(!cnt[0x09] || cnt[0x2C] > cnt[0x09]) return ","; /* no unquoted tabs, or unquoted commas outnumber tabs: CSV */ + return "\t"; /* tabs present and at least as frequent as commas: TSV */ + } + function dsv_to_sheet_str(str/*:string*/, opts)/*:Worksheet*/ { var o = opts || {}; var sep = ""; @@ -5793,9 +5803,8 @@ var PRN = (function() { var ws/*:Worksheet*/ = o.dense ? 
([]/*:any*/) : ({}/*:any*/); var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:0}}/*:any*/); - /* known sep */ if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); } - else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t"; + else sep = guess_sep(str.substr(0,1024)); var R = 0, C = 0, v = 0; var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0; str = str.replace(/\r\n/mg, "\n"); @@ -5803,24 +5812,30 @@ var PRN = (function() { function finish_cell() { var s = str.slice(start, end); var cell = ({}/*:any*/); - if(o.raw) { cell.t = 's'; cell.v = s; } - else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); } + if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); + if(s.length == 0) cell.t = 'z'; + else if(o.raw) { cell.t = 's'; cell.v = s; } + else if(s.charCodeAt(0) == 0x3D) { + if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); } + else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); } + else { cell.t = 's'; cell.v = s; } } else if(s == "TRUE") { cell.t = 'b'; cell.v = true; } else if(s == "FALSE") { cell.t = 'b'; cell.v = false; } - else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; } + else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; } else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) { cell.z = o.dateNF || SSF._table[14]; var k = 0; if(_re && s.match(_re)){ s=dateNF_fix(s, o.dateNF, (s.match(_re)||[])); k=1; } if(o.cellDates) { cell.t = 'd'; cell.v = parseDate(s, k); } else { cell.t = 'n'; cell.v = datenum(parseDate(s, k)); } - cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v); + if(o.cellText !== false) cell.w = SSF.format(cell.z, cell.v instanceof Date ? 
datenum(cell.v):cell.v); + if(!o.cellNF) delete cell.z; } else { cell.t = 's'; - if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); cell.v = s; } - if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; } + if(cell.t == 'z'){} + else if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; } else ws[encode_cell({c:C,r:R})] = cell; start = end+1; if(range.e.c < C) range.e.c = C; @@ -5853,7 +5868,7 @@ var PRN = (function() { case 'array': str = cc2str(d); break; default: throw new Error("Unrecognized type " + opts.type); } - if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str); + if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str.slice(3)); return prn_to_sheet_str(str, opts); } @@ -8466,6 +8481,12 @@ function shift_formula_xlsx(f/*:string*/, range/*:string*/, cell/*:string*/)/*:s var delta = {r:c.r - s.r, c:c.c - s.c}; return shift_formula_str(f, delta); } + +/* TODO: parse formula */ +function fuzzyfmla(f/*:string*/)/*:boolean*/ { + if(f.length == 1) return false; + return true; +} /* --- formula references point to MS-XLS --- */ /* Small helpers */ function parseread(l) { return function(blob, length) { blob.l+=l; return; }; } @@ -14285,6 +14306,7 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ { if(file_depth > 1) return; if(!cell_valid) return; if(options.cellStyles && line.XF && line.XF.data) process_cell_style(cell, line, options); + delete line.ixfe; delete line.XF; lastcell = cell; last_cell = encode_cell(cell); if(range.s) { @@ -14394,8 +14416,11 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ { case 'FileSharing': break; //TODO case 'CodePage': /* overrides based on test cases */ - if(val === 0x5212) val = 1200; - else if(val === 0x8001) val = 1252; + switch(val) { + case 0x5212: val = 1200; break; + case 0x8000: val = 10000; break; + case 0x8001: val = 1252; break; + } opts.codepage = val; set_cp(val); break; @@ 
-16412,13 +16437,19 @@ var HTML_ = (function() { if(range.e.c < C) range.e.c = C; if(opts.dense) { if(!ws[R]) ws[R] = []; - if(opts.raw) ws[R][C] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[R][C] = {t:'s', v:m}; + else if(m === 'TRUE') ws[R][C] = {t:'b', v:true}; + else if(m === 'FALSE') ws[R][C] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)}; else ws[R][C] = {t:'s', v:m}; } else { var coord/*:string*/ = encode_cell({r:R, c:C}); /* TODO: value parsing */ - if(opts.raw) ws[coord] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[coord] = {t:'s', v:m}; + else if(m === 'TRUE') ws[coord] = {t:'b', v:true}; + else if(m === 'FALSE') ws[coord] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)}; else ws[coord] = {t:'s', v:m}; } @@ -16501,7 +16532,7 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ { var row = rows[R]; var elts = row.children; for(_C = C = 0; _C < elts.length; ++_C) { - var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent; + var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || ""; for(midx = 0; midx < merges.length; ++midx) { var m = merges[midx]; if(m.s.c == C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; } @@ -16510,8 +16541,11 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ { CS = +elt.getAttribute("colspan") || 1; if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}}); var o/*:Cell*/ = {t:'s', v:v}; - if(v != null && v.length) { - if(opts.raw) o = {t:'s', v:v}; + if(v != null) { + if(v.length == 0) o.t = 'z'; + else if(opts.raw){} + else if(v === 'TRUE') o = {t:'b', v:true}; + else if(v === 'FALSE') o = {t:'b', v:false}; else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)}; else if(!isNaN(fuzzydate(v).getDate())) { o = ({t:'d', v:parseDate(v)}/*:any*/); diff --git a/xlsx.js b/xlsx.js index 
8ee8c93..5bccca1 100644 --- a/xlsx.js +++ b/xlsx.js @@ -5712,6 +5712,16 @@ var PRN = (function() { return arr; } + function guess_sep(str) { /* guess the field separator ("," or "\t") by counting characters that appear outside double-quoted runs of str */ + var cnt = [], instr = false, end = 0, cc = 0; + for(;end < str.length;++end) { + if((cc=str.charCodeAt(end)) == 0x22) instr = !instr; + else if(!instr) cnt[cc] = (cnt[cc]||0)+1; + } + if(!cnt[0x09] || cnt[0x2C] > cnt[0x09]) return ","; /* no unquoted tabs, or unquoted commas outnumber tabs: CSV */ + return "\t"; /* tabs present and at least as frequent as commas: TSV */ + } + function dsv_to_sheet_str(str, opts) { var o = opts || {}; var sep = ""; @@ -5719,9 +5729,8 @@ var PRN = (function() { var ws = o.dense ? ([]) : ({}); var range = ({s: {c:0, r:0}, e: {c:0, r:0}}); - /* known sep */ if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); } - else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t"; + else sep = guess_sep(str.substr(0,1024)); var R = 0, C = 0, v = 0; var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0; str = str.replace(/\r\n/mg, "\n"); @@ -5729,24 +5738,30 @@ var PRN = (function() { function finish_cell() { var s = str.slice(start, end); var cell = ({}); - if(o.raw) { cell.t = 's'; cell.v = s; } - else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); } + if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); + if(s.length == 0) cell.t = 'z'; + else if(o.raw) { cell.t = 's'; cell.v = s; } + else if(s.charCodeAt(0) == 0x3D) { + if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); } + else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); } + else { cell.t = 's'; cell.v = s; } } else if(s == "TRUE") { cell.t = 'b'; cell.v = true; } else if(s == "FALSE") { cell.t = 'b'; cell.v = false; } - else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; } + else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; } else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) { 
/* TODO: parse formula */
/* Loose check that a cell string is a formula candidate: only a string of
 * length 1 is rejected; everything else is accepted. */
function fuzzyfmla(f) {
	return f.length != 1;
}
/* --- formula references point to MS-XLS --- */
/* Small helpers */
/* Build a reader that advances blob.l by the fixed amount l and yields nothing. */
function parseread(l) {
	return function(blob, length) {
		blob.l = blob.l + l;
	};
}
break; //TODO case 'CodePage': /* overrides based on test cases */ - if(val === 0x5212) val = 1200; - else if(val === 0x8001) val = 1252; + switch(val) { + case 0x5212: val = 1200; break; + case 0x8000: val = 10000; break; + case 0x8001: val = 1252; break; + } opts.codepage = val; set_cp(val); break; @@ -16326,13 +16351,19 @@ var HTML_ = (function() { if(range.e.c < C) range.e.c = C; if(opts.dense) { if(!ws[R]) ws[R] = []; - if(opts.raw) ws[R][C] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[R][C] = {t:'s', v:m}; + else if(m === 'TRUE') ws[R][C] = {t:'b', v:true}; + else if(m === 'FALSE') ws[R][C] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)}; else ws[R][C] = {t:'s', v:m}; } else { var coord = encode_cell({r:R, c:C}); /* TODO: value parsing */ - if(opts.raw) ws[coord] = {t:'s', v:m}; + if(!m.length){} + else if(opts.raw) ws[coord] = {t:'s', v:m}; + else if(m === 'TRUE') ws[coord] = {t:'b', v:true}; + else if(m === 'FALSE') ws[coord] = {t:'b', v:false}; else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)}; else ws[coord] = {t:'s', v:m}; } @@ -16415,7 +16446,7 @@ function parse_dom_table(table, _opts) { var row = rows[R]; var elts = row.children; for(_C = C = 0; _C < elts.length; ++_C) { - var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent; + var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || ""; for(midx = 0; midx < merges.length; ++midx) { var m = merges[midx]; if(m.s.c == C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; } @@ -16424,8 +16455,11 @@ function parse_dom_table(table, _opts) { CS = +elt.getAttribute("colspan") || 1; if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}}); var o = {t:'s', v:v}; - if(v != null && v.length) { - if(opts.raw) o = {t:'s', v:v}; + if(v != null) { + if(v.length == 0) o.t = 'z'; + else if(opts.raw){} + else if(v === 'TRUE') o = {t:'b', v:true}; + else if(v === 'FALSE') o = 
{t:'b', v:false}; else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)}; else if(!isNaN(fuzzydate(v).getDate())) { o = ({t:'d', v:parseDate(v)});