HTML and CSV parsing

- blank cells are omitted or stubbed (fixes #770 h/t @doxma)
- bare equals signs are interpreted as strings
- CSV / TSV determination based on frequency (fixes #732 h/t @nknapp)
- HTML DOM Element and CSV tests
- XLS do not leak XF and index (fixes #782 h/t @the-spyke)
- cellText and cellNF apply to CSV (fixes #781 h/t @the-spyke)
This commit is contained in:
SheetJS 2017-08-18 14:10:18 -04:00
parent f6981076ec
commit 2ea9c95839
16 changed files with 314 additions and 115 deletions

View File

@ -32,6 +32,8 @@ before_install:
- "npm install blanket"
- "npm install xlsjs"
- "npm install coveralls mocha-lcov-reporter"
# note: jsdom 11.x expects node >= 6 but is missing engines.node
- "npm install jsdom@11.x"
before_script:
- "make init"
- "cd test_files; make all; cd -"

View File

@ -4,6 +4,10 @@ This log is intended to keep track of backwards-incompatible changes, including
but not limited to API changes and file location changes. Minor behavioral
changes may not be included if they are not expected to break existing code.
## Unreleased (2017-08-??)
* XLS cell ixfe/XF removed
## 0.11.0 (2017-07-31)
* Strip `require` statements from minified version

View File

@ -1385,7 +1385,7 @@ The exported `read` and `readFile` functions accept an options argument:
| Option Name | Default | Description |
| :---------- | ------: | :--------------------------------------------------- |
| type | | Input data encoding (see Input Type below) |
| raw | | If true, plaintext parsing will not parse values ** |
| raw | false | If true, plaintext parsing will not parse values ** |
| cellFormula | true | Save formulae to the .f field |
| cellHTML | true | Parse rich text and save HTML to the `.h` field |
| cellNF | false | Save number format string to the `.z` field |
@ -1473,8 +1473,8 @@ Plaintext format guessing follows the priority order:
| XML | starts with `<` |
| RTF | starts with `{\rt` |
| DSV | starts with `/sep=.$/`, separator is the specified character |
| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 |
| TSV | one of the first 1024 characters is a tab char `"\t"` |
| CSV | one of the first 1024 characters is a comma char `","` |
| PRN | (default) |
- HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div`
@ -1964,6 +1964,14 @@ writer proactively generates cells for formulae if values are unavailable.
Excel TXT uses tab as the delimiter and codepage 1200.
Notes:
- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic
Link files. Unlike Excel, if the file does not have a valid SYLK header, it
will be proactively reinterpreted as CSV. There are some files with semicolon
delimiter that align with a valid SYLK file. For the broadest compatibility,
all cells with the value of `ID` are automatically wrapped in double-quotes.
</details>
### Other Workbook Formats

View File

@ -512,6 +512,16 @@ var PRN = (function() {
return arr;
}
/*
 * Guess the field separator of delimiter-separated text.
 *
 * Counts commas (0x2C) and tabs (0x09) that occur outside double-quoted runs
 * of `str` and returns:
 *   ","  when unquoted commas outnumber unquoted tabs
 *   "\t" when at least one unquoted tab exists and commas do not outnumber tabs
 *   ","  when neither character appears (default)
 */
function guess_sep(str) {
	var commas = 0, tabs = 0, instr = false, end = 0, cc = 0;
	for(;end < str.length;++end) {
		cc = str.charCodeAt(end);
		/* a double quote toggles quoted mode; separators inside quotes are data */
		if(cc == 0x22) instr = !instr;
		else if(instr) continue;
		else if(cc == 0x2C) ++commas;
		else if(cc == 0x09) ++tabs;
	}
	if(commas > tabs) return ",";
	/* explicit zero-based counters: a missing count must not make the comparison silently false */
	if(tabs > 0) return "\t";
	return ",";
}
function dsv_to_sheet_str(str/*:string*/, opts)/*:Worksheet*/ {
var o = opts || {};
var sep = "";
@ -519,9 +529,8 @@ var PRN = (function() {
var ws/*:Worksheet*/ = o.dense ? ([]/*:any*/) : ({}/*:any*/);
var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:0}}/*:any*/);
/* known sep */
if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); }
else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t";
else sep = guess_sep(str.substr(0,1024));
var R = 0, C = 0, v = 0;
var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0;
str = str.replace(/\r\n/mg, "\n");
@ -529,24 +538,30 @@ var PRN = (function() {
function finish_cell() {
var s = str.slice(start, end);
var cell = ({}/*:any*/);
if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); }
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
if(s.length == 0) cell.t = 'z';
else if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) {
if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); }
else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); }
else { cell.t = 's'; cell.v = s; } }
else if(s == "TRUE") { cell.t = 'b'; cell.v = true; }
else if(s == "FALSE") { cell.t = 'b'; cell.v = false; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; }
else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) {
cell.z = o.dateNF || SSF._table[14];
var k = 0;
if(_re && s.match(_re)){ s=dateNF_fix(s, o.dateNF, (s.match(_re)||[])); k=1; }
if(o.cellDates) { cell.t = 'd'; cell.v = parseDate(s, k); }
else { cell.t = 'n'; cell.v = datenum(parseDate(s, k)); }
cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(o.cellText !== false) cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(!o.cellNF) delete cell.z;
} else {
cell.t = 's';
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
cell.v = s;
}
if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
if(cell.t == 'z'){}
else if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
else ws[encode_cell({c:C,r:R})] = cell;
start = end+1;
if(range.e.c < C) range.e.c = C;
@ -579,7 +594,7 @@ var PRN = (function() {
case 'array': str = cc2str(d); break;
default: throw new Error("Unrecognized type " + opts.type);
}
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str);
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str.slice(3));
return prn_to_sheet_str(str, opts);
}

View File

@ -40,3 +40,9 @@ function shift_formula_xlsx(f/*:string*/, range/*:string*/, cell/*:string*/)/*:s
var delta = {r:c.r - s.r, c:c.c - s.c};
return shift_formula_str(f, delta);
}
/* TODO: parse formula */
/* Cheap formula heuristic: only a one-character string is rejected; every other length is accepted */
function fuzzyfmla(f/*:string*/)/*:boolean*/ {
	return f.length != 1;
}

View File

@ -131,6 +131,7 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ {
if(file_depth > 1) return;
if(!cell_valid) return;
if(options.cellStyles && line.XF && line.XF.data) process_cell_style(cell, line, options);
delete line.ixfe; delete line.XF;
lastcell = cell;
last_cell = encode_cell(cell);
if(range.s) {
@ -240,8 +241,11 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ {
case 'FileSharing': break; //TODO
case 'CodePage':
/* overrides based on test cases */
if(val === 0x5212) val = 1200;
else if(val === 0x8001) val = 1252;
switch(val) {
case 0x5212: val = 1200; break;
case 0x8000: val = 10000; break;
case 0x8001: val = 1252; break;
}
opts.codepage = val;
set_cp(val);
break;

View File

@ -37,13 +37,19 @@ var HTML_ = (function() {
if(range.e.c < C) range.e.c = C;
if(opts.dense) {
if(!ws[R]) ws[R] = [];
if(opts.raw) ws[R][C] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[R][C] = {t:'s', v:m};
else if(m === 'TRUE') ws[R][C] = {t:'b', v:true};
else if(m === 'FALSE') ws[R][C] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)};
else ws[R][C] = {t:'s', v:m};
} else {
var coord/*:string*/ = encode_cell({r:R, c:C});
/* TODO: value parsing */
if(opts.raw) ws[coord] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[coord] = {t:'s', v:m};
else if(m === 'TRUE') ws[coord] = {t:'b', v:true};
else if(m === 'FALSE') ws[coord] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)};
else ws[coord] = {t:'s', v:m};
}
@ -126,7 +132,7 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ {
var row = rows[R];
var elts = row.children;
for(_C = C = 0; _C < elts.length; ++_C) {
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent;
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || "";
for(midx = 0; midx < merges.length; ++midx) {
var m = merges[midx];
if(m.s.c == C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; }
@ -135,8 +141,11 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ {
CS = +elt.getAttribute("colspan") || 1;
if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}});
var o/*:Cell*/ = {t:'s', v:v};
if(v != null && v.length) {
if(opts.raw) o = {t:'s', v:v};
if(v != null) {
if(v.length == 0) o.t = 'z';
else if(opts.raw){}
else if(v === 'TRUE') o = {t:'b', v:true};
else if(v === 'FALSE') o = {t:'b', v:false};
else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)};
else if(!isNaN(fuzzydate(v).getDate())) {
o = ({t:'d', v:parseDate(v)}/*:any*/);

View File

@ -5,7 +5,7 @@ The exported `read` and `readFile` functions accept an options argument:
| Option Name | Default | Description |
| :---------- | ------: | :--------------------------------------------------- |
| type | | Input data encoding (see Input Type below) |
| raw | | If true, plaintext parsing will not parse values ** |
| raw | false | If true, plaintext parsing will not parse values ** |
| cellFormula | true | Save formulae to the .f field |
| cellHTML | true | Parse rich text and save HTML to the `.h` field |
| cellNF | false | Save number format string to the `.z` field |
@ -93,8 +93,8 @@ Plaintext format guessing follows the priority order:
| XML | starts with `<` |
| RTF | starts with `{\rt` |
| DSV | starts with `/sep=.$/`, separator is the specified character |
| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 |
| TSV | one of the first 1024 characters is a tab char `"\t"` |
| CSV | one of the first 1024 characters is a comma char `","` |
| PRN | (default) |
- HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div`

View File

@ -113,6 +113,14 @@ writer proactively generates cells for formulae if values are unavailable.
Excel TXT uses tab as the delimiter and codepage 1200.
Notes:
- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic
Link files. Unlike Excel, if the file does not have a valid SYLK header, it
will be proactively reinterpreted as CSV. There are some files with semicolon
delimiter that align with a valid SYLK file. For the broadest compatibility,
all cells with the value of `ID` are automatically wrapped in double-quotes.
</details>
### Other Workbook Formats

View File

@ -1274,7 +1274,7 @@ The exported `read` and `readFile` functions accept an options argument:
| Option Name | Default | Description |
| :---------- | ------: | :--------------------------------------------------- |
| type | | Input data encoding (see Input Type below) |
| raw | | If true, plaintext parsing will not parse values ** |
| raw | false | If true, plaintext parsing will not parse values ** |
| cellFormula | true | Save formulae to the .f field |
| cellHTML | true | Parse rich text and save HTML to the `.h` field |
| cellNF | false | Save number format string to the `.z` field |
@ -1360,8 +1360,8 @@ Plaintext format guessing follows the priority order:
| XML | starts with `<` |
| RTF | starts with `{\rt` |
| DSV | starts with `/sep=.$/`, separator is the specified character |
| CSV | more unquoted `","` characters than `"\t"` chars in the first 1024 |
| TSV | one of the first 1024 characters is a tab char `"\t"` |
| CSV | one of the first 1024 characters is a comma char `","` |
| PRN | (default) |
- HTML tags include: `html`, `table`, `head`, `meta`, `script`, `style`, `div`
@ -1809,6 +1809,14 @@ writer proactively generates cells for formulae if values are unavailable.
Excel TXT uses tab as the delimiter and codepage 1200.
Notes:
- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic
Link files. Unlike Excel, if the file does not have a valid SYLK header, it
will be proactively reinterpreted as CSV. There are some files with semicolon
delimiter that align with a valid SYLK file. For the broadest compatibility,
all cells with the value of `ID` are automatically wrapped in double-quotes.
### Other Workbook Formats

View File

@ -31,6 +31,7 @@
"@sheetjs/uglify-js":"~2.7.3",
"@types/node":"^8.0.7",
"@types/commander":"^2.9.0",
"jsdom": "~11.1.0",
"dtslint": "^0.1.2",
"typescript": "2.2.0"
},

93
test.js
View File

@ -1723,6 +1723,39 @@ describe('json output', function() {
});
});
/* each entry: [Unicode text, the same text as a byte string whose characters are its UTF-8 bytes] */
var codes = [["あ 1", "\u00E3\u0081\u0082 1"]]
/* expected cells: [address, type when parsed, value when parsed, value when read raw];
   an entry holding only an address marks a cell that must be absent or a blank stub */
var plaintext_val = [
["A1", 'n', -0.08, "-0.08"],
["B1", 'n', 4001, "4,001"],
["C1", 's', "あ 1", "あ 1"],
["A2", 'n', 41.08, "$41.08"],
["B2", 'n', 0.11, "11%"],
["B3", 'b', true, "TRUE"],
["C3", 'b', false, "FALSE"],
["A3"]];
/*
 * Check a parsed workbook against the plaintext_val expectations.
 * wb:  workbook to inspect
 * raw: truthy when the workbook was read with the raw option (all cells are strings)
 * sn:  optional sheet name; defaults to the first sheet
 */
function plaintext_test(wb, raw, sn) {
	var sheet = wb.Sheets[sn || wb.SheetNames[0]];
	for(var i = 0; i < plaintext_val.length; ++i) {
		var expected = plaintext_val[i];
		var cell = get_cell(sheet, expected[0]);
		if(expected.length == 1) {
			/* blank cell: either omitted entirely or present as a valueless stub */
			if(cell) { assert.equal(cell.t, 'z'); assert(!cell.v); }
			continue;
		}
		assert.equal(cell.v, raw ? expected[3] : expected[2]);
		assert.equal(cell.t, raw ? 's' : expected[1]);
	}
}
/* Build the HTML table fixture; idx selects which form of codes[0] fills the third cell of row 1 */
function make_html_str(idx) {
	var parts = [];
	parts.push("<table>");
	parts.push("<tr><td>-0.08</td><td>4,001</td><td>", codes[0][idx], "</td></tr>");
	parts.push("<tr><td>$41.08</td><td>11%</td></tr>");
	parts.push("<tr><td></td><td>TRUE</td><td>FALSE</td></tr>");
	parts.push("</table>");
	return parts.join("");
}
/* Build the CSV fixture, prefixed with the UTF-8 byte order mark written as three byte characters;
   idx selects which form of codes[0] fills the third field of line 1 */
function make_csv_str(idx) {
	var bom = '\u00EF\u00BB\u00BF';
	var lines = [];
	lines.push(bom + '-0.08,"4,001",' + codes[0][idx]);
	lines.push('$41.08,11%');
	lines.push(',TRUE,FALSE');
	return lines.join("\n");
}
/* *_bstr fixtures use the byte-string form (index 1) and are read with type:"binary";
   html_str uses the Unicode form (index 0) and feeds the DOM element tests */
var html_bstr = make_html_str(1), html_str = make_html_str(0);
var csv_bstr = make_csv_str(1);
describe('csv', function() {
describe('input', function(){
var b = "1,2,3,\nTRUE,FALSE,,sheetjs\nfoo,bar,2/19/14,0.3\n,,,\nbaz,,qux,\n";
@ -1769,6 +1802,17 @@ describe('csv', function() {
assert.equal(cell.v.getMonth(), 2);
assert.equal(cell.w, "2/3/14");
});
it('should interpret values by default', function() { plaintext_test(X.read(csv_bstr, {type:"binary"}), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.read(csv_bstr, {type:"binary", raw:true}), true); });
it('should handle formulae', function() {
var bb = '=,=1+1,="100"';
var sheet = X.read(bb, {type:"binary"}).Sheets.Sheet1;
assert.equal(get_cell(sheet, "A1").t, 's');
assert.equal(get_cell(sheet, "A1").v, '=');
assert.equal(get_cell(sheet, "B1").f, '1+1');
assert.equal(get_cell(sheet, "C1").t, 's');
assert.equal(get_cell(sheet, "C1").v, '100');
});
});
describe('output', function(){
var data, ws;
@ -1845,37 +1889,26 @@ describe('csv', function() {
});
});
var JSDOM = null;
var domtest = browser || (function(){try{return !!(JSDOM=require('jsdom').JSDOM);}catch(e){return 0;}})();
/*
 * Turn an HTML string into a DOM element for the table parser.
 * Outside the browser the markup is loaded with JSDOM and the first child of <body> is returned;
 * in the browser a <div> wrapping the markup is returned.
 */
function get_dom_element(html) {
	if(!browser) {
		var dom = new JSDOM(html);
		return dom.window.document.body.children[0];
	}
	var wrapper = document.createElement('div');
	wrapper.innerHTML = html;
	return wrapper;
}
describe('HTML', function() {
describe('input', function(){
var b = "<table><tr><td>-0.08</td><td>4,001</td><td>\u00e3\u0081\u0082 1</td></tr><tr><td>$41.08</td><td>11%</td></tr></table>";
it('should generate numbers by default', function() {
var sheet = X.read(b, {type:"binary"}).Sheets.Sheet1;
var cell = get_cell(sheet, "A1");
assert.equal(cell.v, -0.08);
assert.equal(cell.t, 'n');
cell = get_cell(sheet, "B1");
assert.equal(cell.v, 4001);
cell = get_cell(sheet, "C1");
assert.equal(cell.v, "あ 1");
cell = get_cell(sheet, "A2");
assert.equal(cell.v, 41.08);
cell = get_cell(sheet, "B2");
assert.equal(cell.v, .11);
});
it('should generate strings if raw option is passed', function() {
var sheet = X.read(b, {type:"binary", raw:true}).Sheets.Sheet1;
var cell = get_cell(sheet, "A1");
assert.equal(cell.v, "-0.08");
assert.equal(cell.t, 's');
cell = get_cell(sheet, "B1");
assert.equal(cell.v, "4,001");
cell = get_cell(sheet, "C1");
assert.equal(cell.v, "あ 1");
cell = get_cell(sheet, "A2");
assert.equal(cell.v, "$41.08");
cell = get_cell(sheet, "B2");
assert.equal(cell.v, "11%");
});
describe('input string', function(){
it('should interpret values by default', function() { plaintext_test(X.read(html_bstr, {type:"binary"}), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.read(html_bstr, {type:"binary", raw:true}), true); });
});
(domtest ? describe : describe.skip)('input DOM', function() {
it('should interpret values by default', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str)), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str), {raw:true}), true); });
});
});

View File

@ -1723,6 +1723,39 @@ describe('json output', function() {
});
});
/* each entry: [Unicode text, the same text as a byte string whose characters are its UTF-8 bytes] */
var codes = [["あ 1", "\u00E3\u0081\u0082 1"]]
/* expected cells: [address, type when parsed, value when parsed, value when read raw];
   an entry holding only an address marks a cell that must be absent or a blank stub */
var plaintext_val = [
["A1", 'n', -0.08, "-0.08"],
["B1", 'n', 4001, "4,001"],
["C1", 's', "あ 1", "あ 1"],
["A2", 'n', 41.08, "$41.08"],
["B2", 'n', 0.11, "11%"],
["B3", 'b', true, "TRUE"],
["C3", 'b', false, "FALSE"],
["A3"]];
/*
 * Check a parsed workbook against the plaintext_val expectations.
 * wb:  workbook to inspect
 * raw: truthy when the workbook was read with the raw option (all cells are strings)
 * sn:  optional sheet name; defaults to the first sheet
 */
function plaintext_test(wb, raw, sn) {
	var sheet = wb.Sheets[sn || wb.SheetNames[0]];
	for(var i = 0; i < plaintext_val.length; ++i) {
		var expected = plaintext_val[i];
		var cell = get_cell(sheet, expected[0]);
		if(expected.length == 1) {
			/* blank cell: either omitted entirely or present as a valueless stub */
			if(cell) { assert.equal(cell.t, 'z'); assert(!cell.v); }
			continue;
		}
		assert.equal(cell.v, raw ? expected[3] : expected[2]);
		assert.equal(cell.t, raw ? 's' : expected[1]);
	}
}
/* Build the HTML table fixture; idx selects which form of codes[0] fills the third cell of row 1 */
function make_html_str(idx) {
	var parts = [];
	parts.push("<table>");
	parts.push("<tr><td>-0.08</td><td>4,001</td><td>", codes[0][idx], "</td></tr>");
	parts.push("<tr><td>$41.08</td><td>11%</td></tr>");
	parts.push("<tr><td></td><td>TRUE</td><td>FALSE</td></tr>");
	parts.push("</table>");
	return parts.join("");
}
/* Build the CSV fixture, prefixed with the UTF-8 byte order mark written as three byte characters;
   idx selects which form of codes[0] fills the third field of line 1 */
function make_csv_str(idx) {
	var bom = '\u00EF\u00BB\u00BF';
	var lines = [];
	lines.push(bom + '-0.08,"4,001",' + codes[0][idx]);
	lines.push('$41.08,11%');
	lines.push(',TRUE,FALSE');
	return lines.join("\n");
}
/* *_bstr fixtures use the byte-string form (index 1) and are read with type:"binary";
   html_str uses the Unicode form (index 0) and feeds the DOM element tests */
var html_bstr = make_html_str(1), html_str = make_html_str(0);
var csv_bstr = make_csv_str(1);
describe('csv', function() {
describe('input', function(){
var b = "1,2,3,\nTRUE,FALSE,,sheetjs\nfoo,bar,2/19/14,0.3\n,,,\nbaz,,qux,\n";
@ -1769,6 +1802,17 @@ describe('csv', function() {
assert.equal(cell.v.getMonth(), 2);
assert.equal(cell.w, "2/3/14");
});
it('should interpret values by default', function() { plaintext_test(X.read(csv_bstr, {type:"binary"}), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.read(csv_bstr, {type:"binary", raw:true}), true); });
it('should handle formulae', function() {
var bb = '=,=1+1,="100"';
var sheet = X.read(bb, {type:"binary"}).Sheets.Sheet1;
assert.equal(get_cell(sheet, "A1").t, 's');
assert.equal(get_cell(sheet, "A1").v, '=');
assert.equal(get_cell(sheet, "B1").f, '1+1');
assert.equal(get_cell(sheet, "C1").t, 's');
assert.equal(get_cell(sheet, "C1").v, '100');
});
});
describe('output', function(){
var data, ws;
@ -1845,37 +1889,26 @@ describe('csv', function() {
});
});
var JSDOM = null;
var domtest = browser || (function(){try{return !!(JSDOM=require('jsdom').JSDOM);}catch(e){return 0;}})();
/*
 * Turn an HTML string into a DOM element for the table parser.
 * Outside the browser the markup is loaded with JSDOM and the first child of <body> is returned;
 * in the browser a <div> wrapping the markup is returned.
 */
function get_dom_element(html) {
	if(!browser) {
		var dom = new JSDOM(html);
		return dom.window.document.body.children[0];
	}
	var wrapper = document.createElement('div');
	wrapper.innerHTML = html;
	return wrapper;
}
describe('HTML', function() {
describe('input', function(){
var b = "<table><tr><td>-0.08</td><td>4,001</td><td>\u00e3\u0081\u0082 1</td></tr><tr><td>$41.08</td><td>11%</td></tr></table>";
it('should generate numbers by default', function() {
var sheet = X.read(b, {type:"binary"}).Sheets.Sheet1;
var cell = get_cell(sheet, "A1");
assert.equal(cell.v, -0.08);
assert.equal(cell.t, 'n');
cell = get_cell(sheet, "B1");
assert.equal(cell.v, 4001);
cell = get_cell(sheet, "C1");
assert.equal(cell.v, "あ 1");
cell = get_cell(sheet, "A2");
assert.equal(cell.v, 41.08);
cell = get_cell(sheet, "B2");
assert.equal(cell.v, .11);
});
it('should generate strings if raw option is passed', function() {
var sheet = X.read(b, {type:"binary", raw:true}).Sheets.Sheet1;
var cell = get_cell(sheet, "A1");
assert.equal(cell.v, "-0.08");
assert.equal(cell.t, 's');
cell = get_cell(sheet, "B1");
assert.equal(cell.v, "4,001");
cell = get_cell(sheet, "C1");
assert.equal(cell.v, "あ 1");
cell = get_cell(sheet, "A2");
assert.equal(cell.v, "$41.08");
cell = get_cell(sheet, "B2");
assert.equal(cell.v, "11%");
});
describe('input string', function(){
it('should interpret values by default', function() { plaintext_test(X.read(html_bstr, {type:"binary"}), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.read(html_bstr, {type:"binary", raw:true}), true); });
});
(domtest ? describe : describe.skip)('input DOM', function() {
it('should interpret values by default', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str)), false); });
it('should generate strings if raw option is passed', function() { plaintext_test(X.utils.table_to_book(get_dom_element(html_str), {raw:true}), true); });
});
});

View File

@ -80,7 +80,7 @@ ws['!rows'] = wsrows;
/* TEST: hyperlink note: Excel does not automatically style hyperlinks */
(<XLSX.CellObject>ws['A3']).l = { Target: "http://sheetjs.com", Tooltip: "Visit us <SheetJS.com!>" };
XLSX.utils.cell_set_hyperlink(ws['A3'], "http://sheetjs.com", "Visit us <SheetJS.com!>" );
XLSX.utils.cell_set_hyperlink(ws['A3'], "http://sheetjs.com", "Visit us <SheetJS.com!>");
/* TEST: built-in format */
(<XLSX.CellObject>ws['B1']).z = "0%"; // Format Code 9

View File

@ -5786,6 +5786,16 @@ var PRN = (function() {
return arr;
}
/*
 * Guess the field separator of delimiter-separated text.
 *
 * Counts commas (0x2C) and tabs (0x09) that occur outside double-quoted runs
 * of `str` and returns:
 *   ","  when unquoted commas outnumber unquoted tabs
 *   "\t" when at least one unquoted tab exists and commas do not outnumber tabs
 *   ","  when neither character appears (default)
 */
function guess_sep(str) {
	var commas = 0, tabs = 0, instr = false, end = 0, cc = 0;
	for(;end < str.length;++end) {
		cc = str.charCodeAt(end);
		/* a double quote toggles quoted mode; separators inside quotes are data */
		if(cc == 0x22) instr = !instr;
		else if(instr) continue;
		else if(cc == 0x2C) ++commas;
		else if(cc == 0x09) ++tabs;
	}
	if(commas > tabs) return ",";
	/* explicit zero-based counters: a missing count must not make the comparison silently false */
	if(tabs > 0) return "\t";
	return ",";
}
function dsv_to_sheet_str(str/*:string*/, opts)/*:Worksheet*/ {
var o = opts || {};
var sep = "";
@ -5793,9 +5803,8 @@ var PRN = (function() {
var ws/*:Worksheet*/ = o.dense ? ([]/*:any*/) : ({}/*:any*/);
var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:0}}/*:any*/);
/* known sep */
if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); }
else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t";
else sep = guess_sep(str.substr(0,1024));
var R = 0, C = 0, v = 0;
var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0;
str = str.replace(/\r\n/mg, "\n");
@ -5803,24 +5812,30 @@ var PRN = (function() {
function finish_cell() {
var s = str.slice(start, end);
var cell = ({}/*:any*/);
if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); }
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
if(s.length == 0) cell.t = 'z';
else if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) {
if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); }
else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); }
else { cell.t = 's'; cell.v = s; } }
else if(s == "TRUE") { cell.t = 'b'; cell.v = true; }
else if(s == "FALSE") { cell.t = 'b'; cell.v = false; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; }
else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) {
cell.z = o.dateNF || SSF._table[14];
var k = 0;
if(_re && s.match(_re)){ s=dateNF_fix(s, o.dateNF, (s.match(_re)||[])); k=1; }
if(o.cellDates) { cell.t = 'd'; cell.v = parseDate(s, k); }
else { cell.t = 'n'; cell.v = datenum(parseDate(s, k)); }
cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(o.cellText !== false) cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(!o.cellNF) delete cell.z;
} else {
cell.t = 's';
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
cell.v = s;
}
if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
if(cell.t == 'z'){}
else if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
else ws[encode_cell({c:C,r:R})] = cell;
start = end+1;
if(range.e.c < C) range.e.c = C;
@ -5853,7 +5868,7 @@ var PRN = (function() {
case 'array': str = cc2str(d); break;
default: throw new Error("Unrecognized type " + opts.type);
}
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str);
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str.slice(3));
return prn_to_sheet_str(str, opts);
}
@ -8466,6 +8481,12 @@ function shift_formula_xlsx(f/*:string*/, range/*:string*/, cell/*:string*/)/*:s
var delta = {r:c.r - s.r, c:c.c - s.c};
return shift_formula_str(f, delta);
}
/* TODO: parse formula */
/* Cheap formula heuristic: only a one-character string is rejected; every other length is accepted */
function fuzzyfmla(f/*:string*/)/*:boolean*/ {
	return f.length != 1;
}
/* --- formula references point to MS-XLS --- */
/* Small helpers */
function parseread(l) { return function(blob, length) { blob.l+=l; return; }; }
@ -14285,6 +14306,7 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ {
if(file_depth > 1) return;
if(!cell_valid) return;
if(options.cellStyles && line.XF && line.XF.data) process_cell_style(cell, line, options);
delete line.ixfe; delete line.XF;
lastcell = cell;
last_cell = encode_cell(cell);
if(range.s) {
@ -14394,8 +14416,11 @@ function parse_workbook(blob, options/*:ParseOpts*/)/*:Workbook*/ {
case 'FileSharing': break; //TODO
case 'CodePage':
/* overrides based on test cases */
if(val === 0x5212) val = 1200;
else if(val === 0x8001) val = 1252;
switch(val) {
case 0x5212: val = 1200; break;
case 0x8000: val = 10000; break;
case 0x8001: val = 1252; break;
}
opts.codepage = val;
set_cp(val);
break;
@ -16412,13 +16437,19 @@ var HTML_ = (function() {
if(range.e.c < C) range.e.c = C;
if(opts.dense) {
if(!ws[R]) ws[R] = [];
if(opts.raw) ws[R][C] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[R][C] = {t:'s', v:m};
else if(m === 'TRUE') ws[R][C] = {t:'b', v:true};
else if(m === 'FALSE') ws[R][C] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)};
else ws[R][C] = {t:'s', v:m};
} else {
var coord/*:string*/ = encode_cell({r:R, c:C});
/* TODO: value parsing */
if(opts.raw) ws[coord] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[coord] = {t:'s', v:m};
else if(m === 'TRUE') ws[coord] = {t:'b', v:true};
else if(m === 'FALSE') ws[coord] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)};
else ws[coord] = {t:'s', v:m};
}
@ -16501,7 +16532,7 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ {
var row = rows[R];
var elts = row.children;
for(_C = C = 0; _C < elts.length; ++_C) {
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent;
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || "";
for(midx = 0; midx < merges.length; ++midx) {
var m = merges[midx];
if(m.s.c == C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; }
@ -16510,8 +16541,11 @@ function parse_dom_table(table/*:HTMLElement*/, _opts/*:?any*/)/*:Worksheet*/ {
CS = +elt.getAttribute("colspan") || 1;
if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}});
var o/*:Cell*/ = {t:'s', v:v};
if(v != null && v.length) {
if(opts.raw) o = {t:'s', v:v};
if(v != null) {
if(v.length == 0) o.t = 'z';
else if(opts.raw){}
else if(v === 'TRUE') o = {t:'b', v:true};
else if(v === 'FALSE') o = {t:'b', v:false};
else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)};
else if(!isNaN(fuzzydate(v).getDate())) {
o = ({t:'d', v:parseDate(v)}/*:any*/);

66
xlsx.js
View File

@ -5712,6 +5712,16 @@ var PRN = (function() {
return arr;
}
/*
 * Guess the field separator of delimiter-separated text.
 *
 * Counts commas (0x2C) and tabs (0x09) that occur outside double-quoted runs
 * of `str` and returns:
 *   ","  when unquoted commas outnumber unquoted tabs
 *   "\t" when at least one unquoted tab exists and commas do not outnumber tabs
 *   ","  when neither character appears (default)
 */
function guess_sep(str) {
	var commas = 0, tabs = 0, instr = false, end = 0, cc = 0;
	for(;end < str.length;++end) {
		cc = str.charCodeAt(end);
		/* a double quote toggles quoted mode; separators inside quotes are data */
		if(cc == 0x22) instr = !instr;
		else if(instr) continue;
		else if(cc == 0x2C) ++commas;
		else if(cc == 0x09) ++tabs;
	}
	if(commas > tabs) return ",";
	/* explicit zero-based counters: a missing count must not make the comparison silently false */
	if(tabs > 0) return "\t";
	return ",";
}
function dsv_to_sheet_str(str, opts) {
var o = opts || {};
var sep = "";
@ -5719,9 +5729,8 @@ var PRN = (function() {
var ws = o.dense ? ([]) : ({});
var range = ({s: {c:0, r:0}, e: {c:0, r:0}});
/* known sep */
if(str.substr(0,4) == "sep=" && str.charCodeAt(5) == 10) { sep = str.charAt(4); str = str.substr(6); }
else if(str.substr(0,1024).indexOf("\t") == -1) sep = ","; else sep = "\t";
else sep = guess_sep(str.substr(0,1024));
var R = 0, C = 0, v = 0;
var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0;
str = str.replace(/\r\n/mg, "\n");
@ -5729,24 +5738,30 @@ var PRN = (function() {
function finish_cell() {
var s = str.slice(start, end);
var cell = ({});
if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) { cell.t = 'n'; cell.f = s.substr(1); }
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
if(s.length == 0) cell.t = 'z';
else if(o.raw) { cell.t = 's'; cell.v = s; }
else if(s.charCodeAt(0) == 0x3D) {
if(s.charCodeAt(1) == 0x22 && s.charCodeAt(s.length - 1) == 0x22) { cell.t = 's'; cell.v = s.slice(2,-1).replace(/""/g,'"'); }
else if(fuzzyfmla(s)) { cell.t = 'n'; cell.f = s.substr(1); }
else { cell.t = 's'; cell.v = s; } }
else if(s == "TRUE") { cell.t = 'b'; cell.v = true; }
else if(s == "FALSE") { cell.t = 'b'; cell.v = false; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; cell.w = s; cell.v = v; }
else if(!isNaN(v = fuzzynum(s))) { cell.t = 'n'; if(o.cellText !== false) cell.w = s; cell.v = v; }
else if(!isNaN(fuzzydate(s).getDate()) || _re && s.match(_re)) {
cell.z = o.dateNF || SSF._table[14];
var k = 0;
if(_re && s.match(_re)){ s=dateNF_fix(s, o.dateNF, (s.match(_re)||[])); k=1; }
if(o.cellDates) { cell.t = 'd'; cell.v = parseDate(s, k); }
else { cell.t = 'n'; cell.v = datenum(parseDate(s, k)); }
cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(o.cellText !== false) cell.w = SSF.format(cell.z, cell.v instanceof Date ? datenum(cell.v):cell.v);
if(!o.cellNF) delete cell.z;
} else {
cell.t = 's';
if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
cell.v = s;
}
if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
if(cell.t == 'z'){}
else if(o.dense) { if(!ws[R]) ws[R] = []; ws[R][C] = cell; }
else ws[encode_cell({c:C,r:R})] = cell;
start = end+1;
if(range.e.c < C) range.e.c = C;
@ -5779,7 +5794,7 @@ var PRN = (function() {
case 'array': str = cc2str(d); break;
default: throw new Error("Unrecognized type " + opts.type);
}
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str);
if(bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) str = utf8read(str.slice(3));
return prn_to_sheet_str(str, opts);
}
@ -8389,6 +8404,12 @@ function shift_formula_xlsx(f, range, cell) {
var delta = {r:c.r - s.r, c:c.c - s.c};
return shift_formula_str(f, delta);
}
/* TODO: parse formula */
/* Cheap formula heuristic: only a one-character string is rejected; every other length is accepted */
function fuzzyfmla(f) {
	return f.length != 1;
}
/* --- formula references point to MS-XLS --- */
/* Small helpers */
function parseread(l) { return function(blob, length) { blob.l+=l; return; }; }
@ -14200,6 +14221,7 @@ function parse_workbook(blob, options) {
if(file_depth > 1) return;
if(!cell_valid) return;
if(options.cellStyles && line.XF && line.XF.data) process_cell_style(cell, line, options);
delete line.ixfe; delete line.XF;
lastcell = cell;
last_cell = encode_cell(cell);
if(range.s) {
@ -14308,8 +14330,11 @@ wb.opts.Date1904 = Workbook.WBProps.date1904 = val; break;
case 'FileSharing': break; //TODO
case 'CodePage':
/* overrides based on test cases */
if(val === 0x5212) val = 1200;
else if(val === 0x8001) val = 1252;
switch(val) {
case 0x5212: val = 1200; break;
case 0x8000: val = 10000; break;
case 0x8001: val = 1252; break;
}
opts.codepage = val;
set_cp(val);
break;
@ -16326,13 +16351,19 @@ var HTML_ = (function() {
if(range.e.c < C) range.e.c = C;
if(opts.dense) {
if(!ws[R]) ws[R] = [];
if(opts.raw) ws[R][C] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[R][C] = {t:'s', v:m};
else if(m === 'TRUE') ws[R][C] = {t:'b', v:true};
else if(m === 'FALSE') ws[R][C] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[R][C] = {t:'n', v:fuzzynum(m)};
else ws[R][C] = {t:'s', v:m};
} else {
var coord = encode_cell({r:R, c:C});
/* TODO: value parsing */
if(opts.raw) ws[coord] = {t:'s', v:m};
if(!m.length){}
else if(opts.raw) ws[coord] = {t:'s', v:m};
else if(m === 'TRUE') ws[coord] = {t:'b', v:true};
else if(m === 'FALSE') ws[coord] = {t:'b', v:false};
else if(!isNaN(fuzzynum(m))) ws[coord] = {t:'n', v:fuzzynum(m)};
else ws[coord] = {t:'s', v:m};
}
@ -16415,7 +16446,7 @@ function parse_dom_table(table, _opts) {
var row = rows[R];
var elts = row.children;
for(_C = C = 0; _C < elts.length; ++_C) {
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent;
var elt = elts[_C], v = elts[_C].innerText || elts[_C].textContent || "";
for(midx = 0; midx < merges.length; ++midx) {
var m = merges[midx];
if(m.s.c == C && m.s.r <= R && R <= m.e.r) { C = m.e.c+1; midx = -1; }
@ -16424,8 +16455,11 @@ function parse_dom_table(table, _opts) {
CS = +elt.getAttribute("colspan") || 1;
if((RS = +elt.getAttribute("rowspan"))>0 || CS>1) merges.push({s:{r:R,c:C},e:{r:R + (RS||1) - 1, c:C + CS - 1}});
var o = {t:'s', v:v};
if(v != null && v.length) {
if(opts.raw) o = {t:'s', v:v};
if(v != null) {
if(v.length == 0) o.t = 'z';
else if(opts.raw){}
else if(v === 'TRUE') o = {t:'b', v:true};
else if(v === 'FALSE') o = {t:'b', v:false};
else if(!isNaN(fuzzynum(v))) o = {t:'n', v:fuzzynum(v)};
else if(!isNaN(fuzzydate(v).getDate())) {
o = ({t:'d', v:parseDate(v)});