From 8124fcbae0e577048059d4882846facd3a59fcf1 Mon Sep 17 00:00:00 2001 From: SheetJS Date: Wed, 20 Apr 2022 13:31:11 -0400 Subject: [PATCH] newline normalization --- bits/22_xmlutils.js | 8 ++++++-- bits/38_xlstypes.js | 1 + bits/40_harb.js | 9 ++++++--- bits/41_lotus.js | 35 ++++++++++++++++++++++++++++++----- bits/42_sstxml.js | 4 ++-- bits/45_rtf.js | 20 +++++++++++++------- bits/67_wsxml.js | 4 ++-- bits/68_wsbin.js | 4 ++-- bits/86_writezip.js | 8 ++++---- index.html | 10 +++++----- test_files | 2 +- 11 files changed, 72 insertions(+), 33 deletions(-) diff --git a/bits/22_xmlutils.js b/bits/22_xmlutils.js index d873e88..d4f378a 100644 --- a/bits/22_xmlutils.js +++ b/bits/22_xmlutils.js @@ -48,11 +48,15 @@ var rencoding = /*#__PURE__*/evert(encodings); var unescapexml/*:StringConv*/ = /*#__PURE__*/(function() { /* 22.4.2.4 bstr (Basic String) */ var encregex = /&(?:quot|apos|gt|lt|amp|#x?([\da-fA-F]+));/ig, coderegex = /_x([\da-fA-F]{4})_/ig; - return function unescapexml(text/*:string*/)/*:string*/ { + function raw_unescapexml(text/*:string*/)/*:string*/ { var s = text + '', i = s.indexOf("<![CDATA["); if(i == -1) return s.replace(encregex, function($$, $1) { return encodings[$$]||String.fromCharCode(parseInt($1,$$.indexOf("x")>-1?16:10))||$$; }).replace(coderegex,function(m,c) {return String.fromCharCode(parseInt(c,16));}); var j = s.indexOf("]]>"); - return unescapexml(s.slice(0, i)) + s.slice(i+9,j) + unescapexml(s.slice(j+3)); + return raw_unescapexml(s.slice(0, i)) + s.slice(i+9,j) + raw_unescapexml(s.slice(j+3)); + } + return function unescapexml(text/*:string*/, xlsx/*:boolean*/) { + var out = raw_unescapexml(text); + return xlsx ? 
out.replace(/\r\n/g, "\n") : out; }; })(); diff --git a/bits/38_xlstypes.js b/bits/38_xlstypes.js index 23631c3..a54083e 100644 --- a/bits/38_xlstypes.js +++ b/bits/38_xlstypes.js @@ -185,6 +185,7 @@ function parse_PropertySet(blob, PIDSI) { if(fail) throw new Error("Read Error: Expected address " + Props[i][1] + ' at ' + blob.l + ' :' + i); } if(PIDSI) { + if(Props[i][0] == 0 && Props.length > i+1 && Props[i][1] == Props[i+1][1]) continue; // R9 var piddsi = PIDSI[Props[i][0]]; PropH[piddsi.n] = parse_TypedPropertyValue(blob, piddsi.t, {raw:true}); if(piddsi.p === 'version') PropH[piddsi.n] = String(PropH[piddsi.n] >> 16) + "." + ("0000" + String(PropH[piddsi.n] & 0xFFFF)).slice(-4); diff --git a/bits/40_harb.js b/bits/40_harb.js index 9e30280..8950193 100644 --- a/bits/40_harb.js +++ b/bits/40_harb.js @@ -905,10 +905,9 @@ var PRN = /*#__PURE__*/(function() { else sep = guess_sep(str.slice(0,1024)); var R = 0, C = 0, v = 0; var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0, startcc=str.charCodeAt(0); - str = str.replace(/\r\n/mg, "\n"); var _re/*:?RegExp*/ = o.dateNF != null ? 
dateNF_regex(o.dateNF) : null; function finish_cell() { - var s = str.slice(start, end); + var s = str.slice(start, end); if(s.slice(-1) == "\r") s = s.slice(0, -1); var cell = ({}/*:any*/); if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"'); if(s.length === 0) cell.t = 'z'; @@ -943,7 +942,11 @@ var PRN = /*#__PURE__*/(function() { } outer: for(;end < str.length;++end) switch((cc=str.charCodeAt(end))) { case 0x22: if(startcc === 0x22) instr = !instr; break; - case sepcc: case 0x0a: case 0x0d: if(!instr && finish_cell()) break outer; break; + case 0x0d: + if(instr) break; + if(str.charCodeAt(end+1) == 0x0a) ++end; + /* falls through */ + case sepcc: case 0x0a: if(!instr && finish_cell()) break outer; break; default: break; } if(end - start > 0) finish_cell(); diff --git a/bits/41_lotus.js b/bits/41_lotus.js index 522fb12..4b58eb7 100644 --- a/bits/41_lotus.js +++ b/bits/41_lotus.js @@ -47,12 +47,17 @@ var WK_ = /*#__PURE__*/(function() { o.vers = val; if(val >= 0x1000) o.qpro = true; break; + case 0xFF: /* BOF (works 3+) */ + o.vers = val; + o.works = true; + break; case 0x06: refguess = val; break; /* RANGE */ case 0xCC: if(val) next_n = val; break; /* SHEETNAMECS */ case 0xDE: next_n = val; break; /* SHEETNAMELP */ case 0x0F: /* LABEL */ case 0x33: /* STRING */ - if(!o.qpro) val[1].v = val[1].v.slice(1); + if((!o.qpro && !o.works || RT == 0x33) && val[1].v.charCodeAt(0) < 0x30) val[1].v = val[1].v.slice(1); + if(o.works || o.works2) val[1].v = val[1].v.replace(/\r\n/g, "\n"); /* falls through */ case 0x0D: /* INTEGER */ case 0x0E: /* NUMBER */ @@ -86,6 +91,7 @@ var WK_ = /*#__PURE__*/(function() { s[val[0].r][val[0].c] = val[1]; } else s[encode_cell(val[0])] = val[1]; break; + case 0x5405: o.works2 = true; break; default: }}, o); } else if(d[2] == 0x1A || d[2] == 0x0E) { @@ -94,7 +100,9 @@ var WK_ = /*#__PURE__*/(function() { lotushopper(d, function(val, R, RT) { switch(RT) { case 0xCC: n = val; break; /* SHEETNAMECS */ 
case 0x16: /* LABEL16 */ - val[1].v = val[1].v.slice(1); + if(val[1].v.charCodeAt(0) < 0x30) val[1].v = val[1].v.slice(1); + // TODO: R9 appears to encode control codes this way -- verify against other versions + val[1].v = val[1].v.replace(/\x0F./g, function($$) { return String.fromCharCode($$.charCodeAt(1) - 0x20); }).replace(/\r\n/g, "\n"); /* falls through */ case 0x17: /* NUMBER17 */ case 0x18: /* NUMBER18 */ @@ -289,6 +297,9 @@ var WK_ = /*#__PURE__*/(function() { o[3] = blob.read_shift(1); o[0].r = blob.read_shift(2); blob.l+=2; + } else if(opts.works) { // TODO: verify with more complex works3-4 examples + o[0].c = blob.read_shift(2); o[0].r = blob.read_shift(2); + o[2] = blob.read_shift(2); } else { o[2] = blob.read_shift(1); o[0].c = blob.read_shift(2); o[0].r = blob.read_shift(2); @@ -324,6 +335,18 @@ var WK_ = /*#__PURE__*/(function() { o.write_shift(1, 0); return o; } + function parse_STRING(blob, length, opts) { + var tgt = blob.l + length; + var o = parse_cell(blob, length, opts); + o[1].t = 's'; + if(opts.vers == 0x5120) { + var len = blob.read_shift(1); + o[1].v = blob.read_shift(len, 'utf8'); + return o; + } + o[1].v = blob.read_shift(tgt - blob.l, 'cstr'); + return o; + } function parse_INTEGER(blob, length, opts) { var o = parse_cell(blob, length, opts); @@ -382,6 +405,7 @@ var WK_ = /*#__PURE__*/(function() { 0x33: ["FALSE", 0], 0x34: ["TRUE", 0], 0x46: ["LEN", 1], + 0x4A: ["CHAR", 1], 0x50: ["SUM", 69], 0x51: ["AVERAGEA", 69], 0x52: ["COUNTA", 69], @@ -572,8 +596,8 @@ var WK_ = /*#__PURE__*/(function() { } function parse_FORMULA_28(blob, length) { - var o = parse_NUMBER_27(blob, 14); - blob.l += length - 10; /* TODO: formula */ + var o = parse_NUMBER_27(blob, 12); + blob.l += length - 12; /* TODO: formula */ return o; } @@ -663,7 +687,7 @@ var WK_ = /*#__PURE__*/(function() { /*::[*/0x0030/*::]*/: { n:"UNFORMATTED" }, /*::[*/0x0031/*::]*/: { n:"CURSORW12" }, /*::[*/0x0032/*::]*/: { n:"WINDOW" }, - /*::[*/0x0033/*::]*/: { n:"STRING", 
f:parse_LABEL }, + /*::[*/0x0033/*::]*/: { n:"STRING", f:parse_STRING }, /*::[*/0x0037/*::]*/: { n:"PASSWORD" }, /*::[*/0x0038/*::]*/: { n:"LOCKED" }, /*::[*/0x003C/*::]*/: { n:"QUERY" }, @@ -687,6 +711,7 @@ var WK_ = /*#__PURE__*/(function() { /*::[*/0x0069/*::]*/: { n:"MRANGES??" }, /*::[*/0x00CC/*::]*/: { n:"SHEETNAMECS", f:parse_SHEETNAMECS }, /*::[*/0x00DE/*::]*/: { n:"SHEETNAMELP", f:parse_SHEETNAMELP }, + /*::[*/0x00FF/*::]*/: { n:"BOF", f:parseuint16 }, /*::[*/0xFFFF/*::]*/: { n:"" } }; diff --git a/bits/42_sstxml.js b/bits/42_sstxml.js index 0d9675f..df443b0 100644 --- a/bits/42_sstxml.js +++ b/bits/42_sstxml.js @@ -178,14 +178,14 @@ function parse_si(x, opts) { /* 18.4.12 t ST_Xstring (Plaintext String) */ // TODO: is whitespace actually valid here? if(x.match(/^\s*<(?:\w+:)?t[^>]*>/)) { - z.t = unescapexml(utf8read(x.slice(x.indexOf(">")+1).split(/<\/(?:\w+:)?t>/)[0]||"")); + z.t = unescapexml(utf8read(x.slice(x.indexOf(">")+1).split(/<\/(?:\w+:)?t>/)[0]||""), true); z.r = utf8read(x); if(html) z.h = escapehtml(z.t); } /* 18.4.4 r CT_RElt (Rich Text Run) */ else if((/*y = */x.match(sirregex))) { z.r = utf8read(x); - z.t = unescapexml(utf8read((x.replace(sirphregex, '').match(sitregex)||[]).join("").replace(tagregex,""))); + z.t = unescapexml(utf8read((x.replace(sirphregex, '').match(sitregex)||[]).join("").replace(tagregex,"")), true); if(html) z.h = rs_to_html(parse_rs(z.r)); } /* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */ diff --git a/bits/45_rtf.js b/bits/45_rtf.js index e01c25a..b63dac7 100644 --- a/bits/45_rtf.js +++ b/bits/45_rtf.js @@ -14,27 +14,33 @@ var RTF = /*#__PURE__*/(function() { var o = opts || {}; var ws/*:Worksheet*/ = o.dense ? 
([]/*:any*/) : ({}/*:any*/); - var rows = str.match(/\\trowd.*?\\row\b/g); + var rows = str.match(/\\trowd[\s\S]*?\\row\b/g); if(!rows.length) throw new Error("RTF missing table"); var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:rows.length - 1}}/*:any*/); rows.forEach(function(rowtf, R) { if(Array.isArray(ws)) ws[R] = []; - var rtfre = /\\\w+\b/g; + var rtfre = /\\[\w\-]+\b/g; var last_index = 0; var res; var C = -1; + var payload = []; while((res = rtfre.exec(rowtf))) { + var data = rowtf.slice(last_index, rtfre.lastIndex - res[0].length); + if(data.charCodeAt(0) == 0x20) data = data.slice(1); + if(data.length) payload.push(data); switch(res[0]) { case "\\cell": - var data = rowtf.slice(last_index, rtfre.lastIndex - res[0].length); - if(data[0] == " ") data = data.slice(1); ++C; - if(data.length) { + if(payload.length) { // TODO: value parsing, including codepage adjustments - var cell = {v: data, t:"s"}; + var cell = {v: payload.join(""), t:"s"}; if(Array.isArray(ws)) ws[R][C] = cell; else ws[encode_cell({r:R, c:C})] = cell; } + payload = []; + break; + case "\\par": // NOTE: Excel serializes both "\r" and "\n" as "\\par" + payload.push("\n"); break; } last_index = rtfre.lastIndex; @@ -60,7 +66,7 @@ var RTF = /*#__PURE__*/(function() { var coord = encode_cell({r:R,c:C}); cell = dense ? 
(ws[R]||[])[C]: ws[coord]; if(!cell || cell.v == null && (!cell.f || cell.F)) continue; - o.push(" " + (cell.w || (format_cell(cell), cell.w))); + o.push(" " + (cell.w || (format_cell(cell), cell.w)).replace(/[\r\n]/g, "\\par ")); o.push("\\cell"); } o.push("\\pard\\intbl\\row"); diff --git a/bits/67_wsxml.js b/bits/67_wsxml.js index 45ad1bd..a21ea5f 100644 --- a/bits/67_wsxml.js +++ b/bits/67_wsxml.js @@ -388,7 +388,7 @@ return function parse_ws_xml_data(sdata/*:string*/, s, opts, guess/*:Range*/, th if(opts.cellFormula) { if((cref=d.match(match_f))!= null && /*::cref != null && */cref[1] !== '') { /* TODO: match against XLSXFutureFunctions */ - p.f=unescapexml(utf8read(cref[1])).replace(/\r\n/g, "\n"); + p.f=unescapexml(utf8read(cref[1]), true); if(!opts.xlfn) p.f = _xlfn(p.f); if(/*::cref != null && cref[0] != null && */cref[0].indexOf('t="array"') > -1) { p.F = (d.match(refregex)||[])[1]; @@ -442,7 +442,7 @@ return function parse_ws_xml_data(sdata/*:string*/, s, opts, guess/*:Range*/, th break; case 'str': p.t = "s"; - p.v = (p.v!=null) ? utf8read(p.v) : ''; + p.v = (p.v!=null) ? 
unescapexml(utf8read(p.v), true) : ''; if(opts.cellHTML) p.h = escapehtml(p.v); break; case 'inlineStr': diff --git a/bits/68_wsbin.js b/bits/68_wsbin.js index 95f9c45..7d96929 100644 --- a/bits/68_wsbin.js +++ b/bits/68_wsbin.js @@ -803,6 +803,8 @@ function parse_ws_bin(data, _opts, idx, rels, wb/*:WBWBProps*/, themes, styles)/ /* TODO: something useful -- this is a stub */ function write_ws_bin_cell(ba/*:BufArray*/, cell/*:Cell*/, R/*:number*/, C/*:number*/, opts, ws/*:Worksheet*/, last_seen/*:boolean*/)/*:boolean*/ { + var o/*:any*/ = ({r:R, c:C}/*:any*/); + if(cell.c) ws['!comments'].push([encode_cell(o), cell.c]); if(cell.v === undefined) return false; var vv = ""; switch(cell.t) { @@ -816,11 +818,9 @@ function write_ws_bin_cell(ba/*:BufArray*/, cell/*:Cell*/, R/*:number*/, C/*:num case 'n': case 'e': vv = ''+cell.v; break; default: vv = cell.v; break; } - var o/*:any*/ = ({r:R, c:C}/*:any*/); /* TODO: cell style */ o.s = get_cell_style(opts.cellXfs, cell, opts); if(cell.l) ws['!links'].push([encode_cell(o), cell.l]); - if(cell.c) ws['!comments'].push([encode_cell(o), cell.c]); switch(cell.t) { case 's': case 'str': if(opts.bookSST) { diff --git a/bits/86_writezip.js b/bits/86_writezip.js index a6c7405..a3569f0 100644 --- a/bits/86_writezip.js +++ b/bits/86_writezip.js @@ -13,8 +13,8 @@ function write_zip_xlsb(wb/*:Workbook*/, opts/*:WriteOpts*/)/*:ZIP*/ { opts.Strings = /*::((*/[]/*:: :any):SST)*/; opts.Strings.Count = 0; opts.Strings.Unique = 0; if(browser_has_Map) opts.revStrings = new Map(); else { opts.revStrings = {}; opts.revStrings.foo = []; delete opts.revStrings.foo; } - var wbext = opts.bookType == "xlsb" ? 
"bin" : "xml"; - var vbafmt = VBAFMTS.indexOf(opts.bookType) > -1; + var wbext = "bin"; + var vbafmt = true; var ct = new_ct(); fix_write_opts(opts = opts || {}); var zip = zip_new(); @@ -214,10 +214,10 @@ function write_zip_xlsx(wb/*:Workbook*/, opts/*:WriteOpts*/)/*:ZIP*/ { carr[1].forEach(function(c) { if(c.T == true) needtc = true; }); }); if(needtc) { - cf = "xl/threadedComments/threadedComment" + rId + "." + wbext; + cf = "xl/threadedComments/threadedComment" + rId + ".xml"; zip_add_file(zip, cf, write_tcmnt_xml(comments, people, opts)); ct.threadedcomments.push(cf); - add_rels(wsrels, -1, "../threadedComments/threadedComment" + rId + "." + wbext, RELS.TCMNT); + add_rels(wsrels, -1, "../threadedComments/threadedComment" + rId + ".xml", RELS.TCMNT); } cf = "xl/comments" + rId + "." + wbext; diff --git a/index.html b/index.html index 02346f1..ea19448 100644 --- a/index.html +++ b/index.html @@ -192,7 +192,7 @@ var do_file = (function() { })(); (function() { - var dropZone = document.getElementById('drop-zone') + var dropZone = document.getElementById('drop-zone'); if(!dropZone.addEventListener && !window.addEventListener) return; function handleDrop(e) { @@ -217,15 +217,15 @@ var do_file = (function() { dropZone.style.zIndex = zIndex; } - window.addEventListener('drop' , handleDrop) - window.addEventListener('dragover' , handleDragover) + window.addEventListener('drop' , handleDrop); + window.addEventListener('dragover' , handleDragover); window.addEventListener('dragenter' , function(e){ dropZoneDisplay(e, true); - }) + }); dropZone.addEventListener('dragleave' , function(e){ dropZoneDisplay(e, false); - }) + }); })(); (function() { diff --git a/test_files b/test_files index 1935dfc..57645de 160000 --- a/test_files +++ b/test_files @@ -1 +1 @@ -Subproject commit 1935dfc8b9edf59b00a3dc031ddfefa8add40179 +Subproject commit 57645de9ec3abd7c5ffd94d2eeb26c3a1074e507