From 8124fcbae0e577048059d4882846facd3a59fcf1 Mon Sep 17 00:00:00 2001
From: SheetJS <dev@sheetjs.com>
Date: Wed, 20 Apr 2022 13:31:11 -0400
Subject: [PATCH] newline normalization

---
 bits/22_xmlutils.js |  8 ++++++--
 bits/38_xlstypes.js |  1 +
 bits/40_harb.js     |  9 ++++++---
 bits/41_lotus.js    | 35 ++++++++++++++++++++++++++++++-----
 bits/42_sstxml.js   |  4 ++--
 bits/45_rtf.js      | 20 +++++++++++++-------
 bits/67_wsxml.js    |  4 ++--
 bits/68_wsbin.js    |  4 ++--
 bits/86_writezip.js |  8 ++++----
 index.html          | 10 +++++-----
 test_files          |  2 +-
 11 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/bits/22_xmlutils.js b/bits/22_xmlutils.js
index d873e88..d4f378a 100644
--- a/bits/22_xmlutils.js
+++ b/bits/22_xmlutils.js
@@ -48,11 +48,15 @@ var rencoding = /*#__PURE__*/evert(encodings);
 var unescapexml/*:StringConv*/ = /*#__PURE__*/(function() {
 	/* 22.4.2.4 bstr (Basic String) */
 	var encregex = /&(?:quot|apos|gt|lt|amp|#x?([\da-fA-F]+));/ig, coderegex = /_x([\da-fA-F]{4})_/ig;
-	return function unescapexml(text/*:string*/)/*:string*/ {
+	function raw_unescapexml(text/*:string*/)/*:string*/ {
 		var s = text + '', i = s.indexOf("<![CDATA[");
 		if(i == -1) return s.replace(encregex, function($$, $1) { return encodings[$$]||String.fromCharCode(parseInt($1,$$.indexOf("x")>-1?16:10))||$$; }).replace(coderegex,function(m,c) {return String.fromCharCode(parseInt(c,16));});
 		var j = s.indexOf("]]>");
-		return unescapexml(s.slice(0, i)) + s.slice(i+9,j) + unescapexml(s.slice(j+3));
+		return raw_unescapexml(s.slice(0, i)) + s.slice(i+9,j) + raw_unescapexml(s.slice(j+3));
+	}
+	return function unescapexml(text/*:string*/, xlsx/*:boolean*/) {
+		var out = raw_unescapexml(text);
+		return xlsx ? out.replace(/\r\n/g, "\n") : out;
 	};
 })();
 
diff --git a/bits/38_xlstypes.js b/bits/38_xlstypes.js
index 23631c3..a54083e 100644
--- a/bits/38_xlstypes.js
+++ b/bits/38_xlstypes.js
@@ -185,6 +185,7 @@ function parse_PropertySet(blob, PIDSI) {
 			if(fail) throw new Error("Read Error: Expected address " + Props[i][1] + ' at ' + blob.l + ' :' + i);
 		}
 		if(PIDSI) {
+			if(Props[i][0] == 0 && Props.length > i+1 && Props[i][1] == Props[i+1][1]) continue; // R9
 			var piddsi = PIDSI[Props[i][0]];
 			PropH[piddsi.n] = parse_TypedPropertyValue(blob, piddsi.t, {raw:true});
 			if(piddsi.p === 'version') PropH[piddsi.n] = String(PropH[piddsi.n] >> 16) + "." + ("0000" + String(PropH[piddsi.n] & 0xFFFF)).slice(-4);
diff --git a/bits/40_harb.js b/bits/40_harb.js
index 9e30280..8950193 100644
--- a/bits/40_harb.js
+++ b/bits/40_harb.js
@@ -905,10 +905,9 @@ var PRN = /*#__PURE__*/(function() {
 		else sep = guess_sep(str.slice(0,1024));
 		var R = 0, C = 0, v = 0;
 		var start = 0, end = 0, sepcc = sep.charCodeAt(0), instr = false, cc=0, startcc=str.charCodeAt(0);
-		str = str.replace(/\r\n/mg, "\n");
 		var _re/*:?RegExp*/ = o.dateNF != null ? dateNF_regex(o.dateNF) : null;
 		function finish_cell() {
-			var s = str.slice(start, end);
+			var s = str.slice(start, end); if(s.slice(-1) == "\r") s = s.slice(0, -1);
 			var cell = ({}/*:any*/);
 			if(s.charAt(0) == '"' && s.charAt(s.length - 1) == '"') s = s.slice(1,-1).replace(/""/g,'"');
 			if(s.length === 0) cell.t = 'z';
@@ -943,7 +942,11 @@ var PRN = /*#__PURE__*/(function() {
 		}
 		outer: for(;end < str.length;++end) switch((cc=str.charCodeAt(end))) {
 			case 0x22: if(startcc === 0x22) instr = !instr; break;
-			case sepcc: case 0x0a: case 0x0d: if(!instr && finish_cell()) break outer; break;
+			case 0x0d:
+				if(instr) break;
+				if(str.charCodeAt(end+1) == 0x0a) ++end;
+				/* falls through */
+			case sepcc: case 0x0a: if(!instr && finish_cell()) break outer; break;
 			default: break;
 		}
 		if(end - start > 0) finish_cell();
diff --git a/bits/41_lotus.js b/bits/41_lotus.js
index 522fb12..4b58eb7 100644
--- a/bits/41_lotus.js
+++ b/bits/41_lotus.js
@@ -47,12 +47,17 @@ var WK_ = /*#__PURE__*/(function() {
 					o.vers = val;
 					if(val >= 0x1000) o.qpro = true;
 					break;
+				case 0xFF: /* BOF (works 3+) */
+					o.vers = val;
+					o.works = true;
+					break;
 				case 0x06: refguess = val; break; /* RANGE */
 				case 0xCC: if(val) next_n = val; break; /* SHEETNAMECS */
 				case 0xDE: next_n = val; break; /* SHEETNAMELP */
 				case 0x0F: /* LABEL */
 				case 0x33: /* STRING */
-					if(!o.qpro) val[1].v = val[1].v.slice(1);
+					if((!o.qpro && !o.works || RT == 0x33) && val[1].v.charCodeAt(0) < 0x30) val[1].v = val[1].v.slice(1);
+					if(o.works || o.works2) val[1].v = val[1].v.replace(/\r\n/g, "\n");
 					/* falls through */
 				case 0x0D: /* INTEGER */
 				case 0x0E: /* NUMBER */
@@ -86,6 +91,7 @@ var WK_ = /*#__PURE__*/(function() {
 						s[val[0].r][val[0].c] = val[1];
 					} else s[encode_cell(val[0])] = val[1];
 					break;
+				case 0x5405: o.works2 = true; break;
 				default:
 			}}, o);
 		} else if(d[2] == 0x1A || d[2] == 0x0E) {
@@ -94,7 +100,9 @@ var WK_ = /*#__PURE__*/(function() {
 			lotushopper(d, function(val, R, RT) { switch(RT) {
 				case 0xCC: n = val; break; /* SHEETNAMECS */
 				case 0x16: /* LABEL16 */
-					val[1].v = val[1].v.slice(1);
+					if(val[1].v.charCodeAt(0) < 0x30) val[1].v = val[1].v.slice(1);
+					// TODO: R9 appears to encode control codes this way -- verify against other versions
+					val[1].v = val[1].v.replace(/\x0F./g, function($$) { return String.fromCharCode($$.charCodeAt(1) - 0x20); }).replace(/\r\n/g, "\n");
 					/* falls through */
 				case 0x17: /* NUMBER17 */
 				case 0x18: /* NUMBER18 */
@@ -289,6 +297,9 @@ var WK_ = /*#__PURE__*/(function() {
 			o[3] = blob.read_shift(1);
 			o[0].r = blob.read_shift(2);
 			blob.l+=2;
+		} else if(opts.works) { // TODO: verify with more complex works3-4 examples
+			o[0].c = blob.read_shift(2); o[0].r = blob.read_shift(2);
+			o[2] = blob.read_shift(2);
 		} else {
 			o[2] = blob.read_shift(1);
 			o[0].c = blob.read_shift(2); o[0].r = blob.read_shift(2);
@@ -324,6 +335,18 @@ var WK_ = /*#__PURE__*/(function() {
 		o.write_shift(1, 0);
 		return o;
 	}
+	function parse_STRING(blob, length, opts) {
+		var tgt = blob.l + length;
+		var o = parse_cell(blob, length, opts);
+		o[1].t = 's';
+		if(opts.vers == 0x5120) {
+			var len = blob.read_shift(1);
+			o[1].v = blob.read_shift(len, 'utf8');
+			return o;
+		}
+		o[1].v = blob.read_shift(tgt - blob.l, 'cstr');
+		return o;
+	}
 
 	function parse_INTEGER(blob, length, opts) {
 		var o = parse_cell(blob, length, opts);
@@ -382,6 +405,7 @@ var WK_ = /*#__PURE__*/(function() {
 		0x33: ["FALSE", 0],
 		0x34: ["TRUE", 0],
 		0x46: ["LEN", 1],
+		0x4A: ["CHAR", 1],
 		0x50: ["SUM", 69],
 		0x51: ["AVERAGEA", 69],
 		0x52: ["COUNTA", 69],
@@ -572,8 +596,8 @@ var WK_ = /*#__PURE__*/(function() {
 	}
 
 	function parse_FORMULA_28(blob, length) {
-		var o = parse_NUMBER_27(blob, 14);
-		blob.l += length - 10; /* TODO: formula */
+		var o = parse_NUMBER_27(blob, 12);
+		blob.l += length - 12; /* TODO: formula */
 		return o;
 	}
 
@@ -663,7 +687,7 @@ var WK_ = /*#__PURE__*/(function() {
 		/*::[*/0x0030/*::]*/: { n:"UNFORMATTED" },
 		/*::[*/0x0031/*::]*/: { n:"CURSORW12" },
 		/*::[*/0x0032/*::]*/: { n:"WINDOW" },
-		/*::[*/0x0033/*::]*/: { n:"STRING", f:parse_LABEL },
+		/*::[*/0x0033/*::]*/: { n:"STRING", f:parse_STRING },
 		/*::[*/0x0037/*::]*/: { n:"PASSWORD" },
 		/*::[*/0x0038/*::]*/: { n:"LOCKED" },
 		/*::[*/0x003C/*::]*/: { n:"QUERY" },
@@ -687,6 +711,7 @@ var WK_ = /*#__PURE__*/(function() {
 		/*::[*/0x0069/*::]*/: { n:"MRANGES??" },
 		/*::[*/0x00CC/*::]*/: { n:"SHEETNAMECS", f:parse_SHEETNAMECS },
 		/*::[*/0x00DE/*::]*/: { n:"SHEETNAMELP", f:parse_SHEETNAMELP },
+		/*::[*/0x00FF/*::]*/: { n:"BOF", f:parseuint16 },
 		/*::[*/0xFFFF/*::]*/: { n:"" }
 	};
 
diff --git a/bits/42_sstxml.js b/bits/42_sstxml.js
index 0d9675f..df443b0 100644
--- a/bits/42_sstxml.js
+++ b/bits/42_sstxml.js
@@ -178,14 +178,14 @@ function parse_si(x, opts) {
 	/* 18.4.12 t ST_Xstring (Plaintext String) */
 	// TODO: is whitespace actually valid here?
 	if(x.match(/^\s*<(?:\w+:)?t[^>]*>/)) {
-		z.t = unescapexml(utf8read(x.slice(x.indexOf(">")+1).split(/<\/(?:\w+:)?t>/)[0]||""));
+		z.t = unescapexml(utf8read(x.slice(x.indexOf(">")+1).split(/<\/(?:\w+:)?t>/)[0]||""), true);
 		z.r = utf8read(x);
 		if(html) z.h = escapehtml(z.t);
 	}
 	/* 18.4.4 r CT_RElt (Rich Text Run) */
 	else if((/*y = */x.match(sirregex))) {
 		z.r = utf8read(x);
-		z.t = unescapexml(utf8read((x.replace(sirphregex, '').match(sitregex)||[]).join("").replace(tagregex,"")));
+		z.t = unescapexml(utf8read((x.replace(sirphregex, '').match(sitregex)||[]).join("").replace(tagregex,"")), true);
 		if(html) z.h = rs_to_html(parse_rs(z.r));
 	}
 	/* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */
diff --git a/bits/45_rtf.js b/bits/45_rtf.js
index e01c25a..b63dac7 100644
--- a/bits/45_rtf.js
+++ b/bits/45_rtf.js
@@ -14,27 +14,33 @@ var RTF = /*#__PURE__*/(function() {
 		var o = opts || {};
 		var ws/*:Worksheet*/ = o.dense ? ([]/*:any*/) : ({}/*:any*/);
 
-		var rows = str.match(/\\trowd.*?\\row\b/g);
+		var rows = str.match(/\\trowd[\s\S]*?\\row\b/g);
 		if(!rows.length) throw new Error("RTF missing table");
 		var range/*:Range*/ = ({s: {c:0, r:0}, e: {c:0, r:rows.length - 1}}/*:any*/);
 		rows.forEach(function(rowtf, R) {
 			if(Array.isArray(ws)) ws[R] = [];
-			var rtfre = /\\\w+\b/g;
+			var rtfre = /\\[\w\-]+\b/g;
 			var last_index = 0;
 			var res;
 			var C = -1;
+			var payload = [];
 			while((res = rtfre.exec(rowtf))) {
+				var data = rowtf.slice(last_index, rtfre.lastIndex - res[0].length);
+				if(data.charCodeAt(0) == 0x20) data = data.slice(1);
+				if(data.length) payload.push(data);
 				switch(res[0]) {
 					case "\\cell":
-						var data = rowtf.slice(last_index, rtfre.lastIndex - res[0].length);
-						if(data[0] == " ") data = data.slice(1);
 						++C;
-						if(data.length) {
+						if(payload.length) {
 							// TODO: value parsing, including codepage adjustments
-							var cell = {v: data, t:"s"};
+							var cell = {v: payload.join(""), t:"s"};
 							if(Array.isArray(ws)) ws[R][C] = cell;
 							else ws[encode_cell({r:R, c:C})] = cell;
 						}
+						payload = [];
+						break;
+					case "\\par": // NOTE: Excel serializes both "\r" and "\n" as "\\par"
+						payload.push("\n");
 						break;
 				}
 				last_index = rtfre.lastIndex;
@@ -60,7 +66,7 @@ var RTF = /*#__PURE__*/(function() {
 				var coord = encode_cell({r:R,c:C});
 				cell = dense ? (ws[R]||[])[C]: ws[coord];
 				if(!cell || cell.v == null && (!cell.f || cell.F)) continue;
-				o.push(" " + (cell.w || (format_cell(cell), cell.w)));
+				o.push(" " + (cell.w || (format_cell(cell), cell.w)).replace(/[\r\n]/g, "\\par "));
 				o.push("\\cell");
 			}
 			o.push("\\pard\\intbl\\row");
diff --git a/bits/67_wsxml.js b/bits/67_wsxml.js
index 45ad1bd..a21ea5f 100644
--- a/bits/67_wsxml.js
+++ b/bits/67_wsxml.js
@@ -388,7 +388,7 @@ return function parse_ws_xml_data(sdata/*:string*/, s, opts, guess/*:Range*/, th
 			if(opts.cellFormula) {
 				if((cref=d.match(match_f))!= null && /*::cref != null && */cref[1] !== '') {
 					/* TODO: match against XLSXFutureFunctions */
-					p.f=unescapexml(utf8read(cref[1])).replace(/\r\n/g, "\n");
+					p.f=unescapexml(utf8read(cref[1]), true);
 					if(!opts.xlfn) p.f = _xlfn(p.f);
 					if(/*::cref != null && cref[0] != null && */cref[0].indexOf('t="array"') > -1) {
 						p.F = (d.match(refregex)||[])[1];
@@ -442,7 +442,7 @@ return function parse_ws_xml_data(sdata/*:string*/, s, opts, guess/*:Range*/, th
 					break;
 				case 'str':
 					p.t = "s";
-					p.v = (p.v!=null) ? utf8read(p.v) : '';
+					p.v = (p.v!=null) ? unescapexml(utf8read(p.v), true) : '';
 					if(opts.cellHTML) p.h = escapehtml(p.v);
 					break;
 				case 'inlineStr':
diff --git a/bits/68_wsbin.js b/bits/68_wsbin.js
index 95f9c45..7d96929 100644
--- a/bits/68_wsbin.js
+++ b/bits/68_wsbin.js
@@ -803,6 +803,8 @@ function parse_ws_bin(data, _opts, idx, rels, wb/*:WBWBProps*/, themes, styles)/
 
 /* TODO: something useful -- this is a stub */
 function write_ws_bin_cell(ba/*:BufArray*/, cell/*:Cell*/, R/*:number*/, C/*:number*/, opts, ws/*:Worksheet*/, last_seen/*:boolean*/)/*:boolean*/ {
+	var o/*:any*/ = ({r:R, c:C}/*:any*/);
+	if(cell.c) ws['!comments'].push([encode_cell(o), cell.c]);
 	if(cell.v === undefined) return false;
 	var vv = "";
 	switch(cell.t) {
@@ -816,11 +818,9 @@ function write_ws_bin_cell(ba/*:BufArray*/, cell/*:Cell*/, R/*:number*/, C/*:num
 		case 'n': case 'e': vv = ''+cell.v; break;
 		default: vv = cell.v; break;
 	}
-	var o/*:any*/ = ({r:R, c:C}/*:any*/);
 	/* TODO: cell style */
 	o.s = get_cell_style(opts.cellXfs, cell, opts);
 	if(cell.l) ws['!links'].push([encode_cell(o), cell.l]);
-	if(cell.c) ws['!comments'].push([encode_cell(o), cell.c]);
 	switch(cell.t) {
 		case 's': case 'str':
 			if(opts.bookSST) {
diff --git a/bits/86_writezip.js b/bits/86_writezip.js
index a6c7405..a3569f0 100644
--- a/bits/86_writezip.js
+++ b/bits/86_writezip.js
@@ -13,8 +13,8 @@ function write_zip_xlsb(wb/*:Workbook*/, opts/*:WriteOpts*/)/*:ZIP*/ {
 	opts.Strings = /*::((*/[]/*:: :any):SST)*/; opts.Strings.Count = 0; opts.Strings.Unique = 0;
 	if(browser_has_Map) opts.revStrings = new Map();
 	else { opts.revStrings = {}; opts.revStrings.foo = []; delete opts.revStrings.foo; }
-	var wbext = opts.bookType == "xlsb" ? "bin" : "xml";
-	var vbafmt = VBAFMTS.indexOf(opts.bookType) > -1;
+	var wbext = "bin";
+	var vbafmt = true;
 	var ct = new_ct();
 	fix_write_opts(opts = opts || {});
 	var zip = zip_new();
@@ -214,10 +214,10 @@ function write_zip_xlsx(wb/*:Workbook*/, opts/*:WriteOpts*/)/*:ZIP*/ {
 					carr[1].forEach(function(c) { if(c.T == true) needtc = true; });
 				});
 				if(needtc) {
-					cf = "xl/threadedComments/threadedComment" + rId + "." + wbext;
+					cf = "xl/threadedComments/threadedComment" + rId + ".xml";
 					zip_add_file(zip, cf, write_tcmnt_xml(comments, people, opts));
 					ct.threadedcomments.push(cf);
-					add_rels(wsrels, -1, "../threadedComments/threadedComment" + rId + "." + wbext, RELS.TCMNT);
+					add_rels(wsrels, -1, "../threadedComments/threadedComment" + rId + ".xml", RELS.TCMNT);
 				}
 
 				cf = "xl/comments" + rId + "." + wbext;
diff --git a/index.html b/index.html
index 02346f1..ea19448 100644
--- a/index.html
+++ b/index.html
@@ -192,7 +192,7 @@ var do_file = (function() {
 })();
 
 (function() {
-	var dropZone = document.getElementById('drop-zone')
+	var dropZone = document.getElementById('drop-zone');
 	if(!dropZone.addEventListener && !window.addEventListener) return;
 
 	function handleDrop(e) {
@@ -217,15 +217,15 @@ var do_file = (function() {
 		dropZone.style.zIndex = zIndex;
 	}
 
-	window.addEventListener('drop' , handleDrop)
-	window.addEventListener('dragover' , handleDragover)
+	window.addEventListener('drop' , handleDrop);
+	window.addEventListener('dragover' , handleDragover);
 	window.addEventListener('dragenter' , function(e){
 		dropZoneDisplay(e, true);
-	})
+	});
 
 	dropZone.addEventListener('dragleave' , function(e){
 		dropZoneDisplay(e, false);
-	})
+	});
 })();
 
 (function() {
diff --git a/test_files b/test_files
index 1935dfc..57645de 160000
--- a/test_files
+++ b/test_files
@@ -1 +1 @@
-Subproject commit 1935dfc8b9edf59b00a3dc031ddfefa8add40179
+Subproject commit 57645de9ec3abd7c5ffd94d2eeb26c3a1074e507