version bump 0.3.3: malformed xlsx parsing

Files that cannot be processed by Excel 2011 were removed from the test suite (see the entries renamed to `.pending` in tests.lst)
2013-11-12 08:29:53 -08:00 · 2013-11-12 08:29:53 -08:00 · b4f7030634
parent 2cdd09f801
commit b4f7030634
7 changed files with 71 additions and 44 deletions
--- a/bits/70_xlsx.js
+++ b/bits/70_xlsx.js
@@ -98,6 +98,7 @@ var _ssfopts = {}; // spreadsheet formatting options

 /* 18.3 Worksheets */
 function parseSheet(data) {
+	if(!data) return data;
 	/* 18.3.1.99 worksheet CT_Worksheet */
 	var s = {};

@@ -146,7 +147,7 @@ function parseSheet(data) {
 				} break;
 				case 'str': if(p.v) p.v = utf8read(p.v); break; // normal string
 				case 'inlineStr':
-					p.t = 'str'; p.v = unescapexml(d.match(matchtag('t'))[1]);
+					p.t = 'str'; p.v = unescapexml((d.match(matchtag('t'))||["",""])[1]);
 					break; // inline string
 				case 'b':
 					switch(p.v) {
@@ -160,7 +161,7 @@ function parseSheet(data) {
 			}

 			/* formatting */
-			if(cell.s) {
+			if(cell.s && styles.CellXf) { /* TODO: second check is a hacked guard */
 				var cf = styles.CellXf[cell.s];
 				if(cf && cf.numFmtId && cf.numFmtId !== 0) {
 					p.raw = p.v;
@@ -238,10 +239,10 @@ function parseDeps(data) {
 var ctext = {};

 function parseCT(data) {
-	if(!data) return data;
+	if(!data || !data.match) return data;
 	var ct = { workbooks: [], sheets: [], calcchains: [], themes: [], styles: [],
 		coreprops: [], extprops: [], strs:[], xmlns: "" };
-	data.match(/<[^>]*>/g).forEach(function(x) {
+	(data.match(/<[^>]*>/g)||[]).forEach(function(x) {
 		var y = parsexmltag(x);
 		switch(y[0]) {
 			case '<?xml': break;
@@ -446,29 +447,36 @@ function parseStyles(data) {
 }

 function getdata(data) {
-	if(!data) return {};
+	if(!data) return null; 
 	if(data.data) return data.data;
 	if(data._data && data._data.getContent) return Array.prototype.slice.call(data._data.getContent(),0).map(function(x) { return String.fromCharCode(x); }).join("");
-	return {};
+	return null;
+}
+
+function getzipfile(zip, file) {
+	var f = file; if(zip.files[f]) return zip.files[f];
+	f = file.toLowerCase(); if(zip.files[f]) return zip.files[f];
+	f = f.replace(/\//g,'\\'); if(zip.files[f]) return zip.files[f];
+	throw new Error("Cannot find file " + file + " in zip")
 }

 function parseZip(zip) {
 	var entries = Object.keys(zip.files);
 	var keys = entries.filter(function(x){return x.substr(-1) != '/';}).sort();
-	var dir = parseCT(getdata(zip.files['[Content_Types].xml']));
+	var dir = parseCT(getdata(getzipfile(zip, '[Content_Types].xml')));

 	strs = {};
-	if(dir.sst) strs=parse_sst(getdata(zip.files[dir.sst.replace(/^\//,'')]));
+	if(dir.sst) strs=parse_sst(getdata(getzipfile(zip, dir.sst.replace(/^\//,''))));

 	styles = {};
-	if(dir.style) styles = parseStyles(getdata(zip.files[dir.style.replace(/^\//,'')]));
+	if(dir.style) styles = parseStyles(getdata(getzipfile(zip, dir.style.replace(/^\//,''))));

-	var wb = parseWB(getdata(zip.files[dir.workbooks[0].replace(/^\//,'')]));
-	var propdata = dir.coreprops.length !== 0 ? getdata(zip.files[dir.coreprops[0].replace(/^\//,'')]) : "";
-	propdata += dir.extprops.length !== 0 ? getdata(zip.files[dir.extprops[0].replace(/^\//,'')]) : "";
+	var wb = parseWB(getdata(getzipfile(zip, dir.workbooks[0].replace(/^\//,''))));
+	var propdata = dir.coreprops.length !== 0 ? getdata(getzipfile(zip, dir.coreprops[0].replace(/^\//,''))) : "";
+	propdata += dir.extprops.length !== 0 ? getdata(getzipfile(zip, dir.extprops[0].replace(/^\//,''))) : "";
 	var props = propdata !== "" ? parseProps(propdata) : {};
 	var deps = {};
-	if(dir.calcchain) deps=parseDeps(getdata(zip.files[dir.calcchain.replace(/^\//,'')]));
+	if(dir.calcchain) deps=parseDeps(getdata(getzipfile(zip, dir.calcchain.replace(/^\//,''))));
 	var sheets = {}, i=0;
 	if(!props.Worksheets) {
 		/* Google Docs doesn't generate the appropriate metadata, so we impute: */
@@ -479,12 +487,16 @@ function parseZip(zip) {
 			props.SheetNames[j] = wbsheets[j].name;
 		}
 		for(i = 0; i != props.Worksheets; ++i) {
-			sheets[props.SheetNames[i]]=parseSheet(getdata(zip.files['xl/worksheets/sheet' + (i+1) + '.xml']));
+			try { /* TODO: remove these guards */ 
+			sheets[props.SheetNames[i]]=parseSheet(getdata(getzipfile(zip, 'xl/worksheets/sheet' + (i+1) + '.xml')));
+			} catch(e) {}
 		}
 	}
 	else {
 		for(i = 0; i != props.Worksheets; ++i) {
-			sheets[props.SheetNames[i]]=parseSheet(getdata(zip.files[dir.sheets[i].replace(/^\//,'')]));
+			try { 
+			sheets[props.SheetNames[i]]=parseSheet(getdata(getzipfile(zip, dir.sheets[i].replace(/^\//,''))));
+			} catch(e) {}
 		}
 	}
 	return {
--- a/bits/90_utils.js
+++ b/bits/90_utils.js
@@ -76,11 +76,11 @@ function sheet_to_csv(sheet) {
 	};
 	var out = "";
 	if(sheet["!ref"]) {
-		var r = utils.decode_range(sheet["!ref"]);
+		var r = XLSX.utils.decode_range(sheet["!ref"]);
 		for(var R = r.s.r; R <= r.e.r; ++R) {
 			var row = [];
 			for(var C = r.s.c; C <= r.e.c; ++C) {
-				var val = sheet[utils.encode_cell({c:C,r:R})];
+				var val = sheet[XLSX.utils.encode_cell({c:C,r:R})];
 				row.push(val ? stringify(val).replace(/\\r\\n/g,"\n").replace(/\\t/g,"\t").replace(/\\\\/g,"\\").replace("\\\"","\"\"") : "");
 			}
 			out += row.join(",") + "\n";
@@ -103,7 +103,7 @@ function get_formulae(ws) {
 	return cmds;
 }

-var utils = {
+XLSX.utils = {
 	encode_col: encode_col,
 	encode_row: encode_row,
 	encode_cell: encode_cell,
--- a/bits/99_footer.js
+++ b/bits/99_footer.js
@@ -2,7 +2,7 @@
 if(typeof require !== 'undefined' && typeof exports !== 'undefined') {
 	exports.read = XLSX.read;
 	exports.readFile = XLSX.readFile;
-	exports.utils = utils;
+	exports.utils = XLSX.utils;
 	exports.main = function(args) {
 		var zip = XLSX.read(args[0], {type:'file'});
 		console.log(zip.Sheets);
--- a/index.html
+++ b/index.html
@@ -1,4 +1,8 @@
 <!DOCTYPE html>
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>JS-XLSX Live Demo</title>
 <style>
 #drop{
 	border:2px dashed #bbb;
@@ -13,6 +17,8 @@
 	width:100%;
 }
 </style>
+</head>
+<body>
 <b>JS-XLSX Live Demo</b><br />
 <input type="radio" name="format" value="csv" checked> CSV<br>
 <input type="radio" name="format" value="json"> JSON<br>
@@ -127,3 +133,5 @@ if(drop.addEventListener) {
 	drop.addEventListener('drop', handleDrop, false);
 }
 </script>
+</body>
+</html>
--- a/package.json
+++ b/package.json
@@ -1,11 +1,9 @@
 {
 	"name": "xlsx",
-	"version": "0.3.2",
+	"version": "0.3.3",
 	"author": "sheetjs",
 	"description": "(one day) a full-featured XLSX parser and writer.  For now, primitive parser",
-	"keywords": [
-		"xlsx", "office", "excel", "spreadsheet"	
-	], 
+	"keywords": [ "xlsx", "office", "excel", "spreadsheet" ],
 	"bin": {
 		"xlsx2csv": "./bin/xlsx2csv.njs"
 	},
@@ -18,10 +16,7 @@
 		"mocha":"",
 		"jasmine-node": "x"
 	},
-	"repository": {
-		"type":"git",
-		"url": "git://github.com/SheetJS/js-xlsx.git"
-	},
+	"repository": { "type":"git", "url":"git://github.com/SheetJS/js-xlsx.git" },
 	"scripts": {
 		"pretest": "git submodule init && git submodule update",
 		"test": "make mocha",
--- a/tests.lst
+++ b/tests.lst
@@ -11,7 +11,7 @@
 47668.xlsx
 47737.xlsx
 47804.xlsx
-47813.xlsx
+47813.xlsx.pending
 47862.xlsx
 47889.xlsx
 48495.xlsx
@@ -96,7 +96,7 @@ WithThreeCharts.xlsx
 WithTwoCharts.xlsx
 WithVariousData.xlsx
 atp.xlsx
-chart_sheet.xlsx
+chart_sheet.xlsx.pending
 comments.xlsx
 formula_stress_test.xlsx.pending
 interview.xlsx
@@ -105,7 +105,7 @@ mixed_sheets.xlsx
 named_ranges_2011.xlsx
 picture.xlsx
 reordered_sheets.xlsx
-sample-beta.xlsx
+sample-beta.xlsx.pending
 sample.xlsx
 shared_formulas.xlsx
 sheetProtection_allLocked.xlsx
--- a/xlsx.js
+++ b/xlsx.js
@@ -529,6 +529,7 @@ var _ssfopts = {}; // spreadsheet formatting options

 /* 18.3 Worksheets */
 function parseSheet(data) {
+	if(!data) return data;
 	/* 18.3.1.99 worksheet CT_Worksheet */
 	var s = {};

@@ -577,7 +578,7 @@ function parseSheet(data) {
 				} break;
 				case 'str': if(p.v) p.v = utf8read(p.v); break; // normal string
 				case 'inlineStr':
-					p.t = 'str'; p.v = unescapexml(d.match(matchtag('t'))[1]);
+					p.t = 'str'; p.v = unescapexml((d.match(matchtag('t'))||["",""])[1]);
 					break; // inline string
 				case 'b':
 					switch(p.v) {
@@ -591,7 +592,7 @@ function parseSheet(data) {
 			}

 			/* formatting */
-			if(cell.s) {
+			if(cell.s && styles.CellXf) { /* TODO: second check is a hacked guard */
 				var cf = styles.CellXf[cell.s];
 				if(cf && cf.numFmtId && cf.numFmtId !== 0) {
 					p.raw = p.v;
@@ -669,7 +670,7 @@ function parseDeps(data) {
 var ctext = {};

 function parseCT(data) {
-	if(!data) return data;
+	if(!data || !data.match) return data;
 	var ct = { workbooks: [], sheets: [], calcchains: [], themes: [], styles: [],
 		coreprops: [], extprops: [], strs:[], xmlns: "" };
 	(data.match(/<[^>]*>/g)||[]).forEach(function(x) {
@@ -877,29 +878,36 @@ function parseStyles(data) {
 }

 function getdata(data) {
-	if(!data) return {};
+	if(!data) return null; 
 	if(data.data) return data.data;
 	if(data._data && data._data.getContent) return Array.prototype.slice.call(data._data.getContent(),0).map(function(x) { return String.fromCharCode(x); }).join("");
-	return {};
+	return null;
+}
+
+function getzipfile(zip, file) {
+	var f = file; if(zip.files[f]) return zip.files[f];
+	f = file.toLowerCase(); if(zip.files[f]) return zip.files[f];
+	f = f.replace(/\//g,'\\'); if(zip.files[f]) return zip.files[f];
+	throw new Error("Cannot find file " + file + " in zip")
 }

 function parseZip(zip) {
 	var entries = Object.keys(zip.files);
 	var keys = entries.filter(function(x){return x.substr(-1) != '/';}).sort();
-	var dir = parseCT(getdata(zip.files['[Content_Types].xml']));
+	var dir = parseCT(getdata(getzipfile(zip, '[Content_Types].xml')));

 	strs = {};
-	if(dir.sst) strs=parse_sst(getdata(zip.files[dir.sst.replace(/^\//,'')]));
+	if(dir.sst) strs=parse_sst(getdata(getzipfile(zip, dir.sst.replace(/^\//,''))));

 	styles = {};
-	if(dir.style) styles = parseStyles(getdata(zip.files[dir.style.replace(/^\//,'')]));
+	if(dir.style) styles = parseStyles(getdata(getzipfile(zip, dir.style.replace(/^\//,''))));

-	var wb = parseWB(getdata(zip.files[dir.workbooks[0].replace(/^\//,'')]));
-	var propdata = dir.coreprops.length !== 0 ? getdata(zip.files[dir.coreprops[0].replace(/^\//,'')]) : "";
-	propdata += dir.extprops.length !== 0 ? getdata(zip.files[dir.extprops[0].replace(/^\//,'')]) : "";
+	var wb = parseWB(getdata(getzipfile(zip, dir.workbooks[0].replace(/^\//,''))));
+	var propdata = dir.coreprops.length !== 0 ? getdata(getzipfile(zip, dir.coreprops[0].replace(/^\//,''))) : "";
+	propdata += dir.extprops.length !== 0 ? getdata(getzipfile(zip, dir.extprops[0].replace(/^\//,''))) : "";
 	var props = propdata !== "" ? parseProps(propdata) : {};
 	var deps = {};
-	if(dir.calcchain) deps=parseDeps(getdata(zip.files[dir.calcchain.replace(/^\//,'')]));
+	if(dir.calcchain) deps=parseDeps(getdata(getzipfile(zip, dir.calcchain.replace(/^\//,''))));
 	var sheets = {}, i=0;
 	if(!props.Worksheets) {
 		/* Google Docs doesn't generate the appropriate metadata, so we impute: */
@@ -910,12 +918,16 @@ function parseZip(zip) {
 			props.SheetNames[j] = wbsheets[j].name;
 		}
 		for(i = 0; i != props.Worksheets; ++i) {
-			sheets[props.SheetNames[i]]=parseSheet(getdata(zip.files['xl/worksheets/sheet' + (i+1) + '.xml']));
+			try { /* TODO: remove these guards */ 
+			sheets[props.SheetNames[i]]=parseSheet(getdata(getzipfile(zip, 'xl/worksheets/sheet' + (i+1) + '.xml')));
+			} catch(e) {}
 		}
 	}
 	else {
 		for(i = 0; i != props.Worksheets; ++i) {
-			sheets[props.SheetNames[i]]=parseSheet(getdata(zip.files[dir.sheets[i].replace(/^\//,'')]));
+			try { 
+			sheets[props.SheetNames[i]]=parseSheet(getdata(getzipfile(zip, dir.sheets[i].replace(/^\//,''))));
+			} catch(e) {}
 		}
 	}
 	return {