Parsing xlsx with broken docProps #759

Closed
opened 2017-08-03 10:26:37 +00:00 by djbeaumont · 3 comments
djbeaumont commented 2017-08-03 10:26:37 +00:00 (Migrated from github.com)

I'm having trouble reading a spreadsheet which has been generated by another company's report server. The file docProps/app.xml contains this invalid looking ooxml. Note that the vector is of size 2 but only contains one string:

    <HeadingPairs>
        <vt:vector size="2" baseType="variant">
            <vt:variant><vt:lpstr>Worksheets</vt:lpstr></vt:variant>
            <vt:variant><vt:i4>2</vt:i4></vt:variant>
        </vt:vector>
    </HeadingPairs>
    <TitlesOfParts>
        <vt:vector size="2" baseType="lpstr">
            <vt:lpstr>calendar</vt:lpstr>
            
        </vt:vector>
    </TitlesOfParts>

The parser throws when it reaches this line:
https://github.com/SheetJS/js-xlsx/blob/master/bits/22_xmlutils.js#L145

The application name in the doc properties is given as Microsoft Excel but that may well be untrue.

For our particular case, commenting out the check on the size of the vector allows the parser to carry on succesfully. Would it make sense to have a switch to turn errors into warnings or maybe a switch to disable reading properties altogether? Commenting out this line also lets us parse the file:
https://github.com/SheetJS/js-xlsx/blob/master/bits/85_parsezip.js#L89

I'm having trouble reading a spreadsheet which has been generated by another company's report server. The file `docProps/app.xml` contains this invalid looking ooxml. Note that the vector is of `size` 2 but only contains one string: ``` <HeadingPairs> <vt:vector size="2" baseType="variant"> <vt:variant><vt:lpstr>Worksheets</vt:lpstr></vt:variant> <vt:variant><vt:i4>2</vt:i4></vt:variant> </vt:vector> </HeadingPairs> <TitlesOfParts> <vt:vector size="2" baseType="lpstr"> <vt:lpstr>calendar</vt:lpstr> </vt:vector> </TitlesOfParts> ``` The parser throws when it reaches this line: https://github.com/SheetJS/js-xlsx/blob/master/bits/22_xmlutils.js#L145 The application name in the doc properties is given as `Microsoft Excel` but that may well be untrue. For our particular case, commenting out the check on the size of the vector allows the parser to carry on succesfully. Would it make sense to have a switch to turn errors into warnings or maybe a switch to disable reading properties altogether? Commenting out this line also lets us parse the file: https://github.com/SheetJS/js-xlsx/blob/master/bits/85_parsezip.js#L89
djbeaumont commented 2017-08-03 12:53:49 +00:00 (Migrated from github.com)
Skipping properties seems to work for us: https://github.com/SheetJS/js-xlsx/compare/master...djbeaumont:skip-props?expand=1
SheetJSDev commented 2017-08-03 14:59:40 +00:00 (Migrated from github.com)

@djbeaumont that's certainly unexpected! Instead of a new option to skip all of the extended properties, let's change that error to only throw in WTF mode. This is the git diff from the current HEAD:

diff --git a/bits/22_xmlutils.js b/bits/22_xmlutils.js
--- a/bits/22_xmlutils.js
+++ b/bits/22_xmlutils.js
@@ -138,12 +138,15 @@ var vtregex = (function(){ var vt_cache = {};
 		return (vt_cache[bt] = new RegExp("<(?:vt:)?" + bt + ">([\\s\\S]*?)</(?:vt:)?" + bt + ">", 'g') );
 };})();
 var vtvregex = /<\/?(?:vt:)?variant>/g, vtmregex = /<(?:vt:)([^>]*)>([\s\S]*)</;
-function parseVector(data) {
+function parseVector(data, opts) {
 	var h = parsexmltag(data);
 
 	var matches = data.match(vtregex(h.baseType))||[];
-	if(matches.length != h.size) throw new Error("unexpected vector length " + matches.length + " != " + h.size);
 	var res = [];
+	if(matches.length != h.size) {
+		if(opts.WTF) throw new Error("unexpected vector length " + matches.length + " != " + h.size);
+		return res;
+	}
 	matches.forEach(function(x) {
 		var v = x.replace(vtvregex,"").match(vtmregex);
 		res.push({v:utf8read(v[2]), t:v[1]});
diff --git a/bits/34_extprops.js b/bits/34_extprops.js
--- a/bits/34_extprops.js
+++ b/bits/34_extprops.js
@@ -17,7 +17,7 @@ var EXT_PROPS/*:Array<Array<string> >*/ = [
 XMLNS.EXT_PROPS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
 RELS.EXT_PROPS  = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties';
 
-function parse_ext_props(data, p) {
+function parse_ext_props(data, p, opts) {
        var q = {}; if(!p) p = {};
 
        EXT_PROPS.forEach(function(f) {
@@ -32,10 +32,10 @@ function parse_ext_props(data, p) {
        });
 
        if(q.HeadingPairs && q.TitlesOfParts) {
-               var v = parseVector(q.HeadingPairs);
-               var parts = parseVector(q.TitlesOfParts).map(function(x) { return x.v; });
+               var v = parseVector(q.HeadingPairs, opts);
+               var parts = parseVector(q.TitlesOfParts, opts).map(function(x) { return x.v; });
                var idx = 0, len = 0;
-               for(var i = 0; i !== v.length; i+=2) {
+               if(parts.length > 0) for(var i = 0; i !== v.length; i+=2) {
                        len = +(v[i+1].v);
                        switch(v[i].v) {
                                case "Worksheets":
diff --git a/bits/85_parsezip.js b/bits/85_parsezip.js
--- a/bits/85_parsezip.js
+++ b/bits/85_parsezip.js
@@ -86,7 +86,7 @@ function parse_zip(zip/*:ZIP*/, opts/*:?ParseOpts*/)/*:Workbook*/ {
 		if(propdata) props = parse_core_props(propdata);
 		if(dir.extprops.length !== 0) {
 			propdata = getzipstr(zip, dir.extprops[0].replace(/^\//,''), true);
-			if(propdata) parse_ext_props(propdata, props);
+			if(propdata) parse_ext_props(propdata, props, opts);
 		}
 	}

If you could confirm that works, we'd accept it as a PR :)

@djbeaumont that's certainly unexpected! Instead of a new option to skip all of the extended properties, let's change that error to only throw in WTF mode. This is the git diff from the current HEAD: ```diff diff --git a/bits/22_xmlutils.js b/bits/22_xmlutils.js --- a/bits/22_xmlutils.js +++ b/bits/22_xmlutils.js @@ -138,12 +138,15 @@ var vtregex = (function(){ var vt_cache = {}; return (vt_cache[bt] = new RegExp("<(?:vt:)?" + bt + ">([\\s\\S]*?)</(?:vt:)?" + bt + ">", 'g') ); };})(); var vtvregex = /<\/?(?:vt:)?variant>/g, vtmregex = /<(?:vt:)([^>]*)>([\s\S]*)</; -function parseVector(data) { +function parseVector(data, opts) { var h = parsexmltag(data); var matches = data.match(vtregex(h.baseType))||[]; - if(matches.length != h.size) throw new Error("unexpected vector length " + matches.length + " != " + h.size); var res = []; + if(matches.length != h.size) { + if(opts.WTF) throw new Error("unexpected vector length " + matches.length + " != " + h.size); + return res; + } matches.forEach(function(x) { var v = x.replace(vtvregex,"").match(vtmregex); res.push({v:utf8read(v[2]), t:v[1]}); diff --git a/bits/34_extprops.js b/bits/34_extprops.js --- a/bits/34_extprops.js +++ b/bits/34_extprops.js @@ -17,7 +17,7 @@ var EXT_PROPS/*:Array<Array<string> >*/ = [ XMLNS.EXT_PROPS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"; RELS.EXT_PROPS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'; -function parse_ext_props(data, p) { +function parse_ext_props(data, p, opts) { var q = {}; if(!p) p = {}; EXT_PROPS.forEach(function(f) { @@ -32,10 +32,10 @@ function parse_ext_props(data, p) { }); if(q.HeadingPairs && q.TitlesOfParts) { - var v = parseVector(q.HeadingPairs); - var parts = parseVector(q.TitlesOfParts).map(function(x) { return x.v; }); + var v = parseVector(q.HeadingPairs, opts); + var parts = parseVector(q.TitlesOfParts, opts).map(function(x) { return x.v; }); var idx = 0, len = 0; - for(var i = 0; i !== v.length; i+=2) { + if(parts.length > 0) for(var i = 0; i !== v.length; i+=2) { len = +(v[i+1].v); switch(v[i].v) { case "Worksheets": diff --git a/bits/85_parsezip.js b/bits/85_parsezip.js --- a/bits/85_parsezip.js +++ b/bits/85_parsezip.js @@ -86,7 +86,7 @@ function parse_zip(zip/*:ZIP*/, opts/*:?ParseOpts*/)/*:Workbook*/ { if(propdata) props = parse_core_props(propdata); if(dir.extprops.length !== 0) { propdata = getzipstr(zip, dir.extprops[0].replace(/^\//,''), true); - if(propdata) parse_ext_props(propdata, props); + if(propdata) parse_ext_props(propdata, props, opts); } } ``` If you could confirm that works, we'd accept it as a PR :)
djbeaumont commented 2017-08-03 16:20:48 +00:00 (Migrated from github.com)

That works for me 👍 Thanks for the very quick response @SheetJSDev

That works for me 👍 Thanks for the very quick response @SheetJSDev
Sign in to join this conversation.
No Milestone
No Assignees
1 Participants
Notifications
Due Date
The due date is invalid or out of range. Please use the format 'yyyy-mm-dd'.

No due date set.

Dependencies

No dependencies set.

Reference: sheetjs/sheetjs#759
No description provided.