From 5d43dffc79361448f8aa8a05df8c1322afc58cd5 Mon Sep 17 00:00:00 2001 From: Hugues Malphettes Date: Sat, 18 Jan 2014 21:45:49 +0800 Subject: [PATCH] Share the code for parsing the rich text and use it to parse the comments. --- bits/65_sst.js | 221 ++++++++++++++++++++++----------------------- bits/70_xlsx.js | 15 ++- test.js | 13 ++- xlsx.js | 236 ++++++++++++++++++++++++------------------------ 4 files changed, 241 insertions(+), 244 deletions(-) diff --git a/bits/65_sst.js b/bits/65_sst.js index 0ce03dd..85c0c3e 100644 --- a/bits/65_sst.js +++ b/bits/65_sst.js @@ -1,131 +1,130 @@ -/* 18.4 Shared String Table */ -var parse_sst = (function(){ +/* Parse a list of tags */ +var parse_rs = (function() { var tregex = matchtag("t"), rpregex = matchtag("rPr"); - /* Parse a list of tags */ - var parse_rs = (function() { - /* 18.4.7 rPr CT_RPrElt */ - var parse_rpr = function(rpr, intro, outro) { - var font = {}; - (rpr.match(/<[^>]*>/g)||[]).forEach(function(x) { - var y = parsexmltag(x); - switch(y[0]) { - /* 18.8.12 condense CT_BooleanProperty */ - /* ** not required . */ - case ']*>/g)||[]).forEach(function(x) { + var y = parsexmltag(x); + switch(y[0]) { + /* 18.8.12 condense CT_BooleanProperty */ + /* ** not required . */ + case '': font.strike = 1; break; - case '': break; + /* 18.4.10 strike CT_BooleanProperty */ + case '': font.strike = 1; break; + case '': break; - /* 18.4.13 u CT_UnderlineProperty */ - case '': font.u = 1; break; - case '': break; + /* 18.4.13 u CT_UnderlineProperty */ + case '': font.u = 1; break; + case '': break; - /* 18.8.2 b */ - case '': font.b = 1; break; - case '': break; + /* 18.8.2 b */ + case '': font.b = 1; break; + case '': break; - /* 18.8.26 i */ - case '': font.i = 1; break; - case '': break; + /* 18.8.26 i */ + case '': font.i = 1; break; + case '': break; - /* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */ - case ''); - outro.push(""); - }; - - /* 18.4.4 r CT_RElt */ - function parse_r(r) { - var terms = [[],"",[]]; - /* 18.4.12 t ST_Xstring */ - var t = r.match(tregex); - if(!isval(t)) return ""; - terms[1] = t[1]; - - var rpr = r.match(rpregex); - if(isval(rpr)) parse_rpr(rpr[1], terms[0], terms[2]); - return terms[0].join("") + terms[1].replace(/\r\n/g,'
') + terms[2].join(""); - } - return function(rs) { - return rs.replace(//g,"").split(/<\/r>/).map(parse_r).join(""); - }; - })(); - - /* 18.4.8 si CT_Rst */ - var parse_si = function(x) { - var z = {}; - if(!x) return z; - var y; - /* 18.4.12 t ST_Xstring (Plaintext String) */ - if(x[1] === 't') { - z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,""))); - z.raw = x; - z.r = z.t; - } - /* 18.4.4 r CT_RElt (Rich Text Run) */ - else if((y = x.match(//))) { - z.raw = x; - /* TODO: properly parse (note: no other valid child can have body text) */ - z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,""))); - z.r = parse_rs(x); - } - /* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */ - /* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */ - return z; + default: + if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0]; + } + }); + /* TODO: These should be generated styles, not inline */ + var style = []; + if(font.b) style.push("font-weight: bold;"); + if(font.i) style.push("font-style: italic;"); + intro.push(''); + outro.push(""); }; + /* 18.4.4 r CT_RElt */ + function parse_r(r) { + var terms = [[],"",[]]; + /* 18.4.12 t ST_Xstring */ + var t = r.match(tregex); + if(!isval(t)) return ""; + terms[1] = t[1]; + var rpr = r.match(rpregex); + if(isval(rpr)) parse_rpr(rpr[1], terms[0], terms[2]); + return terms[0].join("") + terms[1].replace(/\r\n/g,'
') + terms[2].join(""); + } + return function(rs) { + return rs.replace(//g,"").split(/<\/r>/).map(parse_r).join(""); + }; +})(); + +/* 18.4.8 si CT_Rst */ +var parse_si = function(x) { + var z = {}; + if(!x) return z; + var y; + /* 18.4.12 t ST_Xstring (Plaintext String) */ + if(x[1] === 't') { + z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,""))); + z.raw = x; + z.r = z.t; + } + /* 18.4.4 r CT_RElt (Rich Text Run) */ + else if((y = x.match(//))) { + z.raw = x; + /* TODO: properly parse (note: no other valid child can have body text) */ + z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,""))); + z.r = parse_rs(x); + } + /* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */ + /* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */ + return z; +}; + +/* 18.4 Shared String Table */ +var parse_sst = (function(){ return function(data) { var s = []; /* 18.4.9 sst CT_Sst */ diff --git a/bits/70_xlsx.js b/bits/70_xlsx.js index 3941e1c..fdbefca 100644 --- a/bits/70_xlsx.js +++ b/bits/70_xlsx.js @@ -498,16 +498,13 @@ function parseComments(data) { data.match(/([^\u2603]*)<\/commentList>/m)[1].split('').forEach(function(x, index) { if(x === "" || x.trim() === "") return; var y = parsexmltag(x.match(/]*>/)[0]); - var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid, texts:[] }; + var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid }; var textMatch = x.match(/([^\u2603]*)<\/text>/m); if (!textMatch || !textMatch[1]) return; // a comment may contain an empty text tag. - textMatch[1].split('').forEach(function(r) { - if(r === "" || r.trim() === "") return; - /* 18.4.12 t ST_Xstring */ - var ct = r.match(matchtag('t')); - comment.texts.push(utf8read(unescapexml(ct[1]))); - // TODO: parse rich text format - }); + var rt = parse_si(textMatch[1]); + comment.raw = rt.raw; + comment.t = rt.t; + comment.r = rt.r; commentList.push(comment); }); return commentList; @@ -543,7 +540,7 @@ function insertCommentsIntoSheet(sheetName, sheet, comments) { if (!cell.c) { cell.c = []; } - cell.c.push({a: comment.author, t: comment.texts}); + cell.c.push({a: comment.author, t: comment.t, raw: comment.raw, r: comment.r}); }); } diff --git a/test.js b/test.js index 98395a6..dfd4088 100644 --- a/test.js +++ b/test.js @@ -41,12 +41,17 @@ describe('should parse test files', function() { }); describe('should have comment as part of cell\'s properties', function(){ - it('Parse comments.xml and insert into cell',function(){ + var ws; + before(function() { + XLSX = require('./xlsx'); var wb = XLSX.readFile('./test_files/apachepoi_SimpleWithComments.xlsx'); var sheetName = 'Sheet1'; - var ws = wb.Sheets[sheetName]; + ws = wb.Sheets[sheetName]; + }); + it('Parse comments.xml and insert into cell',function(){ assert.equal(ws.B1.c.length, 1,"must have 1 comment"); - assert.equal(ws.B1.c[0].t.length, 2,"must have 2 texts"); - assert.equal(ws.B1.c[0].a, 'Yegor Kozlov',"must have the same author"); + assert.equal(ws.B1.c[0].t, "Yegor Kozlov:\r\nfirst cell", "must have the concatenated texts"); + assert.equal(ws.B1.c[0].r, 'Yegor Kozlov:
first cell
', "must have the html representation"); + assert.equal(ws.B1.c[0].a, "Yegor Kozlov","must have the same author"); }); }); diff --git a/xlsx.js b/xlsx.js index ac664aa..d346cf1 100644 --- a/xlsx.js +++ b/xlsx.js @@ -439,134 +439,133 @@ function parseVector(data) { } function isval(x) { return typeof x !== "undefined" && x !== null; } -/* 18.4 Shared String Table */ -var parse_sst = (function(){ +/* Parse a list of tags */ +var parse_rs = (function() { var tregex = matchtag("t"), rpregex = matchtag("rPr"); - /* Parse a list of tags */ - var parse_rs = (function() { - /* 18.4.7 rPr CT_RPrElt */ - var parse_rpr = function(rpr, intro, outro) { - var font = {}; - (rpr.match(/<[^>]*>/g)||[]).forEach(function(x) { - var y = parsexmltag(x); - switch(y[0]) { - /* 18.8.12 condense CT_BooleanProperty */ - /* ** not required . */ - case ']*>/g)||[]).forEach(function(x) { + var y = parsexmltag(x); + switch(y[0]) { + /* 18.8.12 condense CT_BooleanProperty */ + /* ** not required . */ + case '': font.strike = 1; break; - case '': break; + /* 18.4.10 strike CT_BooleanProperty */ + case '': font.strike = 1; break; + case '': break; - /* 18.4.13 u CT_UnderlineProperty */ - case '': font.u = 1; break; - case '': break; + /* 18.4.13 u CT_UnderlineProperty */ + case '': font.u = 1; break; + case '': break; - /* 18.8.2 b */ - case '': font.b = 1; break; - case '': break; + /* 18.8.2 b */ + case '': font.b = 1; break; + case '': break; - /* 18.8.26 i */ - case '': font.i = 1; break; - case '': break; + /* 18.8.26 i */ + case '': font.i = 1; break; + case '': break; - /* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */ - case ''); - outro.push(""); - }; - - /* 18.4.4 r CT_RElt */ - function parse_r(r) { - var terms = [[],"",[]]; - /* 18.4.12 t ST_Xstring */ - var t = r.match(tregex); - if(!isval(t)) return ""; - terms[1] = t[1]; - - var rpr = r.match(rpregex); - if(isval(rpr)) parse_rpr(rpr[1], terms[0], terms[2]); - return terms[0].join("") + terms[1].replace(/\r\n/g,'
') + terms[2].join(""); - } - return function(rs) { - return rs.replace(//g,"").split(/<\/r>/).map(parse_r).join(""); - }; - })(); - - /* 18.4.8 si CT_Rst */ - var parse_si = function(x) { - var z = {}; - if(!x) return z; - var y; - /* 18.4.12 t ST_Xstring (Plaintext String) */ - if(x[1] === 't') { - z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,""))); - z.raw = x; - z.r = z.t; - } - /* 18.4.4 r CT_RElt (Rich Text Run) */ - else if((y = x.match(//))) { - z.raw = x; - /* TODO: properly parse (note: no other valid child can have body text) */ - z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,""))); - z.r = parse_rs(x); - } - /* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */ - /* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */ - return z; + default: + if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0]; + } + }); + /* TODO: These should be generated styles, not inline */ + var style = []; + if(font.b) style.push("font-weight: bold;"); + if(font.i) style.push("font-style: italic;"); + intro.push(''); + outro.push(""); }; + /* 18.4.4 r CT_RElt */ + function parse_r(r) { + var terms = [[],"",[]]; + /* 18.4.12 t ST_Xstring */ + var t = r.match(tregex); + if(!isval(t)) return ""; + terms[1] = t[1]; + var rpr = r.match(rpregex); + if(isval(rpr)) parse_rpr(rpr[1], terms[0], terms[2]); + return terms[0].join("") + terms[1].replace(/\r\n/g,'
') + terms[2].join(""); + } + return function(rs) { + return rs.replace(//g,"").split(/<\/r>/).map(parse_r).join(""); + }; +})(); + +/* 18.4.8 si CT_Rst */ +var parse_si = function(x) { + var z = {}; + if(!x) return z; + var y; + /* 18.4.12 t ST_Xstring (Plaintext String) */ + if(x[1] === 't') { + z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,""))); + z.raw = x; + z.r = z.t; + } + /* 18.4.4 r CT_RElt (Rich Text Run) */ + else if((y = x.match(//))) { + z.raw = x; + /* TODO: properly parse (note: no other valid child can have body text) */ + z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,""))); + z.r = parse_rs(x); + } + /* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */ + /* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */ + return z; +}; + +/* 18.4 Shared String Table */ +var parse_sst = (function(){ return function(data) { var s = []; /* 18.4.9 sst CT_Sst */ @@ -1078,16 +1077,13 @@ function parseComments(data) { data.match(/([^\u2603]*)<\/commentList>/m)[1].split('').forEach(function(x, index) { if(x === "" || x.trim() === "") return; var y = parsexmltag(x.match(/]*>/)[0]); - var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid, texts:[] }; + var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid }; var textMatch = x.match(/([^\u2603]*)<\/text>/m); if (!textMatch || !textMatch[1]) return; // a comment may contain an empty text tag. - textMatch[1].split('').forEach(function(r) { - if(r === "" || r.trim() === "") return; - /* 18.4.12 t ST_Xstring */ - var ct = r.match(matchtag('t')); - comment.texts.push(utf8read(unescapexml(ct[1]))); - // TODO: parse rich text format - }); + var rt = parse_si(textMatch[1]); + comment.raw = rt.raw; + comment.t = rt.t; + comment.r = rt.r; commentList.push(comment); }); return commentList; @@ -1123,7 +1119,7 @@ function insertCommentsIntoSheet(sheetName, sheet, comments) { if (!cell.c) { cell.c = []; } - cell.c.push({a: comment.author, t: comment.texts}); + cell.c.push({a: comment.author, t: comment.t, raw: comment.raw, r: comment.r}); }); }