Share the code for parsing the rich text and use it to parse the comments.

This commit is contained in:
Hugues Malphettes 2014-01-18 21:45:49 +08:00
parent 36f7080a68
commit 5d43dffc79
4 changed files with 241 additions and 244 deletions

@ -1,131 +1,130 @@
/* 18.4 Shared String Table */
var parse_sst = (function(){
/* Parse a list of <r> tags */
var parse_rs = (function() {
var tregex = matchtag("t"), rpregex = matchtag("rPr");
/* Parse a list of <r> tags */
var parse_rs = (function() {
/* 18.4.7 rPr CT_RPrElt */
var parse_rpr = function(rpr, intro, outro) {
var font = {};
(rpr.match(/<[^>]*>/g)||[]).forEach(function(x) {
var y = parsexmltag(x);
switch(y[0]) {
/* 18.8.12 condense CT_BooleanProperty */
/* ** not required . */
case '<condense': break;
/* 18.8.17 extend CT_BooleanProperty */
/* ** not required . */
case '<extend': break;
/* 18.8.36 shadow CT_BooleanProperty */
/* ** not required . */
case '<shadow': break;
/* 18.4.7 rPr CT_RPrElt */
var parse_rpr = function(rpr, intro, outro) {
var font = {};
(rpr.match(/<[^>]*>/g)||[]).forEach(function(x) {
var y = parsexmltag(x);
switch(y[0]) {
/* 18.8.12 condense CT_BooleanProperty */
/* ** not required . */
case '<condense': break;
/* 18.8.17 extend CT_BooleanProperty */
/* ** not required . */
case '<extend': break;
/* 18.8.36 shadow CT_BooleanProperty */
/* ** not required . */
case '<shadow': break;
/* 18.4.1 charset CT_IntProperty TODO */
case '<charset': break;
/* 18.4.1 charset CT_IntProperty TODO */
case '<charset': break;
/* 18.4.2 outline CT_BooleanProperty TODO */
case '<outline': break;
/* 18.4.2 outline CT_BooleanProperty TODO */
case '<outline': break;
/* 18.4.5 rFont CT_FontName */
case '<rFont': font.name = y.val; break;
/* 18.4.5 rFont CT_FontName */
case '<rFont': font.name = y.val; break;
/* 18.4.11 sz CT_FontSize */
case '<sz': font.sz = y.val; break;
/* 18.4.11 sz CT_FontSize */
case '<sz': font.sz = y.val; break;
/* 18.4.10 strike CT_BooleanProperty */
case '<strike':
if(!y.val) break;
/* falls through */
case '<strike/>': font.strike = 1; break;
case '</strike>': break;
/* 18.4.10 strike CT_BooleanProperty */
case '<strike':
if(!y.val) break;
/* falls through */
case '<strike/>': font.strike = 1; break;
case '</strike>': break;
/* 18.4.13 u CT_UnderlineProperty */
case '<u':
if(!y.val) break;
/* falls through */
case '<u/>': font.u = 1; break;
case '</u>': break;
/* 18.4.13 u CT_UnderlineProperty */
case '<u':
if(!y.val) break;
/* falls through */
case '<u/>': font.u = 1; break;
case '</u>': break;
/* 18.8.2 b */
case '<b':
if(!y.val) break;
/* falls through */
case '<b/>': font.b = 1; break;
case '</b>': break;
/* 18.8.2 b */
case '<b':
if(!y.val) break;
/* falls through */
case '<b/>': font.b = 1; break;
case '</b>': break;
/* 18.8.26 i */
case '<i':
if(!y.val) break;
/* falls through */
case '<i/>': font.i = 1; break;
case '</i>': break;
/* 18.8.26 i */
case '<i':
if(!y.val) break;
/* falls through */
case '<i/>': font.i = 1; break;
case '</i>': break;
/* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */
case '<color':
if(y.rgb) font.color = y.rgb.substr(2,6);
break;
/* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */
case '<color':
if(y.rgb) font.color = y.rgb.substr(2,6);
break;
/* 18.8.18 family ST_FontFamily */
case '<family': font.family = y.val; break;
/* 18.8.18 family ST_FontFamily */
case '<family': font.family = y.val; break;
/* 18.4.14 vertAlign CT_VerticalAlignFontProperty TODO */
case '<vertAlign': break;
/* 18.4.14 vertAlign CT_VerticalAlignFontProperty TODO */
case '<vertAlign': break;
/* 18.8.35 scheme CT_FontScheme TODO */
case '<scheme': break;
/* 18.8.35 scheme CT_FontScheme TODO */
case '<scheme': break;
default:
if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0];
}
});
/* TODO: These should be generated styles, not inline */
var style = [];
if(font.b) style.push("font-weight: bold;");
if(font.i) style.push("font-style: italic;");
intro.push('<span style="' + style.join("") + '">');
outro.push("</span>");
};
/* 18.4.4 r CT_RElt */
/**
 * Render one rich-text run as an HTML fragment.
 *
 * @param {string} r - markup of a single run (the text found between `<r>` and `</r>`).
 * @returns {string} the first capture of `tregex`, wrapped in whatever
 *   `parse_rpr` pushes for the run's `<rPr>` child; "" when `tregex` does not match.
 */
function parse_r(r) {
	/* 18.4.12 t ST_Xstring */
	var t = r.match(tregex);
	if(!isval(t)) return "";
	var intro = [], outro = [];
	var rpr = r.match(rpregex);
	if(isval(rpr)) parse_rpr(rpr[1], intro, outro);
	/* a line break may be stored as CRLF or as a bare LF; both become <br/> */
	return intro.join("") + t[1].replace(/\r?\n/g,'<br/>') + outro.join("");
}
return function(rs) {
return rs.replace(/<r>/g,"").split(/<\/r>/).map(parse_r).join("");
};
})();
/* 18.4.8 si CT_Rst */
var parse_si = function(x) {
var z = {};
if(!x) return z;
var y;
/* 18.4.12 t ST_Xstring (Plaintext String) */
if(x[1] === 't') {
z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,"")));
z.raw = x;
z.r = z.t;
}
/* 18.4.4 r CT_RElt (Rich Text Run) */
else if((y = x.match(/<r>/))) {
z.raw = x;
/* TODO: properly parse (note: no other valid child can have body text) */
z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,"")));
z.r = parse_rs(x);
}
/* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */
/* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */
return z;
default:
if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0];
}
});
/* TODO: These should be generated styles, not inline */
var style = [];
if(font.b) style.push("font-weight: bold;");
if(font.i) style.push("font-style: italic;");
intro.push('<span style="' + style.join("") + '">');
outro.push("</span>");
};
/* 18.4.4 r CT_RElt */
/**
 * Render one rich-text run as an HTML fragment.
 *
 * @param {string} r - markup of a single run (the text found between `<r>` and `</r>`).
 * @returns {string} the first capture of `tregex`, wrapped in whatever
 *   `parse_rpr` pushes for the run's `<rPr>` child; "" when `tregex` does not match.
 */
function parse_r(r) {
	/* 18.4.12 t ST_Xstring */
	var t = r.match(tregex);
	if(!isval(t)) return "";
	var intro = [], outro = [];
	var rpr = r.match(rpregex);
	if(isval(rpr)) parse_rpr(rpr[1], intro, outro);
	/* a line break may be stored as CRLF or as a bare LF; both become <br/> */
	return intro.join("") + t[1].replace(/\r?\n/g,'<br/>') + outro.join("");
}
return function(rs) {
return rs.replace(/<r>/g,"").split(/<\/r>/).map(parse_r).join("");
};
})();
/* 18.4.8 si CT_Rst */
/**
 * Parse the children of a string item into plain-text and rich forms.
 *
 * @param {string} x - markup holding either a leading `<t>` element or a list of `<r>` runs.
 * @returns {{t?: string, raw?: string, r?: string}} `raw` is the input markup,
 *   `t` the decoded text and `r` the rendered form (equal to `t` for a plain
 *   string, the result of `parse_rs` for rich text). An empty object is
 *   returned for empty input or when neither a leading `<t>` nor an `<r>` is found.
 */
var parse_si = function(x) {
	var z = {};
	if(!x) return z;
	/* 18.4.12 t ST_Xstring (Plaintext String) */
	/* whitespace may precede the tag when the enclosing XML is pretty-printed */
	if(/^\s*<t[^>]*>/.test(x)) {
		/* keep only the body of the leading <t>, so that indentation before the
		   tag and siblings after </t> do not leak into the text */
		var body = x.slice(x.indexOf(">") + 1).split("</t>")[0];
		z.t = utf8read(unescapexml(body.replace(/<[^>]*>/g,"")));
		z.raw = x;
		z.r = z.t;
	}
	/* 18.4.4 r CT_RElt (Rich Text Run) */
	else if(/<r>/.test(x)) {
		z.raw = x;
		/* TODO: properly parse; for now every tag is stripped and the remaining text kept */
		z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,"")));
		z.r = parse_rs(x);
	}
	/* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */
	/* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */
	return z;
};
/* 18.4 Shared String Table */
var parse_sst = (function(){
return function(data) {
var s = [];
/* 18.4.9 sst CT_Sst */

@ -498,16 +498,13 @@ function parseComments(data) {
data.match(/<commentList>([^\u2603]*)<\/commentList>/m)[1].split('</comment>').forEach(function(x, index) {
if(x === "" || x.trim() === "") return;
var y = parsexmltag(x.match(/<comment[^>]*>/)[0]);
var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid, texts:[] };
var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid };
var textMatch = x.match(/<text>([^\u2603]*)<\/text>/m);
if (!textMatch || !textMatch[1]) return; // a comment may contain an empty text tag.
textMatch[1].split('</r>').forEach(function(r) {
if(r === "" || r.trim() === "") return;
/* 18.4.12 t ST_Xstring */
var ct = r.match(matchtag('t'));
comment.texts.push(utf8read(unescapexml(ct[1])));
// TODO: parse rich text format
});
var rt = parse_si(textMatch[1]);
comment.raw = rt.raw;
comment.t = rt.t;
comment.r = rt.r;
commentList.push(comment);
});
return commentList;
@ -543,7 +540,7 @@ function insertCommentsIntoSheet(sheetName, sheet, comments) {
if (!cell.c) {
cell.c = [];
}
cell.c.push({a: comment.author, t: comment.texts});
cell.c.push({a: comment.author, t: comment.t, raw: comment.raw, r: comment.r});
});
}

13
test.js

@ -41,12 +41,17 @@ describe('should parse test files', function() {
});
describe('should have comment as part of cell\'s properties', function(){
it('Parse comments.xml and insert into cell',function(){
var ws;
before(function() {
XLSX = require('./xlsx');
var wb = XLSX.readFile('./test_files/apachepoi_SimpleWithComments.xlsx');
var sheetName = 'Sheet1';
var ws = wb.Sheets[sheetName];
ws = wb.Sheets[sheetName];
});
it('Parse comments.xml and insert into cell',function(){
assert.equal(ws.B1.c.length, 1,"must have 1 comment");
assert.equal(ws.B1.c[0].t.length, 2,"must have 2 texts");
assert.equal(ws.B1.c[0].a, 'Yegor Kozlov',"must have the same author");
assert.equal(ws.B1.c[0].t, "Yegor Kozlov:\r\nfirst cell", "must have the concatenated texts");
assert.equal(ws.B1.c[0].r, '<span style="font-weight: bold;">Yegor Kozlov:</span><span style=""><br/>first cell</span>', "must have the html representation");
assert.equal(ws.B1.c[0].a, "Yegor Kozlov","must have the same author");
});
});

236
xlsx.js

@ -439,134 +439,133 @@ function parseVector(data) {
}
function isval(x) { return typeof x !== "undefined" && x !== null; }
/* 18.4 Shared String Table */
var parse_sst = (function(){
/* Parse a list of <r> tags */
var parse_rs = (function() {
var tregex = matchtag("t"), rpregex = matchtag("rPr");
/* Parse a list of <r> tags */
var parse_rs = (function() {
/* 18.4.7 rPr CT_RPrElt */
var parse_rpr = function(rpr, intro, outro) {
var font = {};
(rpr.match(/<[^>]*>/g)||[]).forEach(function(x) {
var y = parsexmltag(x);
switch(y[0]) {
/* 18.8.12 condense CT_BooleanProperty */
/* ** not required . */
case '<condense': break;
/* 18.8.17 extend CT_BooleanProperty */
/* ** not required . */
case '<extend': break;
/* 18.8.36 shadow CT_BooleanProperty */
/* ** not required . */
case '<shadow': break;
/* 18.4.7 rPr CT_RPrElt */
var parse_rpr = function(rpr, intro, outro) {
var font = {};
(rpr.match(/<[^>]*>/g)||[]).forEach(function(x) {
var y = parsexmltag(x);
switch(y[0]) {
/* 18.8.12 condense CT_BooleanProperty */
/* ** not required . */
case '<condense': break;
/* 18.8.17 extend CT_BooleanProperty */
/* ** not required . */
case '<extend': break;
/* 18.8.36 shadow CT_BooleanProperty */
/* ** not required . */
case '<shadow': break;
/* 18.4.1 charset CT_IntProperty TODO */
case '<charset': break;
/* 18.4.1 charset CT_IntProperty TODO */
case '<charset': break;
/* 18.4.2 outline CT_BooleanProperty TODO */
case '<outline': break;
/* 18.4.2 outline CT_BooleanProperty TODO */
case '<outline': break;
/* 18.4.5 rFont CT_FontName */
case '<rFont': font.name = y.val; break;
/* 18.4.5 rFont CT_FontName */
case '<rFont': font.name = y.val; break;
/* 18.4.11 sz CT_FontSize */
case '<sz': font.sz = y.val; break;
/* 18.4.11 sz CT_FontSize */
case '<sz': font.sz = y.val; break;
/* 18.4.10 strike CT_BooleanProperty */
case '<strike':
if(!y.val) break;
/* falls through */
case '<strike/>': font.strike = 1; break;
case '</strike>': break;
/* 18.4.10 strike CT_BooleanProperty */
case '<strike':
if(!y.val) break;
/* falls through */
case '<strike/>': font.strike = 1; break;
case '</strike>': break;
/* 18.4.13 u CT_UnderlineProperty */
case '<u':
if(!y.val) break;
/* falls through */
case '<u/>': font.u = 1; break;
case '</u>': break;
/* 18.4.13 u CT_UnderlineProperty */
case '<u':
if(!y.val) break;
/* falls through */
case '<u/>': font.u = 1; break;
case '</u>': break;
/* 18.8.2 b */
case '<b':
if(!y.val) break;
/* falls through */
case '<b/>': font.b = 1; break;
case '</b>': break;
/* 18.8.2 b */
case '<b':
if(!y.val) break;
/* falls through */
case '<b/>': font.b = 1; break;
case '</b>': break;
/* 18.8.26 i */
case '<i':
if(!y.val) break;
/* falls through */
case '<i/>': font.i = 1; break;
case '</i>': break;
/* 18.8.26 i */
case '<i':
if(!y.val) break;
/* falls through */
case '<i/>': font.i = 1; break;
case '</i>': break;
/* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */
case '<color':
if(y.rgb) font.color = y.rgb.substr(2,6);
break;
/* 18.3.1.15 color CT_Color TODO: tint, theme, auto, indexed */
case '<color':
if(y.rgb) font.color = y.rgb.substr(2,6);
break;
/* 18.8.18 family ST_FontFamily */
case '<family': font.family = y.val; break;
/* 18.8.18 family ST_FontFamily */
case '<family': font.family = y.val; break;
/* 18.4.14 vertAlign CT_VerticalAlignFontProperty TODO */
case '<vertAlign': break;
/* 18.4.14 vertAlign CT_VerticalAlignFontProperty TODO */
case '<vertAlign': break;
/* 18.8.35 scheme CT_FontScheme TODO */
case '<scheme': break;
/* 18.8.35 scheme CT_FontScheme TODO */
case '<scheme': break;
default:
if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0];
}
});
/* TODO: These should be generated styles, not inline */
var style = [];
if(font.b) style.push("font-weight: bold;");
if(font.i) style.push("font-style: italic;");
intro.push('<span style="' + style.join("") + '">');
outro.push("</span>");
};
/* 18.4.4 r CT_RElt */
/**
 * Render one rich-text run as an HTML fragment.
 *
 * @param {string} r - markup of a single run (the text found between `<r>` and `</r>`).
 * @returns {string} the first capture of `tregex`, wrapped in whatever
 *   `parse_rpr` pushes for the run's `<rPr>` child; "" when `tregex` does not match.
 */
function parse_r(r) {
	/* 18.4.12 t ST_Xstring */
	var t = r.match(tregex);
	if(!isval(t)) return "";
	var intro = [], outro = [];
	var rpr = r.match(rpregex);
	if(isval(rpr)) parse_rpr(rpr[1], intro, outro);
	/* a line break may be stored as CRLF or as a bare LF; both become <br/> */
	return intro.join("") + t[1].replace(/\r?\n/g,'<br/>') + outro.join("");
}
return function(rs) {
return rs.replace(/<r>/g,"").split(/<\/r>/).map(parse_r).join("");
};
})();
/* 18.4.8 si CT_Rst */
var parse_si = function(x) {
var z = {};
if(!x) return z;
var y;
/* 18.4.12 t ST_Xstring (Plaintext String) */
if(x[1] === 't') {
z.t = utf8read(unescapexml(x.replace(/<[^>]*>/g,"")));
z.raw = x;
z.r = z.t;
}
/* 18.4.4 r CT_RElt (Rich Text Run) */
else if((y = x.match(/<r>/))) {
z.raw = x;
/* TODO: properly parse (note: no other valid child can have body text) */
z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,"")));
z.r = parse_rs(x);
}
/* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */
/* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */
return z;
default:
if(y[0][2] !== '/') throw 'Unrecognized rich format ' + y[0];
}
});
/* TODO: These should be generated styles, not inline */
var style = [];
if(font.b) style.push("font-weight: bold;");
if(font.i) style.push("font-style: italic;");
intro.push('<span style="' + style.join("") + '">');
outro.push("</span>");
};
/* 18.4.4 r CT_RElt */
/**
 * Render one rich-text run as an HTML fragment.
 *
 * @param {string} r - markup of a single run (the text found between `<r>` and `</r>`).
 * @returns {string} the first capture of `tregex`, wrapped in whatever
 *   `parse_rpr` pushes for the run's `<rPr>` child; "" when `tregex` does not match.
 */
function parse_r(r) {
	/* 18.4.12 t ST_Xstring */
	var t = r.match(tregex);
	if(!isval(t)) return "";
	var intro = [], outro = [];
	var rpr = r.match(rpregex);
	if(isval(rpr)) parse_rpr(rpr[1], intro, outro);
	/* a line break may be stored as CRLF or as a bare LF; both become <br/> */
	return intro.join("") + t[1].replace(/\r?\n/g,'<br/>') + outro.join("");
}
return function(rs) {
return rs.replace(/<r>/g,"").split(/<\/r>/).map(parse_r).join("");
};
})();
/* 18.4.8 si CT_Rst */
/**
 * Parse the children of a string item into plain-text and rich forms.
 *
 * @param {string} x - markup holding either a leading `<t>` element or a list of `<r>` runs.
 * @returns {{t?: string, raw?: string, r?: string}} `raw` is the input markup,
 *   `t` the decoded text and `r` the rendered form (equal to `t` for a plain
 *   string, the result of `parse_rs` for rich text). An empty object is
 *   returned for empty input or when neither a leading `<t>` nor an `<r>` is found.
 */
var parse_si = function(x) {
	var z = {};
	if(!x) return z;
	/* 18.4.12 t ST_Xstring (Plaintext String) */
	/* whitespace may precede the tag when the enclosing XML is pretty-printed */
	if(/^\s*<t[^>]*>/.test(x)) {
		/* keep only the body of the leading <t>, so that indentation before the
		   tag and siblings after </t> do not leak into the text */
		var body = x.slice(x.indexOf(">") + 1).split("</t>")[0];
		z.t = utf8read(unescapexml(body.replace(/<[^>]*>/g,"")));
		z.raw = x;
		z.r = z.t;
	}
	/* 18.4.4 r CT_RElt (Rich Text Run) */
	else if(/<r>/.test(x)) {
		z.raw = x;
		/* TODO: properly parse; for now every tag is stripped and the remaining text kept */
		z.t = utf8read(unescapexml(x.replace(/<[^>]*>/gm,"")));
		z.r = parse_rs(x);
	}
	/* 18.4.3 phoneticPr CT_PhoneticPr (TODO: needed for Asian support) */
	/* 18.4.6 rPh CT_PhoneticRun (TODO: needed for Asian support) */
	return z;
};
/* 18.4 Shared String Table */
var parse_sst = (function(){
return function(data) {
var s = [];
/* 18.4.9 sst CT_Sst */
@ -1078,16 +1077,13 @@ function parseComments(data) {
data.match(/<commentList>([^\u2603]*)<\/commentList>/m)[1].split('</comment>').forEach(function(x, index) {
if(x === "" || x.trim() === "") return;
var y = parsexmltag(x.match(/<comment[^>]*>/)[0]);
var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid, texts:[] };
var comment = { author: y.authorId && authors[y.authorId] ? authors[y.authorId] : undefined, ref: y.ref, guid: y.guid };
var textMatch = x.match(/<text>([^\u2603]*)<\/text>/m);
if (!textMatch || !textMatch[1]) return; // a comment may contain an empty text tag.
textMatch[1].split('</r>').forEach(function(r) {
if(r === "" || r.trim() === "") return;
/* 18.4.12 t ST_Xstring */
var ct = r.match(matchtag('t'));
comment.texts.push(utf8read(unescapexml(ct[1])));
// TODO: parse rich text format
});
var rt = parse_si(textMatch[1]);
comment.raw = rt.raw;
comment.t = rt.t;
comment.r = rt.r;
commentList.push(comment);
});
return commentList;
@ -1123,7 +1119,7 @@ function insertCommentsIntoSheet(sheetName, sheet, comments) {
if (!cell.c) {
cell.c = [];
}
cell.c.push({a: comment.author, t: comment.texts});
cell.c.push({a: comment.author, t: comment.t, raw: comment.raw, r: comment.r});
});
}