initial

2012-12-04 14:27:20 -05:00 · 2012-12-04 14:27:20 -05:00 · 5806249485
commit 5806249485
parent 13fbe76d6b
6 changed files with 2589 additions and 3 deletions
--- a/14
+++ b/14
@ -0,0 +1,14 @@
+Copyright (C) 2012 Niggler 
+
+The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+Except where noted, this license applies to any and all software programs and associated documentation files created by the Original Author and distributed with the Software:
+
+'jszip.js' is a modified version of JSZip, Copyright (c) Stuart Knightley, David Duponchel, Franz Buchinger, António Afonso.  JSZip is dual licensed and is used according to the terms of the MIT License. 
--- a/README.md
+++ b/README.md
@ -1,4 +1,54 @@
-js-xlsx
-=======
+# xlsx
+
+Currently a parser for XLSX files.  Cleanroom implementation from the ISO 29500  Office Open XML specifications.
+
+This has been tested on some very basic XLSX files generated from Excel 2011.
+
+*THIS WAS WHIPPED UP VERY QUICKLY TO SATISFY A VERY SPECIFIC NEED*.  If you need something that is not currently supported, file an issue and attach a sample file.  I will get to it :)
+
+## Installation
+
+In node:
+
+    npm install xlsx
+
+In the browser:
+
+    <script lang="javascript" src="/path/to/jszip.js"></script>
+    <script lang="javascript" src="/path/to/xlsx.js"></script>
+
+## Usage
+
+See `xlsx2csv.njs` in the bin directory for usage in node.
+
+See http://niggler.github.com/js-xlsx/ for a browser example.
+
+## Notes 
+
+`.SheetNames` is an ordered list of the sheets in the workbook
+
+`.Sheets[sheetname]` returns a data structure representing the sheet.  Each key
+that does not start with `!` corresponds to a cell (using `A1` notation).  
+
+`.Sheets[sheetname][address].v` returns the value of the cell and `.Sheets[sheetname][address].t` returns the type of the cell (constrained to the enumeration `ST_CellType` as documented in page 4215 of ISO/IEC 29500-1:2012(E) ) 
+
+Simple usage:
+
+    var XLSX = require('xlsx');
+    var xlsx = XLSX.readFile('test.xlsx');
+    var sheet_name_list = xlsx.SheetNames;
+    sheet_name_list.forEach(function(y) {
+      for (var z in xlsx.Sheets[y]) {
+        if(z[0] === '!') continue;
+        console.log(y + "!" + z + "=" + JSON.stringify(xlsx.Sheets[y][z].v));
+      }
+    });
+
+## License
+
+Please consult the attached LICENSE file for details.  All rights not explicitly granted by the MIT license are reserved by the Original Author.
+
+## References
+
+ISO/IEC 29500:2012(E) "Information technology — Document description and processing languages — Office Open XML File Formats"

-Javascript XLSX parser and (one day) writer
--- a/bin/xlsx2csv.njs
+++ b/bin/xlsx2csv.njs
@ -0,0 +1,46 @@
+#!/usr/bin/env node
+
+var XLSX = require('../xlsx');
+var utils = XLSX.utils;
+var filename = process.argv[2];
+if(!filename || filename == "-h" || filename === "--help") {
+	console.log("usage:",process.argv[1],"<workbook> [sheet]");
+	console.log("  when sheet = :list, print a list of sheets in the workbook");
+	process.exit(0);
+}
+var fs = require('fs');
+if(!fs.existsSync(filename)) {
+	console.error("error:",filename,"does not exist!");
+	process.exit(1);
+}
+var xlsx = XLSX.readFile(filename);
+var sheetname = process.argv[3] || xlsx.SheetNames[0];
+if(sheetname === ":list") {
+	xlsx.SheetNames.forEach(function(x) { console.log(x); });
+	process.exit(0);
+}
+if(xlsx.SheetNames.indexOf(sheetname)===-1) {
+	console.error("Sheet", sheetname, "not found in", filename, ".  I see:");
+	xlsx.SheetNames.forEach(function(x) { console.error(" - " + x); });
+	process.exit(1);
+}
+
+function stringify(val) {
+	switch(val.t){
+		case 'n': return val.v;
+		case 's': case 'str': return JSON.stringify(val.v);
+		default: throw 'unrecognized type ' + val.t;
+	}
+}
+var sheet = xlsx.Sheets[sheetname];
+if(sheet["!ref"]) {
+	var r = utils.decode_range(sheet["!ref"]);
+	for(var R = r.s.r; R <= r.e.r; ++R) { 
+		var row = [];
+		for(var C = r.s.c; C <= r.e.c; ++C) {
+			var val = sheet[utils.encode_cell({c:C,r:R})];
+			row.push(val ? stringify(val) : "");
+		}
+		console.log(row.join(","));
+	}
+}
--- a/jszip.js
+++ b/jszip.js
--- a/package.json
+++ b/package.json
@ -0,0 +1,23 @@
+{
+	"name": "xlsx",
+	"version": "0.0.3",
+	"author": "Niggler",
+	"description": "(one day) a full-featured XLSX parser and writer.  For now, primitive parser",
+	"keywords": [
+		"xlsx", "office", "excel", "spreadsheet"	
+	], 
+	"bin": {
+		"xlsx2csv": "./bin/xlsx2csv.njs"
+	},
+	"main": "./xlsx",
+	"repository": {
+		"type":"git",
+		"url": "git://github.com/Niggler/js-xlsx.git"
+	},
+	"bugs": {
+		"url": "https://github.com/Niggler/js-xlsx/issues"
+	},
+	"engines": {
+		"node": ">=0.8"
+	}
+}
--- a/xlsx.js
+++ b/xlsx.js
@ -0,0 +1,325 @@
+/* vim: set ts=2:*/
+/*jshint eqnull:true */
+var XLSX = (function(){
+var debug = 0;
+var ct2type = {
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml": "workbooks",
+	"application/vnd.openxmlformats-package.core-properties+xml": "coreprops",
+	"application/vnd.openxmlformats-officedocument.extended-properties+xml": "extprops",
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.calcChain+xml": "calcchains",
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml":"sheets",
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml": "strs",	
+	"application/vnd.openxmlformats-officedocument.spreadsheetml.styles+xml":"styles",
+	"application/vnd.openxmlformats-officedocument.theme+xml":"themes",
+	"foo": "bar"
+};
+
+var WBPropsDef = {
+	allowRefreshQuery: '0',
+	autoCompressPictures: '1',
+	backupFile: '0',
+	checkCompatibility: '0',
+	codeName: '',
+	date1904: '0',
+	dateCompatibility: '1',
+	//defaultThemeVersion: '0',
+	filterPrivacy: '0',
+	hidePivotFieldList: '0',
+	promptedSolutions: '0',
+	publishItems: '0',
+	refreshAllConnections: false,
+	saveExternalLinkValues: '1',
+	showBorderUnselectedTables: '1',
+	showInkAnnotation: '1',
+	showObjects: 'all',
+	showPivotChartFilter: '0'
+	//updateLinks: 'userSet'
+};
+
+var WBViewDef = {
+	activeTab: '0',
+	autoFilterDateGrouping: '1',
+	firstSheet: '0',
+	minimized: '0',
+	showHorizontalScroll: '1',
+	showSheetTabs: '1',
+	showVerticalScroll: '1',
+	tabRatio: '600',
+	visibility: 'visible'
+	//window{Height,Width}, {x,y}Window
+};
+
+var SheetDef = {
+	state: 'visible'
+};
+
+var CalcPrDef = {
+	calcCompleted: '1',
+	calcMode: 'auto',
+	calcOnSave: '1',
+	concurrentCalc: '1',
+	fullCalcOnLoad: '0',
+	iterate: 'false',
+	iterateCount: '100',
+	iterateDelta: '0.001',
+	refMode: 'A1'
+};
+
+var XMLNS_CT = 'http://schemas.openxmlformats.org/package/2006/content-types';
+var XMLNS_WB = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
+
+var encodings = {
+	'&gt;': '>',
+	'&lt;': '<',
+	'&amp;': '&'
+};
+
+function unescapexml(text){
+	var s = text + '';
+	for(var y in encodings) s = s.replace(new RegExp(y,'g'), encodings[y]);
+	return s;
+}
+
+function parsexmltag(tag) {
+	var words = tag.split(/\s+/);
+	var z = {'0': words[0]};
+	if(words.length === 1) return z;
+	tag.match(/(\w+)="([^"]*)"/g).map(
+		function(x){var y=x.match(/(\w+)="([^"]*)"/); z[y[1]] = y[2]; });
+	return z; 
+}
+
+
+var strs = {}; // shared strings
+
+
+function parseSheet(data) { //TODO: use a real xml parser
+	var s = {};
+	s["!ref"] = data.match(/<dimension ref="([^"]*)"\s*\/>/)[1];
+	//s.rows = {};
+	//s.cells = {};
+	var q = ["v","f"];
+	data.match(/<sheetData>(.*)<\/sheetData>/)[1].split("</row>").forEach(function(x) { if(x === "") return;
+		var row = parsexmltag(x.match(/<row[^>]*>/)[0]); //s.rows[row.r]=row.spans;
+		var cells = x.substr(x.indexOf('>')+1).split("</c>");
+		cells.forEach(function(c) { if(c === "") return;
+			var cell = parsexmltag(c.match(/<c[^>]*>/)[0]); delete cell[0];
+			var d = c.substr(c.indexOf('>')+1);
+			var p = {};
+			q.forEach(function(f){var x=d.match(matchtag(f));if(x)p[f]=unescapexml(x[1]);});
+			p.t = (cell.t ? cell.t : "n"); // default is "n" in schema
+			switch(p.t) {
+				case 'n': p.v = parseFloat(p.v); break;
+				case 's': p.v = strs[parseInt(p.v, 10)].t; break;
+				case 'str': break; // normal string
+				default: throw "Unrecognized cell type: " + p.t;
+			}
+			//s.cells[cell.r] = p;
+			s[cell.r] = p;
+		});
+	});
+	
+	if(debug) s.rawdata = data;
+	return s;
+}
+
+// matches <foo>...</foo> extracts content
+// Builds a RegExp for the attribute-less element <f>...</f>; capture group 1 is
+// its content.  [\s\S] lets the content span newlines and *? stops at the first
+// closing tag, so sibling <f> elements are not merged into a single match.
+// `g` is an optional RegExp flags string.
+function matchtag(f,g) {return new RegExp('<' + f + '>([\\s\\S]*?)</' + f + '>',g||"");}
+
+function parseStrs(data) { 
+	var s = [];
+	var sst = data.match(new RegExp("<sst ([^>]*)>(.*)<\/sst>"));
+	s = sst[2].replace(/<si>/g,"").split(/<\/si>/).map(function(x) { var z = {};
+		var y=x.match(/<(.*)>(.*)<\/.*/); if(x) z[y[1]]=unescapexml(y[2]); return z;});
+	
+	sst = parsexmltag(sst[1]); s.count = sst.count; s.uniqueCount = sst.uniqueCount;
+	if(debug) s.rawdata = data;
+	return s;
+}
+
+function parseProps(data) {
+	var p = { Company:'' }, q = {};
+	var strings = ["Application", "DocSecurity", "Company", "AppVersion"];
+	var bools = ["HyperlinksChanged","SharedDoc","LinksUpToDate","ScaleCrop"];
+	var xtra = ["HeadingPairs", "TitlesOfParts","dc:creator","cp:lastModifiedBy","dcterms:created", "dcterms:modified"];
+	
+	strings.forEach(function(f){p[f] = data.match(matchtag(f))[1];});
+	bools.forEach(function(f){p[f] = data.match(matchtag(f))[1] == "true";});
+	xtra.forEach(function(f){q[f] = data.match(new RegExp("<" + f + "[^>]*>(.*)<\/" + f + ">"))[1];});
+
+	p["Worksheets"] = parseInt(q["HeadingPairs"].match(new RegExp("<vt:i4>(.*)<\/vt:i4>"))[1], 10); 
+	p["SheetNames"] = q["TitlesOfParts"].match(new RegExp("<vt:lpstr>([^<]*)<\/vt:lpstr>","g")).map(function(x){return x.match(new RegExp("<vt:lpstr>([^<]*)<\/vt:lpstr>"))[1];});
+	p["Creator"] = q["dc:creator"];
+	p["LastModifiedBy"] = q["cp:lastModifiedBy"];
+	p["CreatedDate"] = new Date(q["dcterms:created"]);
+	p["ModifiedDate"] = new Date(q["dcterms:modified"]);
+	
+	if(debug) p.rawdata = data;
+	return p;
+}
+
+function parseDeps(data) {
+	var d = [];
+	var l = 0, i = 1;
+	data.match(/<[^>]*>/g).forEach(function(x) {
+		var y = parsexmltag(x);
+		switch(y[0]) {
+			case '<?xml': break;
+			case '<calcChain': break;
+			case '<c': delete y[0]; if(y.i) i = y.i; else y.i = i; d.push(y); break;
+		}
+	});
+	if(debug) d.rawdata = data;
+	return d;
+}
+
+var ctext = {};
+
+function parseCT(data) {
+	var ct = { workbooks: [], sheets: [], calcchains: [], themes: [], styles: [], 
+		coreprops: [], extprops: [], strs:[], xmlns: "" };
+	if(data == null) return data;
+	data.match(/<[^>]*>/g).forEach(function(x) {
+		var y = parsexmltag(x);
+		switch(y[0]) {
+			case '<?xml': break;
+			case '<Types': ct.xmlns = y.xmlns; break;
+			case '<Default': ctext[y.Extension] = y.ContentType; break;
+			case '<Override': 
+				if(y.ContentType in ct2type)ct[ct2type[y.ContentType]].push(y.PartName);
+				break;
+		}
+	});
+	if(ct.xmlns !== XMLNS_CT) throw "Unknown Namespace: " + ct.xmlns;
+	ct.calcchain = ct.calcchains.length > 0 ? ct.calcchains[0] : "";
+	delete ct.calcchains;
+	if(debug) ct.rawdata = data;
+	return ct;
+}
+
+
+function parseWB(data) {
+	var wb = { AppVersion:{}, WBProps:{}, WBView:[], Sheets:[], CalcPr:{}, xmlns: "" };
+	data.match(/<[^>]*>/g).forEach(function(x) {
+		var y = parsexmltag(x);
+		switch(y[0]) {
+			case '<?xml': break;
+			case '<workbook': wb.xmlns = y.xmlns; break;
+			case '<fileVersion':
+				if(y.appName != "xl") throw "Unexpected workbook.appName: "+y.appName;
+				delete y[0]; wb.AppVersion = y; break;
+			case '<workbookPr': delete y[0]; wb.WBProps = y; break;
+			case '<bookViews>': case '</bookViews>': break; // aggregate workbookView
+			case '<workbookView': delete y[0]; wb.WBView.push(y); break;
+			case '<sheets>': case '</sheets>': break; // aggregate sheet
+			case '<sheet': delete y[0]; wb.Sheets.push(y); break; 
+			case '</ext>': case '</extLst>': case '</workbook>': break;
+			case '<extLst>': break; 
+			case '<calcPr': delete y[0]; wb.CalcPr = y; break;
+			
+			case '<mx:ArchID': break;
+			case '<ext': break;//TODO: check with different versions of excel
+			default: console.log(y);
+		}
+	});
+	if(wb.xmlns !== XMLNS_WB) throw "Unknown Namespace: " + wb.xmlns;
+	
+	var z;
+	for(z in WBPropsDef) if(null == wb.WBProps[z]) wb.WBProps[z] = WBPropsDef[z];
+	wb.WBView.forEach(function(w){for(var z in WBViewDef) if(null==w[z]) w[z]=WBViewDef[z]; });
+	for(z in CalcPrDef) if(null == wb.CalcPr[z]) wb.CalcPr[z] = CalcPrDef[z];
+	wb.Sheets.forEach(function(w){for(var z in SheetDef) if(null==w[z]) w[z]=SheetDef[z]; }); 
+	if(debug) wb.rawdata = data;
+	return wb;
+}
+
+function parseZip(zip) {
+	var entries = Object.keys(zip.files);
+	var keys = entries.filter(function(x){return x.substr(-1) != '/';}).sort();
+	var dir = parseCT((zip.files['[Content_Types].xml']||{}).data);
+	var wb = parseWB(zip.files[dir.workbooks[0].replace(/^\//,'')].data);
+	var props = parseProps(zip.files[dir.coreprops[0].replace(/^\//,'')].data + zip.files[dir.extprops[0].replace(/^\//,'')].data);
+	var deps = {};
+	if(dir.calcchain) deps=parseDeps(zip.files[dir.calcchain.replace(/^\//,'')].data);
+	if(dir.strs[0]) strs=parseStrs(zip.files[dir.strs[0].replace(/^\//,'')].data);
+	var sheets = {};
+	for(var i = 0; i != props.Worksheets; ++i) {
+		sheets[props.SheetNames[i]]=parseSheet(zip.files[dir.sheets[i].replace(/^\//,'')].data);
+	}
+	
+	return {
+		Directory: dir,
+		Workbook: wb,
+		Props: props,
+		Deps: deps,
+		Sheets: sheets,
+		SheetNames: props.SheetNames,
+		Strings: strs,
+		keys: keys,
+		files: zip.files
+	};
+}
+
+var fs, jszip;
+if(typeof JSZip !== "undefined") jszip = JSZip;
+if(typeof require !== "undefined") {
+	if(typeof jszip === 'undefined') jszip = require('./jszip').JSZip;
+	fs = require('fs');
+}
+
+function readSync(data, options) {
+	var zip, d = data;
+	var o = options||{};
+	switch((o.type||"base64")){
+		case "file": d = fs.readFileSync(data).toString('base64');
+			/* falls through */
+		case "base64": zip = new jszip(d, { base64:true }); break;
+		case "binary": zip = new jszip(d, { base64:false }); break;
+	}
+	return parseZip(zip);
+}
+
+function readFileSync(data, options) {
+	var o = options||{}; o.type = 'file';
+	return readSync(data, o);
+}
+
+this.read = readSync;
+this.readFile = readFileSync;
+this.parseZip = parseZip;
+return this;
+
+})();
+
+function encode_col(col) { var s=""; for(++col; col; col=Math.floor((col-1)/26)) s = String.fromCharCode(((col-1)%26) + 65) + s; return s; }
+function encode_row(row) { return "" + (row + 1); }
+function encode_cell(cell) { return encode_col(cell.c) + encode_row(cell.r); }
+
+function decode_col(c) { var d = 0, i = 0; for(; i !== c.length; ++i) d = 26*d + c.charCodeAt(i) - 64; return d - 1; }
+function decode_row(rowstr) { return Number(rowstr) - 1; }
+function split_cell(cstr) { return cstr.replace(/(\$?[A-Z]*)(\$?[0-9]*)/,"$1,$2").split(","); }
+function decode_cell(cstr) { var splt = split_cell(cstr); return { c:decode_col(splt[0]), r:decode_row(splt[1]) }; }
+function decode_range(range) { var x =range.split(":").map(decode_cell); return {s:x[0],e:x[x.length-1]}; }
+
+var utils = {
+	encode_col: encode_col,
+	encode_row: encode_row,
+	encode_cell: encode_cell,
+	decode_col: decode_col,
+	decode_row: decode_row,
+	split_cell: split_cell,
+	decode_cell: decode_cell,
+	decode_range: decode_range
+};
+
+// CommonJS wiring: publish the reader functions and the address helpers, and
+// dump the parsed sheets when this file is run directly as a script.
+if(typeof require !== 'undefined' && typeof exports !== 'undefined') {
+	exports.read = XLSX.read;
+	exports.readFile = XLSX.readFile;
+	exports.utils = utils;
+	exports.main = function(args) {
+		var workbook = XLSX.read(args[0], {type:'file'});
+		console.log(workbook.Sheets);
+	};
+	if(typeof module !== 'undefined' && require.main === module) {
+		exports.main(process.argv.slice(2));
+	}
+}