diff --git a/Makefile b/Makefile index fbda0e5..e92afb3 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,21 @@ -.PHONY: init test lint clean -test: init +DEPS=$(wildcard bits/*.js) +TARGET=cfb.js + +$(TARGET): $(DEPS) + cat $^ > $@ + +.PHONY: test mocha +test mocha: init mocha -R spec +.PHONY: lint lint: jshint cfb.js +.PHONY: init init: - if [ ! -e test_files ]; then git clone https://github.com/SheetJS/test_files; fi - cd test_files; make - + if [ ! -e test_files ]; then git clone https://github.com/SheetJS/test_files; cd test_files; make; fi + +.PHONY: clean clean: - rm -rf ./test_files/ + rm -rf $(TARGET) ./test_files/ diff --git a/README.md b/README.md index 7e91a73..293b6c9 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,11 @@ In node: For example, to get the Workbook content from an XLS file: var cfb = CFB.read(filename, {type: 'file'}); - var has_vba = cfb.Directory['Workbook'] + var workbook = cfb.find('Workbook') -## API +# API + +TypeScript definitions are maintained in `misc/cfb.d.ts`. The CFB object exposes the following methods and properties: @@ -67,18 +69,10 @@ It has the following properties and methods: - `.raw` contains the raw header and sectors -- `.Paths` is an array of the names of all of the streams (files) and storages - (directories) in the container. There is no disambiguation in the case of - streams with the same name. - -- `.Directory` is an object whose keys are entries in `.Paths` and whose values - are objects with metadata and content. Since collisions are not properly - handled here, `.FullPathDir` is the better option for new projects. - ## Entry Object Description -The entry objects are available from `FullPathDir`, `FileIndex`, and `Directory` -elements of the container object. +The entry objects are available from `FullPathDir` and `FileIndex` elements of the +container object. 
- `.name` is the (case sensitive) internal name - `.type` is the type (`stream` for files, `storage` for dirs, `root` for root) diff --git a/bits/00_header.js b/bits/00_header.js new file mode 100644 index 0000000..442b5bd --- /dev/null +++ b/bits/00_header.js @@ -0,0 +1,4 @@ +/* cfb.js (C) 2013 SheetJS -- http://sheetjs.com */ +/* vim: set ts=2: */ +/*jshint eqnull:true */ + diff --git a/bits/08_blob.js b/bits/08_blob.js new file mode 100644 index 0000000..70b28c5 --- /dev/null +++ b/bits/08_blob.js @@ -0,0 +1,132 @@ +/** Helper Functions */ +function readIEEE754(buf, idx, isLE, nl, ml) { + if(isLE === undefined) isLE = true; + if(!nl) nl = 8; + if(!ml && nl === 8) ml = 52; + var e, m, el = nl * 8 - ml - 1, eMax = (1 << el) - 1, eBias = eMax >> 1; + var bits = -7, d = isLE ? -1 : 1, i = isLE ? (nl - 1) : 0, s = buf[idx + i]; + + i += d; + e = s & ((1 << (-bits)) - 1); s >>>= (-bits); bits += el; + for (; bits > 0; e = e * 256 + buf[idx + i], i += d, bits -= 8); + m = e & ((1 << (-bits)) - 1); e >>>= (-bits); bits += ml; + for (; bits > 0; m = m * 256 + buf[idx + i], i += d, bits -= 8); + if (e === eMax) return m ? NaN : ((s ? -1 : 1) * Infinity); + else if (e === 0) e = 1 - eBias; + else { m = m + Math.pow(2, ml); e = e - eBias; } + return (s ? 
-1 : 1) * m * Math.pow(2, e - ml); +} + +var Base64 = (function(){ + var map = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; + return { + encode: function(input, utf8) { + var o = ""; + var c1, c2, c3, e1, e2, e3, e4; + for(var i = 0; i < input.length; ) { + c1 = input.charCodeAt(i++); + c2 = input.charCodeAt(i++); + c3 = input.charCodeAt(i++); + e1 = c1 >> 2; + e2 = (c1 & 3) << 4 | c2 >> 4; + e3 = (c2 & 15) << 2 | c3 >> 6; + e4 = c3 & 63; + if (isNaN(c2)) { e3 = e4 = 64; } + else if (isNaN(c3)) { e4 = 64; } + o += map.charAt(e1) + map.charAt(e2) + map.charAt(e3) + map.charAt(e4); + } + return o; + }, + decode: function(input, utf8) { + var o = ""; + var c1, c2, c3; + var e1, e2, e3, e4; + input = input.replace(/[^A-Za-z0-9\+\/\=]/g, ""); + for(var i = 0; i < input.length;) { + e1 = map.indexOf(input.charAt(i++)); + e2 = map.indexOf(input.charAt(i++)); + e3 = map.indexOf(input.charAt(i++)); + e4 = map.indexOf(input.charAt(i++)); + c1 = e1 << 2 | e2 >> 4; + c2 = (e2 & 15) << 4 | e3 >> 2; + c3 = (e3 & 3) << 6 | e4; + o += String.fromCharCode(c1); + if (e3 != 64) { o += String.fromCharCode(c2); } + if (e4 != 64) { o += String.fromCharCode(c3); } + } + return o; + } + }; +})(); + +function s2a(s) { + if(typeof Buffer !== 'undefined') return new Buffer(s, "binary"); + var w = s.split("").map(function(x){return x.charCodeAt(0);}); + return w; +} + +if(typeof Buffer !== "undefined") { + Buffer.prototype.hexlify= function() { return this.toString('hex'); }; + Buffer.prototype.utf16le= function(s,e){return this.toString('utf16le',s,e).replace(/\u0000/,'').replace(/[\u0001-\u0006]/,'!');}; + Buffer.prototype.utf8 = function(s,e) { return this.toString('utf8',s,e); }; +} + +Array.prototype.readUInt8 = function(idx) { return this[idx]; }; +Array.prototype.readUInt16LE = function(idx) { return this[idx+1]*(1<<8)+this[idx]; }; +Array.prototype.readInt16LE = function(idx) { var u = this.readUInt16LE(idx); if(!(u & 0x8000)) return u; return (0xffff - u + 1) 
* -1; }; +Array.prototype.readUInt32LE = function(idx) { return this[idx+3]*(1<<24)+this[idx+2]*(1<<16)+this[idx+1]*(1<<8)+this[idx]; }; +Array.prototype.readDoubleLE = function(idx) { return readIEEE754(this, idx||0);}; + +Array.prototype.hexlify = function() { return this.map(function(x){return (x<16?"0":"") + x.toString(16);}).join(""); }; + +Array.prototype.utf16le = function(s,e) { var str = ""; for(var i=s; i=MAXREGSECT) break; + fat_addrs[j++] = q; +} + + diff --git a/bits/43_readfat.js b/bits/43_readfat.js new file mode 100644 index 0000000..c68075f --- /dev/null +++ b/bits/43_readfat.js @@ -0,0 +1,52 @@ +/** Break the file up into sectors */ +var nsectors = Math.ceil((file.length - ssz)/ssz); +var sectors = []; +for(var i=1; i != nsectors; ++i) sectors[i-1] = file.slice(i*ssz,(i+1)*ssz); +sectors[nsectors-1] = file.slice(nsectors*ssz); + +/** Chase down the rest of the DIFAT chain to build a comprehensive list + DIFAT chains by storing the next sector number as the last 32 bits */ +function sleuth_fat(idx, cnt) { + if(idx === ENDOFCHAIN) { + if(cnt !== 0) throw "DIFAT chain shorter than expected"; + return; + } + if(idx !== FREESECT) { + var sector = sectors[idx]; + for(var i = 0; i != ssz/4-1; ++i) { + if((q = sector.readUInt32LE(i*4)) === ENDOFCHAIN) break; + fat_addrs.push(q); + } + sleuth_fat(sector.readUInt32LE(ssz-4),cnt - 1); + } +} +sleuth_fat(difat_start, ndfs); + +/** DONT CAT THE FAT! 
Just calculate where we need to go */ +function get_buffer(byte_addr, bytes) { + var addr = fat_addrs[Math.floor(byte_addr*4/ssz)]; + if(ssz - (byte_addr*4 % ssz) < (bytes || 0)) + throw "FAT boundary crossed: " + byte_addr + " "+bytes+" "+ssz; + return sectors[addr].slice((byte_addr*4 % ssz)); +} + +function get_buffer_u32(byte_addr) { + return get_buffer(byte_addr,4).readUInt32LE(0); +} + +function get_next_sector(idx) { return get_buffer_u32(idx); } + +/** Chains */ +var chkd = new Array(sectors.length), sector_list = []; +var get_sector = function get_sector(k) { return sectors[k]; }; +for(i=0; i != sectors.length; ++i) { + var buf = [], k = (i + dir_start) % sectors.length; + if(chkd[k]) continue; + for(j=k; j<=MAXREGSECT; buf.push(j),j=get_next_sector(j)) chkd[j] = true; + sector_list[k] = {nodes: buf}; + sector_list[k].data = Array(buf.map(get_sector)).toBuffer(); +} +sector_list[dir_start].name = "!Directory"; +if(nmfs > 0 && minifat_start !== ENDOFCHAIN) sector_list[minifat_start].name = "!MiniFAT"; +sector_list[fat_addrs[0]].name = "!FAT"; + diff --git a/bits/44_readdir.js b/bits/44_readdir.js new file mode 100644 index 0000000..9777dd2 --- /dev/null +++ b/bits/44_readdir.js @@ -0,0 +1,59 @@ +/* [MS-CFB] 2.6.1 Compound File Directory Entry */ +var files = {}, Paths = [], FileIndex = [], FullPaths = [], FullPathDir = {}; +function read_directory(idx) { + var blob, read, w; + var sector = sector_list[idx].data; + for(var i = 0; i != sector.length; i+= 128) { + blob = sector.slice(i, i+128); + prep_blob(blob, 64); + read = ReadShift.bind(blob); + var namelen = read(2); + if(namelen === 0) return; + var name = blob.utf16le(0,namelen-(Paths.length?2:0)); // OLE + Paths.push(name); + var o = { name: name }; + o.type = EntryTypes[read(1)]; + o.color = read(1); + o.left = read(4); if(o.left === NOSTREAM) delete o.left; + o.right = read(4); if(o.right === NOSTREAM) delete o.right; + o.child = read(4); if(o.child === NOSTREAM) delete o.child; + o.clsid = read(16); 
+ o.state = read(4); + var ctime = read(8); if(ctime != "0000000000000000") o.ctime = ctime; + var mtime = read(8); if(mtime != "0000000000000000") o.mtime = mtime; + o.start = read(4); + o.size = read(4); + if(o.type === 'root') { //root entry + minifat_store = o.start; + if(nmfs > 0 && minifat_store !== ENDOFCHAIN) sector_list[minifat_store].name = "!StreamData"; + minifat_size = o.size; + } else if(o.size >= ms_cutoff_size) { + o.storage = 'fat'; + if(!sector_list[o.start] && dir_start > 0) o.start = (o.start + dir_start) % sectors.length; + sector_list[o.start].name = o.name; + o.content = sector_list[o.start].data.slice(0,o.size); + prep_blob(o.content); + } else { + o.storage = 'minifat'; + w = o.start * mssz; + if(minifat_store !== ENDOFCHAIN && o.start !== ENDOFCHAIN) { + o.content = sector_list[minifat_store].data.slice(w,w+o.size); + prep_blob(o.content); + } + } + if(o.ctime) { + var ct = blob.slice(blob.l-24, blob.l-16); + var c2 = (ct.readUInt32LE(4)/1e7)*Math.pow(2,32)+ct.readUInt32LE(0)/1e7; + o.ct = new Date((c2 - 11644473600)*1000); + } + if(o.mtime) { + var mt = blob.slice(blob.l-16, blob.l-8); + var m2 = (mt.readUInt32LE(4)/1e7)*Math.pow(2,32)+mt.readUInt32LE(0)/1e7; + o.mt = new Date((m2 - 11644473600)*1000); + } + files[name] = o; + FileIndex.push(o); + } +} +read_directory(dir_start); + diff --git a/bits/45_rbtree.js b/bits/45_rbtree.js new file mode 100644 index 0000000..068297c --- /dev/null +++ b/bits/45_rbtree.js @@ -0,0 +1,32 @@ +/* [MS-CFB] 2.6.4 Red-Black Tree */ +function build_full_paths(Dir, pathobj, paths, patharr) { + var i; + var dad = new Array(patharr.length); + + var q = new Array(patharr.length); + + for(i=0; i != dad.length; ++i) { dad[i]=q[i]=i; paths[i]=patharr[i]; } + + for(i = q[0]; typeof i !== "undefined"; i = q.shift()) { + if(Dir[i].child) dad[Dir[i].child] = i; + if(Dir[i].left) { dad[Dir[i].left] = dad[i]; q.push(Dir[i].left); } + if(Dir[i].right) { dad[Dir[i].right] = dad[i]; q.push(Dir[i].right); } + } + + 
for(i=1; i !== paths.length; ++i) { + if(Dir[i].type === "unknown") continue; + var j = dad[i]; + if(j === 0) paths[i] = paths[0] + "/" + paths[i]; + else while(j !== 0) { + paths[i] = paths[j] + "/" + paths[i]; + j = dad[j]; + } + dad[i] = 0; + } + + paths[0] += "/"; + for(i=1; i !== paths.length; ++i) if(Dir[i].type !== 'stream') paths[i] += "/"; + for(i=0; i !== paths.length; ++i) pathobj[paths[i]] = FileIndex[i]; +} +build_full_paths(FileIndex, FullPathDir, FullPaths, Paths); + diff --git a/bits/46_findpath.js b/bits/46_findpath.js new file mode 100644 index 0000000..d712c1c --- /dev/null +++ b/bits/46_findpath.js @@ -0,0 +1,13 @@ +var root_name = Paths.shift(); +Paths.root = root_name; + +/* [MS-CFB] 2.6.4 (Unicode 3.0.1 case conversion) */ +function find_path(path) { + if(path[0] === "/") path = root_name + path; + var UCNames = (path.indexOf("/") !== -1 ? FullPaths : Paths).map(function(x) { return x.toUpperCase(); }); + var UCPath = path.toUpperCase(); + var w = UCNames.indexOf(UCPath); + if(w === -1) return null; + return path.indexOf("/") !== -1 ? 
FileIndex[w] : files[Paths[w]]; +} + diff --git a/bits/48_parsefooter.js b/bits/48_parsefooter.js new file mode 100644 index 0000000..1bc70cb --- /dev/null +++ b/bits/48_parsefooter.js @@ -0,0 +1,11 @@ +var rval = { + raw: {header: header, sectors: sectors}, + FileIndex: FileIndex, + FullPaths: FullPaths, + FullPathDir: FullPathDir, + find: find_path +}; + +return rval; +} // parse + diff --git a/bits/49_readutils.js b/bits/49_readutils.js new file mode 100644 index 0000000..fc4a359 --- /dev/null +++ b/bits/49_readutils.js @@ -0,0 +1,18 @@ + +function readFileSync(filename) { + var fs = require('fs'); + var file = fs.readFileSync(filename); + return parse(file); +} + +function readSync(blob, options) { + var o = options || {}; + switch((o.type || "base64")) { + case "file": return readFileSync(blob); + case "base64": blob = Base64.decode(blob); + /* falls through */ + case "binary": blob = s2a(blob); break; + } + return parse(blob); +} + diff --git a/bits/78_cfbexports.js b/bits/78_cfbexports.js new file mode 100644 index 0000000..c20d69f --- /dev/null +++ b/bits/78_cfbexports.js @@ -0,0 +1,3 @@ +this.read = readSync; +this.parse = parse; +return this; diff --git a/bits/79_cfbfooter.js b/bits/79_cfbfooter.js new file mode 100644 index 0000000..6d4e255 --- /dev/null +++ b/bits/79_cfbfooter.js @@ -0,0 +1,2 @@ +})(); + diff --git a/bits/80_consts.js b/bits/80_consts.js new file mode 100644 index 0000000..45eb609 --- /dev/null +++ b/bits/80_consts.js @@ -0,0 +1,18 @@ +/** CFB Constants */ +{ + /* 2.1 Compound File Sector Numbers and Types */ + var MAXREGSECT = 0xFFFFFFFA; + var DIFSECT = 0xFFFFFFFC; + var FATSECT = 0xFFFFFFFD; + var ENDOFCHAIN = 0xFFFFFFFE; + var FREESECT = 0xFFFFFFFF; + /* 2.2 Compound File Header */ + var HEADER_SIGNATURE = 'd0cf11e0a1b11ae1'; + var HEADER_MINOR_VERSION = '3e00'; + var MAXREGSID = 0xFFFFFFFA; + var NOSTREAM = 0xFFFFFFFF; + var HEADER_CLSID = '00000000000000000000000000000000'; + /* 2.6.1 Compound File Directory Entry */ + var 
EntryTypes = ['unknown','storage','stream','lockbytes','property','root']; +} + diff --git a/bits/90_utils.js b/bits/90_utils.js new file mode 100644 index 0000000..c4c9d62 --- /dev/null +++ b/bits/90_utils.js @@ -0,0 +1,8 @@ +var CFB_utils = { + ReadShift: ReadShift, + WarnField: WarnField, + CheckField: CheckField, + prep_blob: prep_blob, + bconcat: bconcat +}; + diff --git a/bits/98_exports.js b/bits/98_exports.js new file mode 100644 index 0000000..0cbb33d --- /dev/null +++ b/bits/98_exports.js @@ -0,0 +1,21 @@ +if(typeof require !== 'undefined' && typeof exports !== 'undefined') { + Array.prototype.toBuffer = function() { + return Buffer.concat(this[0]); + }; + var fs = require('fs'); + exports.read = CFB.read; + exports.parse = CFB.parse; + exports.utils = CFB_utils; + exports.main = function(args) { + var cfb = CFB.read(args[0], {type:'file'}); + console.log(cfb); + }; + if(typeof module !== 'undefined' && require.main === module) + exports.main(process.argv.slice(2)); +} else { + Array.prototype.toBuffer = function() { + var x = []; + for(var i = 0; i != this[0].length; ++i) { x = x.concat(this[0][i]); } + return x; + }; +} diff --git a/bits/99_footer.js b/bits/99_footer.js new file mode 100644 index 0000000..e69de29 diff --git a/cfb.js b/cfb.js index 380758c..4e0a42f 100644 --- a/cfb.js +++ b/cfb.js @@ -1,3 +1,4 @@ +/* cfb.js (C) 2013 SheetJS -- http://sheetjs.com */ /* vim: set ts=2: */ /*jshint eqnull:true */ @@ -397,11 +398,9 @@ function find_path(path) { var rval = { raw: {header: header, sectors: sectors}, - Paths: Paths, FileIndex: FileIndex, FullPaths: FullPaths, FullPathDir: FullPathDir, - Directory: files, find: find_path }; diff --git a/misc/cfb.d.ts b/misc/cfb.d.ts new file mode 100644 index 0000000..3a46614 --- /dev/null +++ b/misc/cfb.d.ts @@ -0,0 +1,104 @@ +declare enum CFBEntryType { unknown, storage, stream, lockbytes, property, root } +declare enum CFBStorageType { fat, minifat } + +/* CFB Entry Object demanded by write functions */ 
+interface CFBEntryMin { + + /* Raw Content (Buffer when available, Array of bytes otherwise) */ + content:any; +} + +/* CFB Entry Object returned by parse functions */ +interface CFBEntry extends CFBEntryMin { + + /* Case-sensitive internal name */ + name:string; + + /* CFB type (salient types: stream, storage) -- see CFBEntryType */ + type:string; + + /* Creation Time */ + ct:Date; + /* Modification Time */ + mt:Date; + + + /* Raw modification time -- see [MS-DTYP] 2.3.3 FILETIME */ + mtime:string; + /* Raw creation time -- see [MS-DTYP] 2.3.3 FILETIME */ + ctime:string; + + /* RBT color: 0 = red, 1 = black */ + color:number; + + /* Class ID represented as hex string */ + clsid:string; + + /* User-Defined State Bits */ + state:number; + + /* Starting Sector */ + start:number; + + /* Data Size */ + size:number; + + /* Storage location -- see CFBStorageType */ + storage:string; +} + + +/* cfb.FullPathDir as demanded by write functions */ +interface CFBDirectoryMin { + + /* keys are unix-style paths */ + [key:string]: CFBEntryMin; +} + +/* cfb.FullPathDir Directory object */ +interface CFBDirectory extends CFBDirectoryMin { + + /* cfb.FullPathDir keys are paths (cfb.Directory is no longer exposed) */ + [key:string]: CFBEntry; +} + + +/* cfb object demanded by write functions */ +interface CFBContainerMin { + + /* Path -> CFB object mapping */ + FullPathDir:CFBDirectoryMin; +} + +/* cfb object returned by read and parse functions */ +interface CFBContainer extends CFBContainerMin { + + /* search by path or file name */ + find(string):CFBEntry; + + /* list of streams and storages */ + FullPaths:string[]; + + /* Path -> CFB object mapping */ + FullPathDir:CFBDirectory; + + /* Array of entries in the same order as FullPaths */ + FileIndex:CFBEntry[]; + + /* Raw Content, in chunks (Buffer when available, Array of bytes otherwise) */ + raw:any[]; +} + + +interface CFB { + read(f:any, options:any):CFBContainer; + parse(f:any):CFBContainer; + utils: { + 
ReadShift(size:any,t?:any):any; + WarnField(hexstr:string,fld?:string); + CheckField(hexstr:string,fld?:string); + prep_blob(blob:any, pos?:number):any; + bconcat(bufs:any[]):any; + }; + main; +} diff --git a/misc/cfb_.ts b/misc/cfb_.ts new file mode 100755 index 0000000..9a0c7ee --- /dev/null +++ b/misc/cfb_.ts @@ -0,0 +1,35 @@ +/// +/// + +/* vim: set ts=2: */ + +var CFB = require('../cfb'); +var fs = require('fs'), program = require('commander'); +program + .version('0.7.0') + .usage('[options] ') + .option('-q, --quiet', 'print but do not extract') + .parse(process.argv); + +if(program.args.length === 0 || !fs.existsSync(program.args[0])) { + console.error("Usage: " + process.argv[1] + " [-q] "); + process.exit(1); +} + +var cfb = CFB.read(program.args[0], {type:'file'}); +if(program.quiet) { + console.log("Full Paths:") + console.log(cfb.FullPaths.map(function(x) { return " " + x; }).join("\n")); + console.log("Full Path Directory:") + console.log(cfb.FullPathDir); + return; +} +for(var i=0; i != cfb.FullPaths.length; ++i) { + if(cfb.FullPaths[i].slice(-1) === "/") { + console.error("mkdir " + cfb.FullPaths[i]); + fs.mkdirSync(cfb.FullPaths[i]); + } else { + console.error("writing " + cfb.FullPaths[i]); + fs.writeFileSync(cfb.FullPaths[i], cfb.FileIndex[i].content); + } +} diff --git a/misc/node.d.ts b/misc/node.d.ts new file mode 100644 index 0000000..18156c5 --- /dev/null +++ b/misc/node.d.ts @@ -0,0 +1,8 @@ +declare var require: { + (id: string): any; +} + +declare var process: { + argv: string[]; + exit(status: number): void; +} diff --git a/package.json b/package.json index c1dca74..3fe11c1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "cfb", - "version": "0.7.0", + "version": "0.8.0", "author": "SheetJS", "description": "Compound File Binary File Format extractor", "keywords": [ "cfb", "compression", "office" ],