commit 99b049b362abd2b295f3c479e9096c69d5e70a56 Author: SheetJS Date: Thu Sep 5 11:55:36 2013 -0700 Initial commit diff --git a/APACHE.LICENSE b/APACHE.LICENSE new file mode 100644 index 0000000..b74794c --- /dev/null +++ b/APACHE.LICENSE @@ -0,0 +1,13 @@ +Copyright 2013 Niggler + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..9e40eb5 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# Compound File Binary Format + +This is a Pure-JS implementation of MS-CFB: Compound File Binary File Format, a +format used in many Microsoft file types (such as XLS, DOC, and other Microsoft +Office file types). + +# Installation and Usage + +The package is available on NPM: + +``` +$ npm install -g cfb +$ cfb path/to/CFB/file +``` + +The command will extract the storages and streams in the container, generating +files that line up with the tree-based structure of the storage. Metadata +such as the red-black tree are discarded (and in the future, new CFB containers +will exclusively use black nodes) + +# License + +This implementation is covered under Apache 2.0 license. 
It complies with the +[Open Specifications Promise](http://www.microsoft.com/openspecifications/) + diff --git a/bin/cfb b/bin/cfb new file mode 100755 index 0000000..9fe2615 --- /dev/null +++ b/bin/cfb @@ -0,0 +1,45 @@ +#!/usr/bin/env node + +var CFB = require('../cfb'); +var args = process.argv.slice(2); +var cfb = CFB.read(args[0], {type:'file'}); +cfb.Paths.unshift(cfb.Paths.root); + +var dir = cfb.Directory; + +var dad = new Array(cfb.Paths.length); +var paths = new Array(cfb.Paths.length); + +var q = new Array(paths.length); + +for(var i=0; i != dad.length; ++i) { dad[i]=q[i]=i; paths[i]=cfb.Paths[i]; } + +for(var i = q[0]; q.length != 0; i = q.shift()) { + if(dir[paths[i]].child) dad[dir[paths[i]].child] = i; + if(dir[paths[i]].left) { dad[dir[paths[i]].left] = dad[i]; q.push(dir[paths[i]].left); } + if(dir[paths[i]].right) { dad[dir[paths[i]].right] = dad[i]; q.push(dir[paths[i]].right); } +} + +for(var i=1; i != paths.length; ++i) { + var j = dad[i]; + if(j === 0) paths[i] = paths[0] + "/" + paths[i]; + else while(j != 0) { + paths[i] = paths[j] + "/" + paths[i]; + j = dad[j]; + } + dad[i] = 0; +} + +paths[0] += "/"; +for(var i=1; i != paths.length; ++i) if(dir[cfb.Paths[i]].type != 'stream') paths[i] += "/"; + +var fs = require('fs'); +for(var i=0; i != paths.length; ++i) { + if(paths[i].slice(-1) === "/") { + console.error("mkdir " + paths[i]); + fs.mkdirSync(paths[i]); + } else { + console.error("writing " + paths[i]); + fs.writeFile(paths[i], dir[cfb.Paths[i]].content); + } +} diff --git a/cfb.js b/cfb.js new file mode 100644 index 0000000..e4d058d --- /dev/null +++ b/cfb.js @@ -0,0 +1,397 @@ +/* vim: set ts=2: */ +/*jshint eqnull:true */ + +function readIEEE754(buf, idx, isLE, nl, ml) { + if(isLE === undefined) isLE = true; + if(!nl) nl = 8; + if(!ml && nl === 8) ml = 52; + var e, m, el = nl * 8 - ml - 1, eMax = (1 << el) - 1, eBias = eMax >> 1; + var bits = -7, d = isLE ? -1 : 1, i = isLE ? 
(nl - 1) : 0, s = buf[idx + i]; + + i += d; + e = s & ((1 << (-bits)) - 1); s >>>= (-bits); bits += el; + for (; bits > 0; e = e * 256 + buf[idx + i], i += d, bits -= 8); + m = e & ((1 << (-bits)) - 1); e >>>= (-bits); bits += ml; + for (; bits > 0; m = m * 256 + buf[idx + i], i += d, bits -= 8); + if (e === eMax) return m ? NaN : ((s ? -1 : 1) * Infinity); + else if (e === 0) e = 1 - eBias; + else { m = m + Math.pow(2, ml); e = e - eBias; } + return (s ? -1 : 1) * m * Math.pow(2, e - ml); +} + +var Base64 = (function(){ + var map = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; + return { + encode: function(input, utf8) { + var o = ""; + var c1, c2, c3, e1, e2, e3, e4; + for(var i = 0; i < input.length; ) { + c1 = input.charCodeAt(i++); + c2 = input.charCodeAt(i++); + c3 = input.charCodeAt(i++); + e1 = c1 >> 2; + e2 = (c1 & 3) << 4 | c2 >> 4; + e3 = (c2 & 15) << 2 | c3 >> 6; + e4 = c3 & 63; + if (isNaN(c2)) { e3 = e4 = 64; } + else if (isNaN(c3)) { e4 = 64; } + o += map.charAt(e1) + map.charAt(e2) + map.charAt(e3) + map.charAt(e4); + } + return o; + }, + decode: function(input, utf8) { + var o = ""; + var c1, c2, c3; + var e1, e2, e3, e4; + input = input.replace(/[^A-Za-z0-9\+\/\=]/g, ""); + for(var i = 0; i < input.length;) { + e1 = map.indexOf(input.charAt(i++)); + e2 = map.indexOf(input.charAt(i++)); + e3 = map.indexOf(input.charAt(i++)); + e4 = map.indexOf(input.charAt(i++)); + c1 = e1 << 2 | e2 >> 4; + c2 = (e2 & 15) << 4 | e3 >> 2; + c3 = (e3 & 3) << 6 | e4; + o += String.fromCharCode(c1); + if (e3 != 64) { o += String.fromCharCode(c2); } + if (e4 != 64) { o += String.fromCharCode(c3); } + } + return o; + } + }; +})(); + +function s2a(s) { + if(typeof Buffer !== 'undefined') return new Buffer(s, "binary"); + var w = s.split("").map(function(x){return x.charCodeAt(0);}); + return w; +} + +if(typeof Buffer !== "undefined") { + Buffer.prototype.hexlify= function() { return this.toString('hex'); }; + Buffer.prototype.utf16le= 
function(s,e){return this.toString('utf16le',s,e).replace(/\u0000/,'').replace(/[\u0001-\u0006]/,'!');}; + Buffer.prototype.utf8 = function(s,e) { return this.toString('utf8',s,e); }; +} + +Array.prototype.readUInt8 = function(idx) { return this[idx]; }; +Array.prototype.readUInt16LE = function(idx) { return this[idx+1]*(1<<8)+this[idx]; }; +Array.prototype.readInt16LE = function(idx) { var u = this.readUInt16LE(idx); if(!(u & 0x8000)) return u; return (0xffff - u + 1) * -1; }; +Array.prototype.readUInt32LE = function(idx) { return this[idx+3]*(1<<24)+this[idx+2]*(1<<16)+this[idx+1]*(1<<8)+this[idx]; }; +Array.prototype.readDoubleLE = function(idx) { return readIEEE754(this, idx||0);}; + +Array.prototype.hexlify = function() { return this.map(function(x){return (x<16?"0":"") + x.toString(16);}).join(""); }; + +Array.prototype.utf16le = function(s,e) { var str = ""; for(var i=s; i=MAXREGSECT) break; + fat_addrs[j++] = q; +} + + +/** Break the file up into sectors */ +if(file.length%ssz!==0) throw "File Length: Expected multiple of "+ssz; + +var nsectors = (file.length - ssz)/ssz; +var sectors = []; +for(var i=1; i != nsectors + 1; ++i) sectors[i-1] = file.slice(i*ssz,(i+1)*ssz); + +/** Chase down the rest of the DIFAT chain to build a comprehensive list + DIFAT chains by storing the next sector number as the last 32 bits */ +function sleuth_fat(idx, cnt) { + if(idx === ENDOFCHAIN) { + if(cnt !== 0) throw "DIFAT chain shorter than expected"; + return; + } + var sector = sectors[idx]; + for(var i = 0; i != ssz/4-1; ++i) { + if((q = sector.readUInt32LE(i*4)) === ENDOFCHAIN) break; + fat_addrs.push(q); + } + sleuth_fat(sector.readUInt32LE(ssz-4),cnt - 1); +} +sleuth_fat(difat_start, ndfs); + +/** DONT CAT THE FAT! 
Just calculate where we need to go */ +function get_buffer(byte_addr, bytes) { + var addr = fat_addrs[Math.floor(byte_addr*4/ssz)]; + if(ssz - (byte_addr*4 % ssz) < (bytes || 0)) + throw "FAT boundary crossed: " + byte_addr + " "+bytes+" "+ssz; + return sectors[addr].slice((byte_addr*4 % ssz)); +} + +function get_buffer_u32(byte_addr) { + return get_buffer(byte_addr,4).readUInt32LE(0); +} + +function get_next_sector(idx) { return get_buffer_u32(idx); } + +/** Chains */ +var chkd = new Array(sectors.length), sector_list = []; +var get_sector = function get_sector(k) { return sectors[k]; }; +for(i=0; i != sectors.length; ++i) { + var buf = []; + if(chkd[i]) continue; + for(j=i; j<=MAXREGSECT; buf.push(j),j=get_next_sector(j)) chkd[j] = true; + sector_list[i] = {nodes: buf}; + sector_list[i].data = Array(buf.map(get_sector)).toBuffer(); +} +sector_list[dir_start].name = "!Directory"; +if(nmfs > 0) sector_list[minifat_start].name = "!MiniFAT"; +sector_list[fat_addrs[0]].name = "!FAT"; + +/** read directory structure */ +var files = {}, Paths = []; +function read_directory(idx) { + var blob, read; + var sector = sector_list[idx].data; + for(var i = 0; i != sector.length; i+= 128, l = 64) { + blob = sector.slice(i, i+128); + prep_blob(blob, 64); + read = ReadShift.bind(blob); + var namelen = read(2); + if(namelen === 0) return; + var name = blob.utf16le(0,namelen-(Paths.length?2:0)); // OLE + Paths.push(name); + var o = { name: name }; + o.type = EntryTypes[read(1)]; + o.color = read(1); + o.left = read(4); if(o.left === NOSTREAM) delete o.left; + o.right = read(4); if(o.right === NOSTREAM) delete o.right; + o.child = read(4); if(o.child === NOSTREAM) delete o.child; + o.clsid = read(16); + o.state = read(4); + o.ctime = read(8); + o.mtime = read(8); + o.start = read(4); + o.size = read(4); + if(o.type === 'root') { //root entry + minifat_store = o.start; + if(nmfs > 0) sector_list[minifat_store].name = "!StreamData"; + minifat_size = o.size; + } else if(o.size >= 
ms_cutoff_size) { + o.storage = 'fat'; + sector_list[o.start].name = o.name; + o.content = sector_list[o.start].data.slice(0,o.size); + prep_blob(o.content); + } else { + o.storage = 'minifat'; + w = o.start * mssz; + o.content = sector_list[minifat_store].data.slice(w,w+o.size); + prep_blob(o.content); + } + files[name] = o; + } +} +read_directory(dir_start); + +var root_name = Paths.shift(); +Paths.root = root_name; + +var rval = { + raw: {header: header, sectors: sectors}, + Paths: Paths, + Directory: files +}; + +return rval; +} // parse + + +function readFileSync(filename) { + var fs = require('fs'); + var file = fs.readFileSync(filename); + return parse(file); +} + +function readSync(blob, options) { + var o = options || {}; + switch((o.type || "base64")) { + case "file": return readFileSync(blob); + case "base64": blob = Base64.decode(blob); + /* falls through */ + case "binary": blob = s2a(blob); break; + } + return parse(blob); +} + +this.read = readSync; +this.parse = parse; +return this; +})(); + +/** CFB Constants */ +{ + var MAXREGSECT = 0xFFFFFFFA; + var DIFSECT = 0xFFFFFFFC; + var FATSECT = 0xFFFFFFFD; + var ENDOFCHAIN = 0xFFFFFFFE; + var FREESECT = 0xFFFFFFFF; + var HEADER_SIGNATURE = 'd0cf11e0a1b11ae1'; + var HEADER_MINOR_VERSION = '3e00'; + var MAXREGSID = 0xFFFFFFFA; + var NOSTREAM = 0xFFFFFFFF; + var HEADER_CLSID = '00000000000000000000000000000000'; + + var EntryTypes = ['unknown','storage','stream',null,null,'root']; +} + +if(typeof require !== 'undefined' && typeof exports !== 'undefined') { + Array.prototype.toBuffer = function() { + return Buffer.concat(this[0]); + }; + var fs = require('fs'); + exports.read = CFB.read; + exports.parse = CFB.parse; + exports.main = function(args) { + var cfb = CFB.read(args[0], {type:'file'}); + console.log(cfb); + }; + if(typeof module !== 'undefined' && require.main === module) + exports.main(process.argv.slice(2)); +} else { + Array.prototype.toBuffer = function() { + var x = []; + for(var i = 0; i != 
this[0].length; ++i) { x = x.concat(this[0][i]); } + return x; + }; +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..064a237 --- /dev/null +++ b/package.json @@ -0,0 +1,14 @@ +{ + "name": "cfb", + "version": "0.0.1", + "author": "Niggler", + "description": "Compound File Binary File Format extractor", + "keywords": [ "cfb", "compression", "office" ], + "bin": { + "cfb": "./bin/cfb" + }, + "main": "./cfb", + "repository": { "type":"git", "url":"git://github.com/Niggler/js-cfb.git" }, + "bugs": { "url": "https://github.com/Niggler/js-cfb/issues" }, + "license": "Apache 2.0" +}