diff --git a/.jshintrc b/.jshintrc new file mode 100644 index 0000000..5cd205b --- /dev/null +++ b/.jshintrc @@ -0,0 +1,4 @@ +{ + "bitwise": false, + "curly": false +} diff --git a/APACHE.LICENSE b/LICENSE similarity index 100% rename from APACHE.LICENSE rename to LICENSE diff --git a/README.md b/README.md index 9e40eb5..7e91a73 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This is a Pure-JS implementation of MS-CFB: Compound File Binary File Format, a format used in many Microsoft file types (such as XLS, DOC, and other Microsoft Office file types). -# Installation and Usage +# Utility Installation and Usage The package is available on NPM: @@ -18,6 +18,79 @@ files that line up with the tree-based structure of the storage. Metadata such as the red-black tree are discarded (and in the future, new CFB containers will exclusively use black nodes) +# Library Installation and Usage + +In the browser: + + <script src="cfb.js" type="text/javascript"></script> + +In node: + + var CFB = require('cfb'); + +For example, to get the Workbook content from an XLS file: + + var cfb = CFB.read(filename, {type: 'file'}); + var has_vba = cfb.Directory['Workbook'] + +## API + +The CFB object exposes the following methods and properties: + +`CFB.parse(blob)` takes a nodejs Buffer or an array of bytes and returns a +parsed representation of the data. + +`CFB.read(blob, options)` wraps `parse`. `options.type` controls the behavior: + +- `file`: `blob` should be a file name +- `base64`: `blob` should be a base64 string +- `binary`: `blob` should be a binary string + +## Container Object Description + +The object returned by `parse` and `read` can be found in the source (`rval`). +It has the following properties and methods: + +- `.find(path)` performs a case-insensitive match for the path (or file name, if + there are no slashes) and returns an entry object (described later) or null if + not found + +- `.FullPaths` is an array of the names of all of the streams (files) and + storages (directories) in the container. 
The paths are properly prefixed from + the root entry (so the entries are unique) + +- `.FullPathDir` is an object whose keys are entries in `.FullPaths` and whose + values are objects with metadata and content (described below) + +- `.FileIndex` is an array of the objects from `.FullPathDir`, in the same order + as `.FullPaths`. + +- `.raw` contains the raw header and sectors + +- `.Paths` is an array of the names of all of the streams (files) and storages + (directories) in the container. There is no disambiguation in the case of + streams with the same name. + +- `.Directory` is an object whose keys are entries in `.Paths` and whose values + are objects with metadata and content. Since collisions are not properly + handled here, `.FullPathDir` is the better option for new projects. + +## Entry Object Description + +The entry objects are available from `FullPathDir`, `FileIndex`, and `Directory` +elements of the container object. + +- `.name` is the (case-sensitive) internal name +- `.type` is the type (`stream` for files, `storage` for dirs, `root` for root) +- `.content` is a Buffer/Array with the raw content +- `.ct`/`.mt` are the creation and modification times (if provided in file) + +# Notes + +Case comparison has not been verified for non-ASCII characters. + +Writing is not supported. It is in the works, but it has not yet been released. + # License This implementation is covered under Apache 2.0 license. 
It complies with the diff --git a/cfb.js b/cfb.js index 3f27145..4ae1cea 100644 --- a/cfb.js +++ b/cfb.js @@ -355,6 +355,7 @@ function read_directory(idx) { } read_directory(dir_start); +/* [MS-CFB] 2.6.4 Red-Black Tree */ function build_full_paths(Dir, pathobj, paths, patharr) { var i; var dad = new Array(patharr.length); @@ -389,13 +390,24 @@ build_full_paths(FileIndex, FullPathDir, FullPaths, Paths); var root_name = Paths.shift(); Paths.root = root_name; +/* [MS-CFB] 2.6.4 (Unicode 3.0.1 case conversion) */ +function find_path(path) { /* case-insensitive lookup of path; returns the matching entry object, or null when nothing matches */ + if(path[0] === "/") path = root_name + path; /* a leading slash marks an absolute path: prefix root_name, the first element shifted off Paths */ + var UCNames = (path.indexOf("/") !== -1 ? FullPaths : Paths).map(function(x) { return x.toUpperCase(); }); /* paths containing a slash are matched against FullPaths, bare names against Paths */ + var UCPath = path.toUpperCase(); /* NOTE(review): toUpperCase stands in for the case conversion referenced above; unverified for non-ASCII names */ + var w = UCNames.indexOf(UCPath); + if(w === -1) return null; + return path.indexOf("/") !== -1 ? FileIndex[w] : files[Paths[w]]; /* a FullPaths index selects directly from FileIndex; a Paths index is resolved by name through files */ +} + var rval = { raw: {header: header, sectors: sectors}, Paths: Paths, FileIndex: FileIndex, FullPaths: FullPaths, FullPathDir: FullPathDir, - Directory: files + Directory: files, + find: find_path }; return rval; diff --git a/package.json b/package.json index bb1a192..ebec1b6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "cfb", - "version": "0.5.0", + "version": "0.6.0", "author": "Niggler", "description": "Compound File Binary File Format extractor", "keywords": [ "cfb", "compression", "office" ], diff --git a/test.js b/test.js index 8500605..d432b26 100644 --- a/test.js +++ b/test.js @@ -1,11 +1,26 @@ +/* vim: set ts=2: */ var CFB; var fs = require('fs'); describe('source', function() { it('should load', function() { CFB = require('./'); }); }); + var files = fs.readdirSync('test_files').filter(function(x){return x.substr(-4)==".xls";}); -files.forEach(function(x) { - describe(x, function() { + +function parsetest(x, cfb) { /* declares a suite checking that cfb.find locates Workbook or Book by relative and by absolute path; x is the file name used in labels and error messages */ + describe(x + ' should have basic parts', function() { + it('should find relative path', function() { + if(!cfb.find('Workbook') && !cfb.find('Book')) throw new 
Error("Cannot find workbook for " + x); + }); + it('should find absolute path', function() { + if(!cfb.find('/Workbook') && !cfb.find('/Book')) throw new Error("Cannot find workbook for " + x); + }); + }); +} + +describe('should parse test files', function() { + files.forEach(function(x) { it('should parse ' + x, function() { var cfb = CFB.read('./test_files/' + x, {type: "file"}); + parsetest(x, cfb); /* NOTE(review): this invokes describe() and it() from inside an executing it(); confirm the test runner picks up cases registered this late */ }); }); });