From 55924668a78ed27008bf53cb6626852b122434c9 Mon Sep 17 00:00:00 2001 From: SheetJS Date: Fri, 4 Feb 2022 00:29:01 -0500 Subject: [PATCH] NUMBERS primary cell storage parse docs clarified row and column props (fixes #2486) (fixes #2511) --- .spelling | 3 + README.md | 564 ++++++++++++--------------- bits/83_numbers.js | 118 +++++- docbits/00_intro.md | 31 +- docbits/10_install.md | 61 ++- docbits/12_optional.md | 55 --- docbits/13_usage.md | 94 +++++ docbits/15_phil.md | 39 -- docbits/{11_demos.md => 16_demos.md} | 0 docbits/40_interface.md | 5 + docbits/62_colrow.md | 92 +++-- docbits/85_filetype.md | 157 ++------ misc/docs/README.md | 485 +++++++++++------------ misc/docs/SUMMARY.md | 32 +- modules/83_numbers.js | 118 +++++- modules/src/cell.ts | 88 +++++ modules/src/numbers.ts | 28 +- modules/src/prebnccell.ts | 50 --- modules/src/util.ts | 9 + 19 files changed, 1089 insertions(+), 940 deletions(-) delete mode 100644 docbits/12_optional.md create mode 100644 docbits/13_usage.md delete mode 100644 docbits/15_phil.md rename docbits/{11_demos.md => 16_demos.md} (100%) create mode 100644 modules/src/cell.ts delete mode 100644 modules/src/prebnccell.ts diff --git a/.spelling b/.spelling index 36cae85..5721ffa 100644 --- a/.spelling +++ b/.spelling @@ -15,6 +15,7 @@ OData OpenDocument OpenFormula PivotTable +PivotTables Quattro SpreadsheetML Unhide @@ -96,12 +97,14 @@ encodings filesystem globals javascript +lifecycle metadata natively pre-built pre-generated prepend prepended +programmatically repo runtime serverless diff --git a/README.md b/README.md index 3f980f8..cbf21a1 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,13 @@ # [SheetJS](https://sheetjs.com) -Parser and writer for various spreadsheet formats. Pure-JS cleanroom -implementation from official specifications, related documents, and test files. 
-Emphasis on parsing and writing robustness, cross-format feature compatibility -with a unified JS representation, and ES3/ES5 browser compatibility back to IE6. +The SheetJS Community Edition offers battle-tested open-source solutions for +extracting useful data from almost any complex spreadsheet and generating new +spreadsheets that will work with legacy and modern software alike. -This is the community version. We also offer a pro version with performance -enhancements, additional features like styling, and dedicated support. - - -Community Translations of this README: - -- [Simplified Chinese](https://github.com/rockboom/SheetJS-docs-zh-CN) - - -[**Pro Version**](https://sheetjs.com/pro) - -[**Commercial Support**](https://sheetjs.com/support) - -[**Rendered Documentation**](https://docs.sheetjs.com/) - -[**In-Browser Demos**](https://sheetjs.com/demos) - -[**Source Code**](https://git.io/xlsx) - -[**Issues and Bug Reports**](https://github.com/sheetjs/sheetjs/issues) +[SheetJS Pro](https://sheetjs.com/pro) offers solutions beyond data processing: +Edit complex templates with ease; let out your inner Picasso with styling; make +custom sheets with images/graphs/PivotTables; evaluate formula expressions and +port calculations to web apps; automate common spreadsheet tasks, and much more! 
![License](https://img.shields.io/github/license/SheetJS/sheetjs) [![Build Status](https://img.shields.io/github/workflow/status/sheetjs/sheetjs/Tests:%20node.js)](https://github.com/SheetJS/sheetjs/actions) @@ -54,11 +37,11 @@ Community Translations of this README: -- [Installation](#installation) +- [Getting Started](#getting-started) + * [Installation](#installation) + * [Usage](#usage) + + [The Zen of SheetJS](#the-zen-of-sheetjs) * [JS Ecosystem Demos](#js-ecosystem-demos) - * [Optional Modules](#optional-modules) - * [ECMAScript 5 Compatibility](#ecmascript-5-compatibility) -- [Philosophy](#philosophy) - [Parsing Workbooks](#parsing-workbooks) * [Parsing Examples](#parsing-examples) * [Streaming Read](#streaming-read) @@ -89,8 +72,7 @@ Community Translations of this README: + [Miscellaneous Workbook Properties](#miscellaneous-workbook-properties) * [Document Features](#document-features) + [Formulae](#formulae) - + [Column Properties](#column-properties) - + [Row Properties](#row-properties) + + [Row and Column Properties](#row-and-column-properties) + [Number Formats](#number-formats) + [Hyperlinks](#hyperlinks) + [Cell Comments](#cell-comments) @@ -112,27 +94,6 @@ Community Translations of this README: * [HTML Output](#html-output) * [JSON](#json) - [File Formats](#file-formats) - * [Excel 2007+ XML (XLSX/XLSM)](#excel-2007-xml-xlsxxlsm) - * [Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)](#excel-20-95-biff2biff3biff4biff5) - * [Excel 97-2004 Binary (BIFF8)](#excel-97-2004-binary-biff8) - * [Excel 2003-2004 (SpreadsheetML)](#excel-2003-2004-spreadsheetml) - * [Excel 2007+ Binary (XLSB, BIFF12)](#excel-2007-binary-xlsb-biff12) - * [Delimiter-Separated Values (CSV/TXT)](#delimiter-separated-values-csvtxt) - * [Other Workbook Formats](#other-workbook-formats) - + [Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)](#lotus-1-2-3-wkswk1wk2wk3wk4123) - + [Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)](#quattro-pro-wq1wq2wb1wb2wb3qpw) - + [Works for DOS / Windows Spreadsheet 
(WKS/XLR)](#works-for-dos--windows-spreadsheet-wksxlr) - + [Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)](#numbers-30--iwork-2013-spreadsheet-numbers) - + [OpenDocument Spreadsheet (ODS/FODS)](#opendocument-spreadsheet-odsfods) - + [Uniform Office Spreadsheet (UOS1/2)](#uniform-office-spreadsheet-uos12) - * [Other Single-Worksheet Formats](#other-single-worksheet-formats) - + [dBASE and Visual FoxPro (DBF)](#dbase-and-visual-foxpro-dbf) - + [Symbolic Link (SYLK)](#symbolic-link-sylk) - + [Lotus Formatted Text (PRN)](#lotus-formatted-text-prn) - + [Data Interchange Format (DIF)](#data-interchange-format-dif) - + [HTML](#html) - + [Rich Text Format (RTF)](#rich-text-format-rtf) - + [Ethercalc Record Format (ETH)](#ethercalc-record-format-eth) - [Testing](#testing) * [Node](#node) * [Browser](#browser) @@ -149,7 +110,9 @@ Community Translations of this README: -## Installation +## Getting Started + +### Installation In the browser, just add a script tag: @@ -188,6 +151,157 @@ With [bower](https://bower.io/search/?q=js-xlsx): $ bower install js-xlsx ``` +
+ Optional features (click to show) + +The node version automatically requires modules for additional features. Some +of these modules are rather large in size and are only needed in special +circumstances, so they do not ship with the core. For browser use, they must +be included directly: + +```html + + +``` + +An appropriate version for each dependency is included in the `dist/` directory. + +The complete single-file version is generated at `dist/xlsx.full.min.js`. + +A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to the full build: +- codepage library skipped (no support for XLS encodings) +- XLSX compression option not currently available +- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 +- node stream utils removed + +Webpack and Browserify builds include optional modules by default. Webpack can +be configured to remove support with `resolve.alias`: + +```js + /* uncomment the lines below to remove support */ + resolve: { + alias: { "./dist/cpexcel.js": "" } // <-- omit international support + } +``` + +
+ +
+ ECMAScript 3 Compatibility (click to show) + +For broad compatibility with JavaScript engines, the library is written in the +ECMAScript 3 language dialect as well as some ES5 features like `Array#forEach`. +Older browsers require shims to provide missing functions. + +To use the shim, add the shim before the script tag that loads `xlsx.js`: + +```html + + + + +``` + +The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving +files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script +bundles the shim in a format suitable for Photoshop and other Adobe products. + +
+ +### Usage + +Most scenarios involving spreadsheets and data can be broken into 5 parts: + +1) **Acquire Data**: Data may be stored anywhere: local or remote files, + databases, HTML TABLE, or even generated programmatically in the web browser. + +2) **Extract Data**: For spreadsheet files, this involves parsing raw bytes to + read the cell data. For general JS data, this involves reshaping the data. + +3) **Process Data**: From generating summary statistics to cleaning data + records, this step is the heart of the problem. + +4) **Package Data**: This can involve making a new spreadsheet or serializing + with `JSON.stringify` or writing XML or simply flattening data for UI tools. + +5) **Release Data**: Spreadsheet files can be uploaded to a server or written + locally. Data can be presented to users in an HTML TABLE or data grid. + +A common problem involves generating a valid spreadsheet export from data stored +in an HTML table. In this example, an HTML TABLE on the page will be scraped, +a row will be added to the bottom with the date of the report, and a new file +will be generated and downloaded locally. `XLSX.writeFile` takes care of +packaging the data and attempting a local download: + +```js +// Acquire Data (reference to the HTML table) +var table_elt = document.getElementById("my-table-id"); + +// Extract Data (create a workbook object from the table) +var workbook = XLSX.utils.table_to_book(table_elt); + +// Process Data (add a new row) +var worksheet = workbook.Sheets["Sheet1"]; +XLSX.utils.sheet_add_aoa(worksheet, [["Created "+new Date().toISOString()]], {origin:-1}); + +// Package and Release Data (`writeFile` tries to write and save an XLSB file) +XLSX.writeFile(workbook, "Report.xlsb"); +``` + +This library tries to simplify steps 2 and 4 with functions to extract useful +data from spreadsheet files (`read` / `readFile`) and generate new spreadsheet +files from data (`write` / `writeFile`). 
+ +This documentation and various demo projects cover a number of common scenarios +and approaches for steps 1 and 5. + +Utility functions help with step 3. + + +#### The Zen of SheetJS + + +_File formats are implementation details_ + +The parser covers a wide gamut of common spreadsheet file formats to ensure that +"HTML-saved-as-XLS" files work as well as actual XLS or XLSX files. + +The writer supports a number of common output formats for broad compatibility +with the data ecosystem. + + +_Data processing should fit in any workflow_ + +The library does not impose a separate lifecycle. It fits nicely in websites +and apps built using any framework. The plain JS data objects play nice with +Web Workers and future APIs. + +["Parsing Workbooks"](#parsing-workbooks) describes solutions for common data +import scenarios involving actual spreadsheet files. + +["Writing Workbooks"](#writing-workbooks) describes solutions for common data +export scenarios involving actual spreadsheet files. + +["Utility Functions"](#utility-functions) details utility functions for +translating JSON Arrays and other common JS structures into worksheet objects. + + +_JavaScript is a powerful language for data processing_ + +The ["Common Spreadsheet Format"](#common-spreadsheet-format) is a simple object +representation of the core concepts of a workbook. The various functions in the +library provide low-level tools for working with the object. + +For friendly JS processing, there are utility functions for converting parts of +a worksheet to/from an Array of Arrays. 
For example, summing columns from an +array of arrays can be implemented in a single Array reduce operation: + +```js +var aoa = XLSX.utils.sheet_to_json(worksheet, {header: 1}); +var sum_of_column_B = aoa.reduce((acc, row) => acc + (+row[1]||0), 0); +``` + + ### JS Ecosystem Demos The [`demos` directory](demos/) includes sample projects for: @@ -228,100 +342,6 @@ The [`demos` directory](demos/) includes sample projects for: Other examples are included in the [showcase](demos/showcase/). -### Optional Modules - -
- Optional features (click to show) - -The node version automatically requires modules for additional features. Some -of these modules are rather large in size and are only needed in special -circumstances, so they do not ship with the core. For browser use, they must -be included directly: - -```html - - -``` - -An appropriate version for each dependency is included in the dist/ directory. - -The complete single-file version is generated at `dist/xlsx.full.min.js` - -A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to full build: -- codepage library skipped (no support for XLS encodings) -- XLSX compression option not currently available -- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 -- node stream utils removed - -Webpack and Browserify builds include optional modules by default. Webpack can -be configured to remove support with `resolve.alias`: - -```js - /* uncomment the lines below to remove support */ - resolve: { - alias: { "./dist/cpexcel.js": "" } // <-- omit international support - } -``` - -
- -### ECMAScript 5 Compatibility - -Since the library uses functions like `Array#forEach`, older browsers require -[shims to provide missing functions](https://oss.sheetjs.com/sheetjs/shim.js). - -To use the shim, add the shim before the script tag that loads `xlsx.js`: - -```html - - - - -``` - -The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving -files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script -bundles the shim in a format suitable for Photoshop and other Adobe products. - -## Philosophy - -
- Philosophy (click to show) - -Prior to SheetJS, APIs for processing spreadsheet files were format-specific. -Third-party libraries either supported one format, or they involved a separate -set of classes for each supported file type. Even though XLSB was introduced in -Excel 2007, nothing outside of SheetJS or Excel supported the format. - -To promote a format-agnostic view, SheetJS starts from a pure-JS representation -that we call the ["Common Spreadsheet Format"](#common-spreadsheet-format). -Emphasizing a uniform object representation enables new features like format -conversion (reading an XLSX template and saving as XLS) and circumvents the mess -of classes. By abstracting the complexities of the various formats, tools -need not worry about the specific file type! - -A simple object representation combined with careful coding practices enables -use cases in older browsers and in alternative environments like ExtendScript -and Web Workers. It is always tempting to use the latest and greatest features, -but they tend to require the latest versions of browsers, limiting usability. - -Utility functions capture common use cases like generating JS objects or HTML. -Most simple operations should only require a few lines of code. More complex -operations generally should be straightforward to implement. - -Excel pushes the XLSX format as default starting in Excel 2007. However, there -are other formats with more appealing properties. For example, the XLSB format -is spiritually similar to XLSX but files often tend up taking less than half the -space and open much faster! Even though an XLSX writer is available, other -format writers are available so users can take advantage of the unique -characteristics of each format. - -The primary focus of the Community Edition is correct data interchange, focused -on extracting data from any compatible data representation and exporting data in -various formats suitable for any third party interface. - -
- ## Parsing Workbooks For parsing, the first step is to read the file. This involves acquiring the @@ -890,6 +910,11 @@ Write options are described in the [Writing Options](#writing-options) section. Utilities are available in the `XLSX.utils` object and are described in the [Utility Functions](#utility-functions) section: +**Constructing:** + +- `book_new` creates an empty workbook +- `book_append_sheet` adds a worksheet to a workbook + **Importing:** - `aoa_to_sheet` converts an array of arrays of JS data to a worksheet. @@ -1391,7 +1416,23 @@ formulae and Lotus Parsed formulae have to be explicitly unwound. OpenFormula formulae can be converted with regular expressions. -#### Column Properties +#### Row and Column Properties + +
+ Format Support (click to show) + +**Row Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM, ODS + +**Column Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM + +
+ + +Row and Column properties are not extracted by default when reading from a file +and are not persisted by default when writing to a file. The option +`cellStyles: true` must be passed to the relevant read or write function. + +_Column Properties_ The `!cols` array in each worksheet, if present, is a collection of `ColInfo` objects which have the following properties: @@ -1412,6 +1453,30 @@ type ColInfo = { }; ``` +_Row Properties_ + +The `!rows` array in each worksheet, if present, is a collection of `RowInfo` +objects which have the following properties: + +```typescript +type RowInfo = { + /* visibility */ + hidden?: boolean; // if true, the row is hidden + + /* row height is specified in one of the following ways: */ + hpx?: number; // height in screen pixels + hpt?: number; // height in points + + level?: number; // 0-indexed outline / group level +}; +``` + +_Outline / Group Levels Convention_ + +The Excel UI displays the base outline level as `1` and the max level as `8`. +Following JS conventions, SheetJS uses 0-indexed outline levels wherein the base +outline level is `0` and the max level is `7`. +
Why are there three width types? (click to show) @@ -1442,6 +1507,20 @@ when changing the pixel width, delete the `wch` and `width` properties.
Implementation details (click to show) +_Row Heights_ + +Excel internally stores row heights in points. The default resolution is 72 DPI +or 96 PPI, so the pixel and point size should agree. For different resolutions +they may not agree, so the library separates the concepts. + +Even though all of the information is made available, writers are expected to +follow the priority order: + +1) use `hpx` pixel height if available +2) use `hpt` point height if available + +_Column Widths_ + Given the constraints, it is possible to determine the MDW without actually inspecting the font! The parsers guess the pixel width by converting from width to pixels and back, repeating for all possible MDW and selecting the MDW that @@ -1454,41 +1533,7 @@ follow the priority order: 1) use `width` field if available 2) use `wpx` pixel width if available 3) use `wch` character count if available -
-#### Row Properties - -The `!rows` array in each worksheet, if present, is a collection of `RowInfo` -objects which have the following properties: - -```typescript -type RowInfo = { - /* visibility */ - hidden?: boolean; // if true, the row is hidden - - /* row height is specified in one of the following ways: */ - hpx?: number; // height in screen pixels - hpt?: number; // height in points - - level?: number; // 0-indexed outline / group level -}; -``` - -Note: Excel UI displays the base outline level as `1` and the max level as `8`. -The `level` field stores the base outline as `0` and the max level as `7`. - -
- Implementation details (click to show) - -Excel internally stores row heights in points. The default resolution is 72 DPI -or 96 PPI, so the pixel and point size should agree. For different resolutions -they may not agree, so the library separates the concepts. - -Even though all of the information is made available, writers are expected to -follow the priority order: - -1) use `hpx` pixel height if available -2) use `hpt` point height if available
#### Number Formats @@ -2518,10 +2563,12 @@ range limits will be silently truncated: Excel 2003 SpreadsheetML range limits are governed by the version of Excel and are not enforced by the writer. -### Excel 2007+ XML (XLSX/XLSM) -
- (click to show) + File Format Details (click to show) + +**Core Spreadsheet Formats** + +- **Excel 2007+ XML (XLSX/XLSM)** XLSX and XLSM files are ZIP containers containing a series of XML files in accordance with the Open Packaging Conventions (OPC). The XLSM format, almost @@ -2531,12 +2578,7 @@ The format is standardized in ECMA-376 and later in ISO/IEC 29500. Excel does not follow the specification, and there are additional documents discussing how Excel deviates from the specification. -
- -### Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5) - -
- (click to show) +- **Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)** BIFF 2/3 XLS are single-sheet streams of binary records. Excel 4 introduced the concept of a workbook (`XLW` files) but also had single-sheet `XLS` format. @@ -2548,12 +2590,7 @@ files in these formats, so record lengths and fields were determined by writing in all of the supported formats and comparing files. Excel 2016 can generate BIFF5 files, enabling a full suite of file tests starting from XLSX or BIFF2. -
- -### Excel 97-2004 Binary (BIFF8) - -
- (click to show) +- **Excel 97-2004 Binary (BIFF8)** BIFF8 exclusively uses the Compound File Binary container format, splitting some content into streams within the file. At its core, it still uses an extended @@ -2562,24 +2599,14 @@ version of the binary record format from older versions of BIFF. The `MS-XLS` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. -
- -### Excel 2003-2004 (SpreadsheetML) - -
- (click to show) +- **Excel 2003-2004 (SpreadsheetML)** Predating XLSX, SpreadsheetML files are simple XML files. There is no official and comprehensive specification, although MS has released documentation on the format. Since Excel 2016 can generate SpreadsheetML files, mapping features is pretty straightforward. -
- -### Excel 2007+ Binary (XLSB, BIFF12) - -
- (click to show) +- **Excel 2007+ Binary (XLSB, BIFF12)** Introduced in parallel with XLSX, the XLSB format combines the BIFF architecture with the content separation and ZIP container of XLSX. For the most part nodes @@ -2588,12 +2615,7 @@ in an XLSX sub-file can be mapped to XLSB records in a corresponding sub-file. The `MS-XLSB` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. -
- -### Delimiter-Separated Values (CSV/TXT) - -
- (click to show) +- **Delimiter-Separated Values (CSV/TXT)** Excel CSV deviates from RFC4180 in a number of important ways. The generated CSV files should generally work in Excel although they may not work in RFC4180 @@ -2602,32 +2624,20 @@ writer proactively generates cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and code page 1200. -Notes: +Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic +Link files. Unlike Excel, if the file does not have a valid SYLK header, it +will be proactively reinterpreted as CSV. There are some files with semicolon +delimiter that align with a valid SYLK file. For the broadest compatibility, +all cells with the value of `ID` are automatically wrapped in double-quotes. -- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic - Link files. Unlike Excel, if the file does not have a valid SYLK header, it - will be proactively reinterpreted as CSV. There are some files with semicolon - delimiter that align with a valid SYLK file. For the broadest compatibility, - all cells with the value of `ID` are automatically wrapped in double-quotes. +**Miscellaneous Workbook Formats** -
- -### Other Workbook Formats - -
- (click to show) - -Support for other formats is generally far XLS/XLSB/XLSX support, due in large +Support for other formats is generally far behind XLS/XLSB/XLSX support, due in part to a lack of publicly available documentation. Test files were produced in the respective apps and compared to their XLS exports to determine structure. The main focus is data extraction. -
- -#### Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123) - -
- (click to show) +- **Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)** The Lotus formats consist of binary records similar to the BIFF structure. Lotus did release a specification decades ago covering the original WK1 format. Other @@ -2637,23 +2647,13 @@ Generated WK1 worksheets are compatible with Lotus 1-2-3 R2 and Excel 5.0. Generated WK3 workbooks are compatible with Lotus 1-2-3 R9 and Excel 5.0. -
- -#### Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW) - -
- (click to show) +- **Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)** The Quattro Pro formats use binary records in the same way as BIFF and Lotus. Some of the newer formats (namely WB3 and QPW) use a CFB enclosure just like BIFF8 XLS. -
- -#### Works for DOS / Windows Spreadsheet (WKS/XLR) - -
- (click to show) +- **Works for DOS / Windows Spreadsheet (WKS/XLR)** All versions of Works were limited to a single worksheet. @@ -2669,12 +2669,7 @@ exact Workbook stream for the XLR and the 97-2003 XLS export. Works 6 XLS includes two empty worksheets but the main worksheet has an identical encoding. XLR also includes a `WksSSWorkBook` stream similar to Lotus FM3/FMT files. -
- -#### Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS) - -
- (click to show) +- **Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)** iWork 2013 (Numbers 3.0 / Pages 5.0 / Keynote 6.0) switched from a proprietary XML-based format to the current file format based on the iWork Archive (IWA). @@ -2684,39 +2679,24 @@ The parser focuses on extracting raw data from tables. Numbers technically supports multiple tables in a logical worksheet, including custom titles. This parser will generate one worksheet per Numbers table. -
- -#### OpenDocument Spreadsheet (ODS/FODS) - -
- (click to show) +- **OpenDocument Spreadsheet (ODS/FODS)** ODS is an XML-in-ZIP format akin to XLSX while FODS is an XML format akin to SpreadsheetML. Both are detailed in the OASIS standard, but tools like LO/OO add undocumented extensions. The parsers and writers do not implement the full standard, instead focusing on parts necessary to extract and store raw data. -
- -#### Uniform Office Spreadsheet (UOS1/2) - -
- (click to show) +- **Uniform Office Spreadsheet (UOS1/2)** UOS is a very similar format, and it comes in 2 varieties corresponding to ODS and FODS respectively. For the most part, the difference between the formats is in the names of tags and attributes. -
- -### Other Single-Worksheet Formats +**Miscellaneous Worksheet Formats** Many older formats supported only one worksheet: -#### dBASE and Visual FoxPro (DBF) - -
- (click to show) +- **dBASE and Visual FoxPro (DBF)** DBF is really a typed table format: each column can only hold one data type and each record omits type information. The parser generates a header row and @@ -2727,12 +2707,7 @@ Multi-file extensions like external memos and tables are currently unsupported, limited by the general ability to read arbitrary files in the web browser. The reader understands DBF Level 7 extensions like DATETIME. -
- -#### Symbolic Link (SYLK) - -
- (click to show) +- **Symbolic Link (SYLK)** There is no real documentation. All knowledge was gathered by saving files in various versions of Excel to deduce the meaning of fields. Notes: @@ -2740,23 +2715,13 @@ various versions of Excel to deduce the meaning of fields. Notes: - Plain formulae are stored in the RC form. - Column widths are rounded to integral characters. -
- -#### Lotus Formatted Text (PRN) - -
- (click to show) +- **Lotus Formatted Text (PRN)** There is no real documentation, and in fact Excel treats PRN as an output-only file format. Nevertheless we can guess the column widths and reverse-engineer the original layout. Excel's 240 character width limitation is not enforced. -
- -#### Data Interchange Format (DIF) - -
- (click to show) +- **Data Interchange Format (DIF)** There is no unified definition. Visicalc DIF differs from Lotus DIF, and both differ from Excel DIF. Where ambiguous, the parser/writer follows the expected @@ -2769,12 +2734,7 @@ behavior from Excel. In particular, Excel extends DIF in incompatible ways: - DIF technically has no support for formulae, but Excel will automatically convert plain formulae. Array formulae are not preserved. -
- -#### HTML - -
- (click to show) +- **HTML** Excel HTML worksheets include special metadata encoded in styles. For example, `mso-number-format` is a localized string containing the number format. Despite @@ -2785,22 +2745,12 @@ looks for those tags and overrides the default interpretation. For example, text like `12345` will be parsed as numbers but `12345` will be parsed as text. -
- -#### Rich Text Format (RTF) - -
- (click to show) +- **Rich Text Format (RTF)** Excel RTF worksheets are stored in clipboard when copying cells or ranges from a worksheet. The supported codes are a subset of the Word RTF support. -
- -#### Ethercalc Record Format (ETH) - -
- (click to show) +- **Ethercalc Record Format (ETH)** [Ethercalc](https://ethercalc.net/) is an open source web spreadsheet powered by a record format reminiscent of SYLK wrapped in a MIME multi-part message. diff --git a/bits/83_numbers.js b/bits/83_numbers.js index d1f7272..d1fb749 100755 --- a/bits/83_numbers.js +++ b/bits/83_numbers.js @@ -59,6 +59,13 @@ var NUMBERS = (function() { x = (x & 858993459) + (x >> 2 & 858993459); return (x + (x >> 4) & 252645135) * 16843009 >>> 24; }; + var readDecimal128LE = function(buf, offset) { + var exp = (buf[offset + 15] & 127) << 7 | buf[offset + 14] >> 1; + var mantissa = buf[offset + 14] & 1; + for (var j = offset + 13; j >= offset; --j) + mantissa = mantissa * 256 + buf[j]; + return (buf[offset + 15] & 128 ? -mantissa : mantissa) * Math.pow(10, exp - 6176); + }; // src/proto.ts function parse_varint49(buf, ptr) { @@ -279,10 +286,10 @@ var NUMBERS = (function() { return out; } - // src/prebnccell.ts - function parseit(buf, sst, rsst, version) { + // src/cell.ts + function parse_old_storage(buf, sst, rsst) { var dv = u8_to_dataview(buf); - var ctype = buf[version == 4 ? 1 : 2]; + var ctype = buf[buf[0] == 4 ? 
1 : 2]; var flags = dv.getUint32(4, true); var data_offset = 12 + popcnt(flags & 3470) * 4; var ridx = -1, sidx = -1, ieee = NaN, dt = new Date(2001, 0, 1); @@ -342,14 +349,79 @@ var NUMBERS = (function() { } return ret; } + function parse_storage(buf, sst, rsst) { + var dv = u8_to_dataview(buf); + var ctype = buf[1]; + var flags = dv.getUint32(8, true); + var data_offset = 12; + var ridx = -1, sidx = -1, d128 = NaN, ieee = NaN, dt = new Date(2001, 0, 1); + if (flags & 1) { + d128 = readDecimal128LE(buf, data_offset); + data_offset += 16; + } + if (flags & 2) { + ieee = dv.getFloat64(data_offset, true); + data_offset += 8; + } + if (flags & 4) { + dt.setTime(dt.getTime() + dv.getFloat64(data_offset, true) * 1e3); + data_offset += 8; + } + if (flags & 8) { + sidx = dv.getUint32(data_offset, true); + data_offset += 4; + } + if (flags & 16) { + ridx = dv.getUint32(data_offset, true); + data_offset += 4; + } + var ret; + switch (ctype) { + case 0: + break; + case 2: + ret = { t: "n", v: d128 }; + break; + case 3: + ret = { t: "s", v: sst[sidx] }; + break; + case 5: + ret = { t: "d", v: dt }; + break; + case 6: + ret = { t: "b", v: ieee > 0 }; + break; + case 7: + ret = { t: "n", v: ieee }; + break; + case 8: + ret = { t: "e", v: 0 }; + break; + case 9: + { + if (ridx > -1) + ret = { t: "s", v: rsst[ridx] }; + else + throw new Error("Unsupported cell type ".concat(ctype, " : ").concat(flags & 31, " : ").concat(buf.slice(0, 4))); + } + break; + case 10: + ret = { t: "n", v: d128 }; + break; + default: + throw new Error("Unsupported cell type ".concat(ctype, " : ").concat(flags & 31, " : ").concat(buf.slice(0, 4))); + } + return ret; + } function parse(buf, sst, rsst) { - var version = buf[0]; - switch (version) { + switch (buf[0]) { case 3: case 4: - return parseit(buf, sst, rsst, version); + return parse_old_storage(buf, sst, rsst); + case 5: + return parse_storage(buf, sst, rsst); default: - throw new Error("Unsupported pre-BNC version ".concat(version)); + throw new 
Error("Unsupported payload version ".concat(buf[0])); } } @@ -387,6 +459,10 @@ var NUMBERS = (function() { }; function parse_numbers(cfb) { var out = []; + cfb.FullPaths.forEach(function(p) { + if (p.match(/\.iwpv2/)) + throw new Error("Unsupported password protection"); + }); cfb.FileIndex.forEach(function(s) { if (!s.name.match(/\.iwa$/)) return; @@ -460,16 +536,30 @@ var NUMBERS = (function() { return data; } function parse_TST_TileRowInfo(u8) { + var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j; var pb = parse_shallow(u8); var R = varint_to_i32(pb[1][0].data) >>> 0; - var storage = pb[3][0].data; - var offsets = u8_to_dataview(pb[4][0].data); + var pre_bnc = (_b = (_a = pb[3]) == null ? void 0 : _a[0]) == null ? void 0 : _b.data; + var pre_bnc_offsets = ((_d = (_c = pb[4]) == null ? void 0 : _c[0]) == null ? void 0 : _d.data) && u8_to_dataview(pb[4][0].data); + var storage = (_f = (_e = pb[6]) == null ? void 0 : _e[0]) == null ? void 0 : _f.data; + var storage_offsets = ((_h = (_g = pb[7]) == null ? void 0 : _g[0]) == null ? void 0 : _h.data) && u8_to_dataview(pb[7][0].data); + var wide_offsets = ((_j = (_i = pb[8]) == null ? void 0 : _i[0]) == null ? void 0 : _j.data) && varint_to_i32(pb[8][0].data) > 0 || false; + var width = wide_offsets ? 
4 : 1; var cells = []; - for (var C = 0; C < offsets.byteLength / 2; ++C) { - var off = offsets.getUint16(C * 2, true); - if (off > storage.length) - continue; - cells[C] = storage.subarray(off, offsets.getUint16(C * 2 + 2, true)); + var off = 0; + for (var C = 0; C < pre_bnc_offsets.byteLength / 2; ++C) { + if (storage && storage_offsets) { + off = storage_offsets.getUint16(C * 2, true) * width; + if (off < storage.length) { + cells[C] = storage.subarray(off, storage_offsets.getUint16(C * 2 + 2, true) * width); + continue; + } + } + if (pre_bnc && pre_bnc_offsets) { + off = pre_bnc_offsets.getUint16(C * 2, true) * width; + if (off < pre_bnc.length) + cells[C] = pre_bnc.subarray(off, pre_bnc_offsets.getUint16(C * 2 + 2, true) * width); + } } return { R: R, cells: cells }; } diff --git a/docbits/00_intro.md b/docbits/00_intro.md index a1ec38a..3195682 100644 --- a/docbits/00_intro.md +++ b/docbits/00_intro.md @@ -1,30 +1,13 @@ # [SheetJS](https://sheetjs.com) -Parser and writer for various spreadsheet formats. Pure-JS cleanroom -implementation from official specifications, related documents, and test files. -Emphasis on parsing and writing robustness, cross-format feature compatibility -with a unified JS representation, and ES3/ES5 browser compatibility back to IE6. +The SheetJS Community Edition offers battle-tested open-source solutions for +extracting useful data from almost any complex spreadsheet and generating new +spreadsheets that will work with legacy and modern software alike. -This is the community version. We also offer a pro version with performance -enhancements, additional features like styling, and dedicated support. 
- - -Community Translations of this README: - -- [Simplified Chinese](https://github.com/rockboom/SheetJS-docs-zh-CN) - - -[**Pro Version**](https://sheetjs.com/pro) - -[**Commercial Support**](https://sheetjs.com/support) - -[**Rendered Documentation**](https://docs.sheetjs.com/) - -[**In-Browser Demos**](https://sheetjs.com/demos) - -[**Source Code**](https://git.io/xlsx) - -[**Issues and Bug Reports**](https://github.com/sheetjs/sheetjs/issues) +[SheetJS Pro](https://sheetjs.com/pro) offers solutions beyond data processing: +Edit complex templates with ease; let out your inner Picasso with styling; make +custom sheets with images/graphs/PivotTables; evaluate formula expressions and +port calculations to web apps; automate common spreadsheet tasks, and much more! ![License](https://img.shields.io/github/license/SheetJS/sheetjs) [![Build Status](https://img.shields.io/github/workflow/status/sheetjs/sheetjs/Tests:%20node.js)](https://github.com/SheetJS/sheetjs/actions) diff --git a/docbits/10_install.md b/docbits/10_install.md index ef0f6ac..b6bc764 100644 --- a/docbits/10_install.md +++ b/docbits/10_install.md @@ -1,4 +1,6 @@ -## Installation +## Getting Started + +### Installation In the browser, just add a script tag: @@ -37,3 +39,60 @@ With [bower](https://bower.io/search/?q=js-xlsx): $ bower install js-xlsx ``` +
+ Optional features (click to show) + +The node version automatically requires modules for additional features. Some +of these modules are rather large in size and are only needed in special +circumstances, so they do not ship with the core. For browser use, they must +be included directly: + +```html + + +``` + +An appropriate version for each dependency is included in the dist/ directory. + +The complete single-file version is generated at `dist/xlsx.full.min.js` + +A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to full build: +- codepage library skipped (no support for XLS encodings) +- XLSX compression option not currently available +- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 +- node stream utils removed + +Webpack and Browserify builds include optional modules by default. Webpack can +be configured to remove support with `resolve.alias`: + +```js + /* uncomment the lines below to remove support */ + resolve: { + alias: { "./dist/cpexcel.js": "" } // <-- omit international support + } +``` + +
+ +
+ ECMAScript 3 Compatibility (click to show) + +For broad compatibility with JavaScript engines, the library is written using +ECMAScript 3 language dialect as well as some ES5 features like `Array#forEach`. +Older browsers require shims to provide missing functions. + +To use the shim, add the shim before the script tag that loads `xlsx.js`: + +```html + + + + +``` + +The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving +files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script +bundles the shim in a format suitable for Photoshop and other Adobe products. + +
+ diff --git a/docbits/12_optional.md b/docbits/12_optional.md deleted file mode 100644 index 54c09ca..0000000 --- a/docbits/12_optional.md +++ /dev/null @@ -1,55 +0,0 @@ -### Optional Modules - -
- Optional features (click to show) - -The node version automatically requires modules for additional features. Some -of these modules are rather large in size and are only needed in special -circumstances, so they do not ship with the core. For browser use, they must -be included directly: - -```html - - -``` - -An appropriate version for each dependency is included in the dist/ directory. - -The complete single-file version is generated at `dist/xlsx.full.min.js` - -A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to full build: -- codepage library skipped (no support for XLS encodings) -- XLSX compression option not currently available -- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 -- node stream utils removed - -Webpack and Browserify builds include optional modules by default. Webpack can -be configured to remove support with `resolve.alias`: - -```js - /* uncomment the lines below to remove support */ - resolve: { - alias: { "./dist/cpexcel.js": "" } // <-- omit international support - } -``` - -
- -### ECMAScript 5 Compatibility - -Since the library uses functions like `Array#forEach`, older browsers require -[shims to provide missing functions](https://oss.sheetjs.com/sheetjs/shim.js). - -To use the shim, add the shim before the script tag that loads `xlsx.js`: - -```html - - - - -``` - -The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving -files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script -bundles the shim in a format suitable for Photoshop and other Adobe products. - diff --git a/docbits/13_usage.md b/docbits/13_usage.md new file mode 100644 index 0000000..c0263d6 --- /dev/null +++ b/docbits/13_usage.md @@ -0,0 +1,94 @@ +### Usage + +Most scenarios involving spreadsheets and data can be broken into 5 parts: + +1) **Acquire Data**: Data may be stored anywhere: local or remote files, + databases, HTML TABLE, or even generated programmatically in the web browser. + +2) **Extract Data**: For spreadsheet files, this involves parsing raw bytes to + read the cell data. For general JS data, this involves reshaping the data. + +3) **Process Data**: From generating summary statistics to cleaning data + records, this step is the heart of the problem. + +4) **Package Data**: This can involve making a new spreadsheet or serializing + with `JSON.stringify` or writing XML or simply flattening data for UI tools. + +5) **Release Data**: Spreadsheet files can be uploaded to a server or written + locally. Data can be presented to users in an HTML TABLE or data grid. + +A common problem involves generating a valid spreadsheet export from data stored +in an HTML table. In this example, an HTML TABLE on the page will be scraped, +a row will be added to the bottom with the date of the report, and a new file +will be generated and downloaded locally. 
`XLSX.writeFile` takes care of +packaging the data and attempting a local download: + +```js +// Acquire Data (reference to the HTML table) +var table_elt = document.getElementById("my-table-id"); + +// Extract Data (create a workbook object from the table) +var workbook = XLSX.utils.table_to_book(table_elt); + +// Process Data (add a new row) +var worksheet = workbook.Sheets["Sheet1"]; +XLSX.utils.sheet_add_aoa(worksheet, [["Created "+new Date().toISOString()]], {origin:-1}); + +// Package and Release Data (`writeFile` tries to write and save an XLSB file) +XLSX.writeFile(workbook, "Report.xlsb"); +``` + +This library tries to simplify steps 2 and 4 with functions to extract useful +data from spreadsheet files (`read` / `readFile`) and generate new spreadsheet +files from data (`write` / `writeFile`). + +This documentation and various demo projects cover a number of common scenarios +and approaches for steps 1 and 5. + +Utility functions help with step 3. + + +#### The Zen of SheetJS + + +_File formats are implementation details_ + +The parser covers a wide gamut of common spreadsheet file formats to ensure that +"HTML-saved-as-XLS" files work as well as actual XLS or XLSX files. + +The writer supports a number of common output formats for broad compatibility +with the data ecosystem. + + +_Data processing should fit in any workflow_ + +The library does not impose a separate lifecycle. It fits nicely in websites +and apps built using any framework. The plain JS data objects play nice with +Web Workers and future APIs. + +["Parsing Workbooks"](#parsing-workbooks) describes solutions for common data +import scenarios involving actual spreadsheet files. + +["Writing Workbooks"](#writing-workbooks) describes solutions for common data +export scenarios involving actual spreadsheet files. + +["Utility Functions"](#utility-functions) details utility functions for +translating JSON Arrays and other common JS structures into worksheet objects. 
+ + +_JavaScript is a powerful language for data processing_ + +The ["Common Spreadsheet Format"](#common-spreadsheet-format) is a simple object +representation of the core concepts of a workbook. The various functions in the +library provide low-level tools for working with the object. + +For friendly JS processing, there are utility functions for converting parts of +a worksheet to/from an Array of Arrays. For example, summing columns from an +array of arrays can be implemented in a single Array reduce operation: + +```js +var aoa = XLSX.utils.sheet_to_json(worksheet, {header: 1}); +var sum_of_column_B = aoa.reduce((acc, row) => acc + (+row[1]||0), 0); +``` + + diff --git a/docbits/15_phil.md b/docbits/15_phil.md deleted file mode 100644 index a593cea..0000000 --- a/docbits/15_phil.md +++ /dev/null @@ -1,39 +0,0 @@ -## Philosophy - -
- Philosophy (click to show) - -Prior to SheetJS, APIs for processing spreadsheet files were format-specific. -Third-party libraries either supported one format, or they involved a separate -set of classes for each supported file type. Even though XLSB was introduced in -Excel 2007, nothing outside of SheetJS or Excel supported the format. - -To promote a format-agnostic view, SheetJS starts from a pure-JS representation -that we call the ["Common Spreadsheet Format"](#common-spreadsheet-format). -Emphasizing a uniform object representation enables new features like format -conversion (reading an XLSX template and saving as XLS) and circumvents the mess -of classes. By abstracting the complexities of the various formats, tools -need not worry about the specific file type! - -A simple object representation combined with careful coding practices enables -use cases in older browsers and in alternative environments like ExtendScript -and Web Workers. It is always tempting to use the latest and greatest features, -but they tend to require the latest versions of browsers, limiting usability. - -Utility functions capture common use cases like generating JS objects or HTML. -Most simple operations should only require a few lines of code. More complex -operations generally should be straightforward to implement. - -Excel pushes the XLSX format as default starting in Excel 2007. However, there -are other formats with more appealing properties. For example, the XLSB format -is spiritually similar to XLSX but files often tend up taking less than half the -space and open much faster! Even though an XLSX writer is available, other -format writers are available so users can take advantage of the unique -characteristics of each format. - -The primary focus of the Community Edition is correct data interchange, focused -on extracting data from any compatible data representation and exporting data in -various formats suitable for any third party interface. - -
- diff --git a/docbits/11_demos.md b/docbits/16_demos.md similarity index 100% rename from docbits/11_demos.md rename to docbits/16_demos.md diff --git a/docbits/40_interface.md b/docbits/40_interface.md index dd7bf49..ac8f221 100644 --- a/docbits/40_interface.md +++ b/docbits/40_interface.md @@ -33,6 +33,11 @@ Write options are described in the [Writing Options](#writing-options) section. Utilities are available in the `XLSX.utils` object and are described in the [Utility Functions](#utility-functions) section: +**Constructing:** + +- `book_new` creates an empty workbook +- `book_append_sheet` adds a worksheet to a workbook + **Importing:** - `aoa_to_sheet` converts an array of arrays of JS data to a worksheet. diff --git a/docbits/62_colrow.md b/docbits/62_colrow.md index f366831..9f00c50 100644 --- a/docbits/62_colrow.md +++ b/docbits/62_colrow.md @@ -1,4 +1,20 @@ -#### Column Properties +#### Row and Column Properties + +
+ Format Support (click to show) + +**Row Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM, ODS + +**Column Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM + +
+ + +Row and Column properties are not extracted by default when reading from a file +and are not persisted by default when writing to a file. The option +`cellStyles: true` must be passed to the relevant read or write function. + +_Column Properties_ The `!cols` array in each worksheet, if present, is a collection of `ColInfo` objects which have the following properties: @@ -19,6 +35,30 @@ type ColInfo = { }; ``` +_Row Properties_ + +The `!rows` array in each worksheet, if present, is a collection of `RowInfo` +objects which have the following properties: + +```typescript +type RowInfo = { + /* visibility */ + hidden?: boolean; // if true, the row is hidden + + /* row height is specified in one of the following ways: */ + hpx?: number; // height in screen pixels + hpt?: number; // height in points + + level?: number; // 0-indexed outline / group level +}; +``` + +_Outline / Group Levels Convention_ + +The Excel UI displays the base outline level as `1` and the max level as `8`. +Following JS conventions, SheetJS uses 0-indexed outline levels wherein the base +outline level is `0` and the max level is `7`. +
Why are there three width types? (click to show) @@ -49,6 +89,20 @@ when changing the pixel width, delete the `wch` and `width` properties.
Implementation details (click to show) +_Row Heights_ + +Excel internally stores row heights in points. The default resolution is 72 DPI +or 96 PPI, so the pixel and point size should agree. For different resolutions +they may not agree, so the library separates the concepts. + +Even though all of the information is made available, writers are expected to +follow the priority order: + +1) use `hpx` pixel height if available +2) use `hpt` point height if available + +_Column Widths_ + Given the constraints, it is possible to determine the MDW without actually inspecting the font! The parsers guess the pixel width by converting from width to pixels and back, repeating for all possible MDW and selecting the MDW that @@ -61,40 +115,6 @@ follow the priority order: 1) use `width` field if available 2) use `wpx` pixel width if available 3) use `wch` character count if available -
- -#### Row Properties - -The `!rows` array in each worksheet, if present, is a collection of `RowInfo` -objects which have the following properties: - -```typescript -type RowInfo = { - /* visibility */ - hidden?: boolean; // if true, the row is hidden - - /* row height is specified in one of the following ways: */ - hpx?: number; // height in screen pixels - hpt?: number; // height in points - - level?: number; // 0-indexed outline / group level -}; -``` - -Note: Excel UI displays the base outline level as `1` and the max level as `8`. -The `level` field stores the base outline as `0` and the max level as `7`. - -
- Implementation details (click to show) - -Excel internally stores row heights in points. The default resolution is 72 DPI -or 96 PPI, so the pixel and point size should agree. For different resolutions -they may not agree, so the library separates the concepts. - -Even though all of the information is made available, writers are expected to -follow the priority order: - -1) use `hpx` pixel height if available -2) use `hpt` point height if available +
diff --git a/docbits/85_filetype.md b/docbits/85_filetype.md index e8052c3..675e97f 100644 --- a/docbits/85_filetype.md +++ b/docbits/85_filetype.md @@ -53,10 +53,12 @@ range limits will be silently truncated: Excel 2003 SpreadsheetML range limits are governed by the version of Excel and are not enforced by the writer. -### Excel 2007+ XML (XLSX/XLSM) -
- (click to show) + File Format Details (click to show) + +**Core Spreadsheet Formats** + +- **Excel 2007+ XML (XLSX/XLSM)** XLSX and XLSM files are ZIP containers containing a series of XML files in accordance with the Open Packaging Conventions (OPC). The XLSM format, almost @@ -66,12 +68,7 @@ The format is standardized in ECMA-376 and later in ISO/IEC 29500. Excel does not follow the specification, and there are additional documents discussing how Excel deviates from the specification. -
- -### Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5) - -
- (click to show) +- **Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)** BIFF 2/3 XLS are single-sheet streams of binary records. Excel 4 introduced the concept of a workbook (`XLW` files) but also had single-sheet `XLS` format. @@ -83,12 +80,7 @@ files in these formats, so record lengths and fields were determined by writing in all of the supported formats and comparing files. Excel 2016 can generate BIFF5 files, enabling a full suite of file tests starting from XLSX or BIFF2. -
- -### Excel 97-2004 Binary (BIFF8) - -
- (click to show) +- **Excel 97-2004 Binary (BIFF8)** BIFF8 exclusively uses the Compound File Binary container format, splitting some content into streams within the file. At its core, it still uses an extended @@ -97,24 +89,14 @@ version of the binary record format from older versions of BIFF. The `MS-XLS` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. -
- -### Excel 2003-2004 (SpreadsheetML) - -
- (click to show) +- **Excel 2003-2004 (SpreadsheetML)** Predating XLSX, SpreadsheetML files are simple XML files. There is no official and comprehensive specification, although MS has released documentation on the format. Since Excel 2016 can generate SpreadsheetML files, mapping features is pretty straightforward. -
- -### Excel 2007+ Binary (XLSB, BIFF12) - -
- (click to show) +- **Excel 2007+ Binary (XLSB, BIFF12)** Introduced in parallel with XLSX, the XLSB format combines the BIFF architecture with the content separation and ZIP container of XLSX. For the most part nodes @@ -123,12 +105,7 @@ in an XLSX sub-file can be mapped to XLSB records in a corresponding sub-file. The `MS-XLSB` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. -
- -### Delimiter-Separated Values (CSV/TXT) - -
- (click to show) +- **Delimiter-Separated Values (CSV/TXT)** Excel CSV deviates from RFC4180 in a number of important ways. The generated CSV files should generally work in Excel although they may not work in RFC4180 @@ -137,32 +114,20 @@ writer proactively generates cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and code page 1200. -Notes: +Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic +Link files. Unlike Excel, if the file does not have a valid SYLK header, it +will be proactively reinterpreted as CSV. There are some files with semicolon +delimiter that align with a valid SYLK file. For the broadest compatibility, +all cells with the value of `ID` are automatically wrapped in double-quotes. -- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic - Link files. Unlike Excel, if the file does not have a valid SYLK header, it - will be proactively reinterpreted as CSV. There are some files with semicolon - delimiter that align with a valid SYLK file. For the broadest compatibility, - all cells with the value of `ID` are automatically wrapped in double-quotes. +**Miscellaneous Workbook Formats** -
- -### Other Workbook Formats - -
- (click to show) - -Support for other formats is generally far XLS/XLSB/XLSX support, due in large +Support for other formats is generally far behind XLS/XLSB/XLSX support, due in part to a lack of publicly available documentation. Test files were produced in the respective apps and compared to their XLS exports to determine structure. The main focus is data extraction. -
- -#### Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123) - -
- (click to show) +- **Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)** The Lotus formats consist of binary records similar to the BIFF structure. Lotus did release a specification decades ago covering the original WK1 format. Other @@ -172,23 +137,13 @@ Generated WK1 worksheets are compatible with Lotus 1-2-3 R2 and Excel 5.0. Generated WK3 workbooks are compatible with Lotus 1-2-3 R9 and Excel 5.0. -
- -#### Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW) - -
- (click to show) +- **Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)** The Quattro Pro formats use binary records in the same way as BIFF and Lotus. Some of the newer formats (namely WB3 and QPW) use a CFB enclosure just like BIFF8 XLS. -
- -#### Works for DOS / Windows Spreadsheet (WKS/XLR) - -
- (click to show) +- **Works for DOS / Windows Spreadsheet (WKS/XLR)** All versions of Works were limited to a single worksheet. @@ -204,12 +159,7 @@ exact Workbook stream for the XLR and the 97-2003 XLS export. Works 6 XLS includes two empty worksheets but the main worksheet has an identical encoding. XLR also includes a `WksSSWorkBook` stream similar to Lotus FM3/FMT files. -
- -#### Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS) - -
- (click to show) +- **Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)** iWork 2013 (Numbers 3.0 / Pages 5.0 / Keynote 6.0) switched from a proprietary XML-based format to the current file format based on the iWork Archive (IWA). @@ -219,39 +169,24 @@ The parser focuses on extracting raw data from tables. Numbers technically supports multiple tables in a logical worksheet, including custom titles. This parser will generate one worksheet per Numbers table. -
- -#### OpenDocument Spreadsheet (ODS/FODS) - -
- (click to show) +- **OpenDocument Spreadsheet (ODS/FODS)** ODS is an XML-in-ZIP format akin to XLSX while FODS is an XML format akin to SpreadsheetML. Both are detailed in the OASIS standard, but tools like LO/OO add undocumented extensions. The parsers and writers do not implement the full standard, instead focusing on parts necessary to extract and store raw data. -
- -#### Uniform Office Spreadsheet (UOS1/2) - -
- (click to show) +- **Uniform Office Spreadsheet (UOS1/2)** UOS is a very similar format, and it comes in 2 varieties corresponding to ODS and FODS respectively. For the most part, the difference between the formats is in the names of tags and attributes. -
- -### Other Single-Worksheet Formats +**Miscellaneous Worksheet Formats** Many older formats supported only one worksheet: -#### dBASE and Visual FoxPro (DBF) - -
- (click to show) +- **dBASE and Visual FoxPro (DBF)** DBF is really a typed table format: each column can only hold one data type and each record omits type information. The parser generates a header row and @@ -262,12 +197,7 @@ Multi-file extensions like external memos and tables are currently unsupported, limited by the general ability to read arbitrary files in the web browser. The reader understands DBF Level 7 extensions like DATETIME. -
- -#### Symbolic Link (SYLK) - -
- (click to show) +- **Symbolic Link (SYLK)** There is no real documentation. All knowledge was gathered by saving files in various versions of Excel to deduce the meaning of fields. Notes: @@ -275,23 +205,13 @@ various versions of Excel to deduce the meaning of fields. Notes: - Plain formulae are stored in the RC form. - Column widths are rounded to integral characters. -
- -#### Lotus Formatted Text (PRN) - -
- (click to show) +- **Lotus Formatted Text (PRN)** There is no real documentation, and in fact Excel treats PRN as an output-only file format. Nevertheless we can guess the column widths and reverse-engineer the original layout. Excel's 240 character width limitation is not enforced. -
- -#### Data Interchange Format (DIF) - -
- (click to show) +- **Data Interchange Format (DIF)** There is no unified definition. Visicalc DIF differs from Lotus DIF, and both differ from Excel DIF. Where ambiguous, the parser/writer follows the expected @@ -304,12 +224,7 @@ behavior from Excel. In particular, Excel extends DIF in incompatible ways: - DIF technically has no support for formulae, but Excel will automatically convert plain formulae. Array formulae are not preserved. -
- -#### HTML - -
- (click to show) +- **HTML** Excel HTML worksheets include special metadata encoded in styles. For example, `mso-number-format` is a localized string containing the number format. Despite @@ -320,22 +235,12 @@ looks for those tags and overrides the default interpretation. For example, text like `12345` will be parsed as numbers but `12345` will be parsed as text. -
- -#### Rich Text Format (RTF) - -
- (click to show) +- **Rich Text Format (RTF)** Excel RTF worksheets are stored in clipboard when copying cells or ranges from a worksheet. The supported codes are a subset of the Word RTF support. -
- -#### Ethercalc Record Format (ETH) - -
- (click to show) +- **Ethercalc Record Format (ETH)** [Ethercalc](https://ethercalc.net/) is an open source web spreadsheet powered by a record format reminiscent of SYLK wrapped in a MIME multi-part message. diff --git a/misc/docs/README.md b/misc/docs/README.md index 0ed77b0..c168ab1 100644 --- a/misc/docs/README.md +++ b/misc/docs/README.md @@ -1,30 +1,13 @@ # [SheetJS](https://sheetjs.com) -Parser and writer for various spreadsheet formats. Pure-JS cleanroom -implementation from official specifications, related documents, and test files. -Emphasis on parsing and writing robustness, cross-format feature compatibility -with a unified JS representation, and ES3/ES5 browser compatibility back to IE6. +The SheetJS Community Edition offers battle-tested open-source solutions for +extracting useful data from almost any complex spreadsheet and generating new +spreadsheets that will work with legacy and modern software alike. -This is the community version. We also offer a pro version with performance -enhancements, additional features like styling, and dedicated support. - - -Community Translations of this README: - -- [Simplified Chinese](https://github.com/rockboom/SheetJS-docs-zh-CN) - - -[**Pro Version**](https://sheetjs.com/pro) - -[**Commercial Support**](https://sheetjs.com/support) - -[**Rendered Documentation**](https://docs.sheetjs.com/) - -[**In-Browser Demos**](https://sheetjs.com/demos) - -[**Source Code**](https://git.io/xlsx) - -[**Issues and Bug Reports**](https://github.com/sheetjs/sheetjs/issues) +[SheetJS Pro](https://sheetjs.com/pro) offers solutions beyond data processing: +Edit complex templates with ease; let out your inner Picasso with styling; make +custom sheets with images/graphs/PivotTables; evaluate formula expressions and +port calculations to web apps; automate common spreadsheet tasks, and much more! 
![License](https://img.shields.io/github/license/SheetJS/sheetjs) [![Build Status](https://img.shields.io/github/workflow/status/sheetjs/sheetjs/Tests:%20node.js)](https://github.com/SheetJS/sheetjs/actions) @@ -50,11 +33,11 @@ Community Translations of this README: -- [Installation](#installation) +- [Getting Started](#getting-started) + * [Installation](#installation) + * [Usage](#usage) + + [The Zen of SheetJS](#the-zen-of-sheetjs) * [JS Ecosystem Demos](#js-ecosystem-demos) - * [Optional Modules](#optional-modules) - * [ECMAScript 5 Compatibility](#ecmascript-5-compatibility) -- [Philosophy](#philosophy) - [Parsing Workbooks](#parsing-workbooks) * [Parsing Examples](#parsing-examples) * [Streaming Read](#streaming-read) @@ -85,8 +68,7 @@ Community Translations of this README: + [Miscellaneous Workbook Properties](#miscellaneous-workbook-properties) * [Document Features](#document-features) + [Formulae](#formulae) - + [Column Properties](#column-properties) - + [Row Properties](#row-properties) + + [Row and Column Properties](#row-and-column-properties) + [Number Formats](#number-formats) + [Hyperlinks](#hyperlinks) + [Cell Comments](#cell-comments) @@ -108,27 +90,6 @@ Community Translations of this README: * [HTML Output](#html-output) * [JSON](#json) - [File Formats](#file-formats) - * [Excel 2007+ XML (XLSX/XLSM)](#excel-2007-xml-xlsxxlsm) - * [Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)](#excel-20-95-biff2biff3biff4biff5) - * [Excel 97-2004 Binary (BIFF8)](#excel-97-2004-binary-biff8) - * [Excel 2003-2004 (SpreadsheetML)](#excel-2003-2004-spreadsheetml) - * [Excel 2007+ Binary (XLSB, BIFF12)](#excel-2007-binary-xlsb-biff12) - * [Delimiter-Separated Values (CSV/TXT)](#delimiter-separated-values-csvtxt) - * [Other Workbook Formats](#other-workbook-formats) - + [Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)](#lotus-1-2-3-wkswk1wk2wk3wk4123) - + [Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)](#quattro-pro-wq1wq2wb1wb2wb3qpw) - + [Works for DOS / Windows Spreadsheet 
(WKS/XLR)](#works-for-dos--windows-spreadsheet-wksxlr) - + [Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)](#numbers-30--iwork-2013-spreadsheet-numbers) - + [OpenDocument Spreadsheet (ODS/FODS)](#opendocument-spreadsheet-odsfods) - + [Uniform Office Spreadsheet (UOS1/2)](#uniform-office-spreadsheet-uos12) - * [Other Single-Worksheet Formats](#other-single-worksheet-formats) - + [dBASE and Visual FoxPro (DBF)](#dbase-and-visual-foxpro-dbf) - + [Symbolic Link (SYLK)](#symbolic-link-sylk) - + [Lotus Formatted Text (PRN)](#lotus-formatted-text-prn) - + [Data Interchange Format (DIF)](#data-interchange-format-dif) - + [HTML](#html) - + [Rich Text Format (RTF)](#rich-text-format-rtf) - + [Ethercalc Record Format (ETH)](#ethercalc-record-format-eth) - [Testing](#testing) * [Node](#node) * [Browser](#browser) @@ -144,7 +105,9 @@ Community Translations of this README: -## Installation +## Getting Started + +### Installation In the browser, just add a script tag: @@ -180,6 +143,151 @@ With [bower](https://bower.io/search/?q=js-xlsx): $ bower install js-xlsx ``` + +The node version automatically requires modules for additional features. Some +of these modules are rather large in size and are only needed in special +circumstances, so they do not ship with the core. For browser use, they must +be included directly: + +```html + + +``` + +An appropriate version for each dependency is included in the dist/ directory. + +The complete single-file version is generated at `dist/xlsx.full.min.js` + +A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to full build: +- codepage library skipped (no support for XLS encodings) +- XLSX compression option not currently available +- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 +- node stream utils removed + +Webpack and Browserify builds include optional modules by default. 
Webpack can +be configured to remove support with `resolve.alias`: + +```js + /* uncomment the lines below to remove support */ + resolve: { + alias: { "./dist/cpexcel.js": "" } // <-- omit international support + } +``` + + + +For broad compatibility with JavaScript engines, the library is written using +the ECMAScript 3 language dialect as well as some ES5 features like `Array#forEach`. +Older browsers require shims to provide missing functions. + +To use the shim, add the shim before the script tag that loads `xlsx.js`: + +```html + + + + +``` + +The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving +files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script +bundles the shim in a format suitable for Photoshop and other Adobe products. + + +### Usage + +Most scenarios involving spreadsheets and data can be broken into 5 parts: + +1) **Acquire Data**: Data may be stored anywhere: local or remote files, + databases, HTML TABLE, or even generated programmatically in the web browser. + +2) **Extract Data**: For spreadsheet files, this involves parsing raw bytes to + read the cell data. For general JS data, this involves reshaping the data. + +3) **Process Data**: From generating summary statistics to cleaning data + records, this step is the heart of the problem. + +4) **Package Data**: This can involve making a new spreadsheet or serializing + with `JSON.stringify` or writing XML or simply flattening data for UI tools. + +5) **Release Data**: Spreadsheet files can be uploaded to a server or written + locally. Data can be presented to users in an HTML TABLE or data grid. + +A common problem involves generating a valid spreadsheet export from data stored +in an HTML table. In this example, an HTML TABLE on the page will be scraped, +a row will be added to the bottom with the date of the report, and a new file +will be generated and downloaded locally.
`XLSX.writeFile` takes care of +packaging the data and attempting a local download: + +```js +// Acquire Data (reference to the HTML table) +var table_elt = document.getElementById("my-table-id"); + +// Extract Data (create a workbook object from the table) +var workbook = XLSX.utils.table_to_book(table_elt); + +// Process Data (add a new row) +var worksheet = workbook.Sheets["Sheet1"]; +XLSX.utils.sheet_add_aoa(worksheet, [["Created "+new Date().toISOString()]], {origin:-1}); + +// Package and Release Data (`writeFile` tries to write and save an XLSB file) +XLSX.writeFile(workbook, "Report.xlsb"); +``` + +This library tries to simplify steps 2 and 4 with functions to extract useful +data from spreadsheet files (`read` / `readFile`) and generate new spreadsheet +files from data (`write` / `writeFile`). + +This documentation and various demo projects cover a number of common scenarios +and approaches for steps 1 and 5. + +Utility functions help with step 3. + + +#### The Zen of SheetJS + + +_File formats are implementation details_ + +The parser covers a wide gamut of common spreadsheet file formats to ensure that +"HTML-saved-as-XLS" files work as well as actual XLS or XLSX files. + +The writer supports a number of common output formats for broad compatibility +with the data ecosystem. + + +_Data processing should fit in any workflow_ + +The library does not impose a separate lifecycle. It fits nicely in websites +and apps built using any framework. The plain JS data objects play nice with +Web Workers and future APIs. + +["Parsing Workbooks"](#parsing-workbooks) describes solutions for common data +import scenarios involving actual spreadsheet files. + +["Writing Workbooks"](#writing-workbooks) describes solutions for common data +export scenarios involving actual spreadsheet files. + +["Utility Functions"](#utility-functions) details utility functions for +translating JSON Arrays and other common JS structures into worksheet objects.
+ + +_JavaScript is a powerful language for data processing_ + +The ["Common Spreadsheet Format"](#common-spreadsheet-format) is a simple object +representation of the core concepts of a workbook. The various functions in the +library provide low-level tools for working with the object. + +For friendly JS processing, there are utility functions for converting parts of +a worksheet to/from an Array of Arrays. For example, summing columns from an +array of arrays can be implemented in a single Array reduce operation: + +```js +var aoa = XLSX.utils.sheet_to_json(worksheet, {header: 1}); +var sum_of_column_B = aoa.reduce((acc, row) => acc + (+row[1]||0), 0); +``` + + ### JS Ecosystem Demos The [`demos` directory](demos/) includes sample projects for: @@ -220,94 +328,6 @@ The [`demos` directory](demos/) includes sample projects for: Other examples are included in the [showcase](demos/showcase/). -### Optional Modules - - -The node version automatically requires modules for additional features. Some -of these modules are rather large in size and are only needed in special -circumstances, so they do not ship with the core. For browser use, they must -be included directly: - -```html - - -``` - -An appropriate version for each dependency is included in the dist/ directory. - -The complete single-file version is generated at `dist/xlsx.full.min.js` - -A slimmer build is generated at `dist/xlsx.mini.min.js`. Compared to full build: -- codepage library skipped (no support for XLS encodings) -- XLSX compression option not currently available -- no support for XLSB / XLS / Lotus 1-2-3 / SpreadsheetML 2003 -- node stream utils removed - -Webpack and Browserify builds include optional modules by default. 
Webpack can -be configured to remove support with `resolve.alias`: - -```js - /* uncomment the lines below to remove support */ - resolve: { - alias: { "./dist/cpexcel.js": "" } // <-- omit international support - } -``` - - -### ECMAScript 5 Compatibility - -Since the library uses functions like `Array#forEach`, older browsers require -[shims to provide missing functions](https://oss.sheetjs.com/sheetjs/shim.js). - -To use the shim, add the shim before the script tag that loads `xlsx.js`: - -```html - - - - -``` - -The script also includes `IE_LoadFile` and `IE_SaveFile` for loading and saving -files in Internet Explorer versions 6-9. The `xlsx.extendscript.js` script -bundles the shim in a format suitable for Photoshop and other Adobe products. - -## Philosophy - - -Prior to SheetJS, APIs for processing spreadsheet files were format-specific. -Third-party libraries either supported one format, or they involved a separate -set of classes for each supported file type. Even though XLSB was introduced in -Excel 2007, nothing outside of SheetJS or Excel supported the format. - -To promote a format-agnostic view, SheetJS starts from a pure-JS representation -that we call the ["Common Spreadsheet Format"](#common-spreadsheet-format). -Emphasizing a uniform object representation enables new features like format -conversion (reading an XLSX template and saving as XLS) and circumvents the mess -of classes. By abstracting the complexities of the various formats, tools -need not worry about the specific file type! - -A simple object representation combined with careful coding practices enables -use cases in older browsers and in alternative environments like ExtendScript -and Web Workers. It is always tempting to use the latest and greatest features, -but they tend to require the latest versions of browsers, limiting usability. - -Utility functions capture common use cases like generating JS objects or HTML. -Most simple operations should only require a few lines of code. 
More complex -operations generally should be straightforward to implement. - -Excel pushes the XLSX format as default starting in Excel 2007. However, there -are other formats with more appealing properties. For example, the XLSB format -is spiritually similar to XLSX but files often tend up taking less than half the -space and open much faster! Even though an XLSX writer is available, other -format writers are available so users can take advantage of the unique -characteristics of each format. - -The primary focus of the Community Edition is correct data interchange, focused -on extracting data from any compatible data representation and exporting data in -various formats suitable for any third party interface. - - ## Parsing Workbooks For parsing, the first step is to read the file. This involves acquiring the @@ -813,6 +833,11 @@ Write options are described in the [Writing Options](#writing-options) section. Utilities are available in the `XLSX.utils` object and are described in the [Utility Functions](#utility-functions) section: +**Constructing:** + +- `book_new` creates an empty workbook +- `book_append_sheet` adds a worksheet to a workbook + **Importing:** - `aoa_to_sheet` converts an array of arrays of JS data to a worksheet. @@ -1275,7 +1300,20 @@ cell references, a (not-so-simple) regex conversion is possible. BIFF Parsed formulae and Lotus Parsed formulae have to be explicitly unwound. OpenFormula formulae can be converted with regular expressions. -#### Column Properties +#### Row and Column Properties + + +**Row Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM, ODS + +**Column Properties**: XLSX/M, XLSB, BIFF8 XLS, XLML, SYLK, DOM + + + +Row and Column properties are not extracted by default when reading from a file +and are not persisted by default when writing to a file. The option +`cellStyles: true` must be passed to the relevant read or write function. 
+ +_Column Properties_ The `!cols` array in each worksheet, if present, is a collection of `ColInfo` objects which have the following properties: @@ -1296,6 +1334,30 @@ type ColInfo = { }; ``` +_Row Properties_ + +The `!rows` array in each worksheet, if present, is a collection of `RowInfo` +objects which have the following properties: + +```typescript +type RowInfo = { + /* visibility */ + hidden?: boolean; // if true, the row is hidden + + /* row height is specified in one of the following ways: */ + hpx?: number; // height in screen pixels + hpt?: number; // height in points + + level?: number; // 0-indexed outline / group level +}; +``` + +_Outline / Group Levels Convention_ + +The Excel UI displays the base outline level as `1` and the max level as `8`. +Following JS conventions, SheetJS uses 0-indexed outline levels wherein the base +outline level is `0` and the max level is `7`. + There are three different width types corresponding to the three different ways spreadsheets store column widths: @@ -1321,6 +1383,20 @@ conflicts, manipulation should delete the other properties first. For example, when changing the pixel width, delete the `wch` and `width` properties. +_Row Heights_ + +Excel internally stores row heights in points. The default resolution is 72 DPI +or 96 PPI, so the pixel and point size should agree. For different resolutions +they may not agree, so the library separates the concepts. + +Even though all of the information is made available, writers are expected to +follow the priority order: + +1) use `hpx` pixel height if available +2) use `hpt` point height if available + +_Column Widths_ + Given the constraints, it is possible to determine the MDW without actually inspecting the font! 
The parsers guess the pixel width by converting from width to pixels and back, repeating for all possible MDW and selecting the MDW that @@ -1334,37 +1410,6 @@ follow the priority order: 2) use `wpx` pixel width if available 3) use `wch` character count if available -#### Row Properties - -The `!rows` array in each worksheet, if present, is a collection of `RowInfo` -objects which have the following properties: - -```typescript -type RowInfo = { - /* visibility */ - hidden?: boolean; // if true, the row is hidden - - /* row height is specified in one of the following ways: */ - hpx?: number; // height in screen pixels - hpt?: number; // height in points - - level?: number; // 0-indexed outline / group level -}; -``` - -Note: Excel UI displays the base outline level as `1` and the max level as `8`. -The `level` field stores the base outline as `0` and the max level as `7`. - - -Excel internally stores row heights in points. The default resolution is 72 DPI -or 96 PPI, so the pixel and point size should agree. For different resolutions -they may not agree, so the library separates the concepts. - -Even though all of the information is made available, writers are expected to -follow the priority order: - -1) use `hpx` pixel height if available -2) use `hpt` point height if available #### Number Formats @@ -2336,8 +2381,10 @@ range limits will be silently truncated: Excel 2003 SpreadsheetML range limits are governed by the version of Excel and are not enforced by the writer. -### Excel 2007+ XML (XLSX/XLSM) +**Core Spreadsheet Formats** + +- **Excel 2007+ XML (XLSX/XLSM)** XLSX and XLSM files are ZIP containers containing a series of XML files in accordance with the Open Packaging Conventions (OPC). The XLSM format, almost @@ -2347,9 +2394,7 @@ The format is standardized in ECMA-376 and later in ISO/IEC 29500. Excel does not follow the specification, and there are additional documents discussing how Excel deviates from the specification. 
- -### Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5) - +- **Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)** BIFF 2/3 XLS are single-sheet streams of binary records. Excel 4 introduced the concept of a workbook (`XLW` files) but also had single-sheet `XLS` format. @@ -2361,9 +2406,7 @@ files in these formats, so record lengths and fields were determined by writing in all of the supported formats and comparing files. Excel 2016 can generate BIFF5 files, enabling a full suite of file tests starting from XLSX or BIFF2. - -### Excel 97-2004 Binary (BIFF8) - +- **Excel 97-2004 Binary (BIFF8)** BIFF8 exclusively uses the Compound File Binary container format, splitting some content into streams within the file. At its core, it still uses an extended @@ -2372,18 +2415,14 @@ version of the binary record format from older versions of BIFF. The `MS-XLS` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. - -### Excel 2003-2004 (SpreadsheetML) - +- **Excel 2003-2004 (SpreadsheetML)** Predating XLSX, SpreadsheetML files are simple XML files. There is no official and comprehensive specification, although MS has released documentation on the format. Since Excel 2016 can generate SpreadsheetML files, mapping features is pretty straightforward. - -### Excel 2007+ Binary (XLSB, BIFF12) - +- **Excel 2007+ Binary (XLSB, BIFF12)** Introduced in parallel with XLSX, the XLSB format combines the BIFF architecture with the content separation and ZIP container of XLSX. For the most part nodes @@ -2392,9 +2431,7 @@ in an XLSX sub-file can be mapped to XLSB records in a corresponding sub-file. The `MS-XLSB` specification covers the basics of the file format, and other specifications expand on serialization of features like properties. - -### Delimiter-Separated Values (CSV/TXT) - +- **Delimiter-Separated Values (CSV/TXT)** Excel CSV deviates from RFC4180 in a number of important ways. 
The generated CSV files should generally work in Excel although they may not work in RFC4180 @@ -2403,26 +2440,20 @@ writer proactively generates cells for formulae if values are unavailable. Excel TXT uses tab as the delimiter and code page 1200. -Notes: +Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic +Link files. Unlike Excel, if the file does not have a valid SYLK header, it +will be proactively reinterpreted as CSV. There are some files with semicolon +delimiter that align with a valid SYLK file. For the broadest compatibility, +all cells with the value of `ID` are automatically wrapped in double-quotes. -- Like in Excel, files starting with `0x49 0x44 ("ID")` are treated as Symbolic - Link files. Unlike Excel, if the file does not have a valid SYLK header, it - will be proactively reinterpreted as CSV. There are some files with semicolon - delimiter that align with a valid SYLK file. For the broadest compatibility, - all cells with the value of `ID` are automatically wrapped in double-quotes. +**Miscellaneous Workbook Formats** - -### Other Workbook Formats - - -Support for other formats is generally far XLS/XLSB/XLSX support, due in large +Support for other formats is generally far behind XLS/XLSB/XLSX support, due in part to a lack of publicly available documentation. Test files were produced in the respective apps and compared to their XLS exports to determine structure. The main focus is data extraction. - -#### Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123) - +- **Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)** The Lotus formats consist of binary records similar to the BIFF structure. Lotus did release a specification decades ago covering the original WK1 format. Other @@ -2432,17 +2463,13 @@ Generated WK1 worksheets are compatible with Lotus 1-2-3 R2 and Excel 5.0. Generated WK3 workbooks are compatible with Lotus 1-2-3 R9 and Excel 5.0. 
- -#### Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW) - +- **Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)** The Quattro Pro formats use binary records in the same way as BIFF and Lotus. Some of the newer formats (namely WB3 and QPW) use a CFB enclosure just like BIFF8 XLS. - -#### Works for DOS / Windows Spreadsheet (WKS/XLR) - +- **Works for DOS / Windows Spreadsheet (WKS/XLR)** All versions of Works were limited to a single worksheet. @@ -2458,9 +2485,7 @@ exact Workbook stream for the XLR and the 97-2003 XLS export. Works 6 XLS includes two empty worksheets but the main worksheet has an identical encoding. XLR also includes a `WksSSWorkBook` stream similar to Lotus FM3/FMT files. - -#### Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS) - +- **Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)** iWork 2013 (Numbers 3.0 / Pages 5.0 / Keynote 6.0) switched from a proprietary XML-based format to the current file format based on the iWork Archive (IWA). @@ -2470,30 +2495,24 @@ The parser focuses on extracting raw data from tables. Numbers technically supports multiple tables in a logical worksheet, including custom titles. This parser will generate one worksheet per Numbers table. - -#### OpenDocument Spreadsheet (ODS/FODS) - +- **OpenDocument Spreadsheet (ODS/FODS)** ODS is an XML-in-ZIP format akin to XLSX while FODS is an XML format akin to SpreadsheetML. Both are detailed in the OASIS standard, but tools like LO/OO add undocumented extensions. The parsers and writers do not implement the full standard, instead focusing on parts necessary to extract and store raw data. - -#### Uniform Office Spreadsheet (UOS1/2) - +- **Uniform Office Spreadsheet (UOS1/2)** UOS is a very similar format, and it comes in 2 varieties corresponding to ODS and FODS respectively. For the most part, the difference between the formats is in the names of tags and attributes. 
- -### Other Single-Worksheet Formats +**Miscellaneous Worksheet Formats** Many older formats supported only one worksheet: -#### dBASE and Visual FoxPro (DBF) - +- **dBASE and Visual FoxPro (DBF)** DBF is really a typed table format: each column can only hold one data type and each record omits type information. The parser generates a header row and @@ -2504,9 +2523,7 @@ Multi-file extensions like external memos and tables are currently unsupported, limited by the general ability to read arbitrary files in the web browser. The reader understands DBF Level 7 extensions like DATETIME. - -#### Symbolic Link (SYLK) - +- **Symbolic Link (SYLK)** There is no real documentation. All knowledge was gathered by saving files in various versions of Excel to deduce the meaning of fields. Notes: @@ -2514,17 +2531,13 @@ various versions of Excel to deduce the meaning of fields. Notes: - Plain formulae are stored in the RC form. - Column widths are rounded to integral characters. - -#### Lotus Formatted Text (PRN) - +- **Lotus Formatted Text (PRN)** There is no real documentation, and in fact Excel treats PRN as an output-only file format. Nevertheless we can guess the column widths and reverse-engineer the original layout. Excel's 240 character width limitation is not enforced. - -#### Data Interchange Format (DIF) - +- **Data Interchange Format (DIF)** There is no unified definition. Visicalc DIF differs from Lotus DIF, and both differ from Excel DIF. Where ambiguous, the parser/writer follows the expected @@ -2537,9 +2550,7 @@ behavior from Excel. In particular, Excel extends DIF in incompatible ways: - DIF technically has no support for formulae, but Excel will automatically convert plain formulae. Array formulae are not preserved. - -#### HTML - +- **HTML** Excel HTML worksheets include special metadata encoded in styles. For example, `mso-number-format` is a localized string containing the number format. 
Despite @@ -2550,16 +2561,12 @@ looks for those tags and overrides the default interpretation. For example, text like `12345` will be parsed as numbers but `12345` will be parsed as text. - -#### Rich Text Format (RTF) - +- **Rich Text Format (RTF)** Excel RTF worksheets are stored in clipboard when copying cells or ranges from a worksheet. The supported codes are a subset of the Word RTF support. - -#### Ethercalc Record Format (ETH) - +- **Ethercalc Record Format (ETH)** [Ethercalc](https://ethercalc.net/) is an open source web spreadsheet powered by a record format reminiscent of SYLK wrapped in a MIME multi-part message. diff --git a/misc/docs/SUMMARY.md b/misc/docs/SUMMARY.md index e876876..2f9372a 100644 --- a/misc/docs/SUMMARY.md +++ b/misc/docs/SUMMARY.md @@ -1,11 +1,11 @@ # Summary - [xlsx](README.md#sheetjs-js-xlsx) -- [Installation](README.md#installation) +- [Getting Started](README.md#getting-started) + * [Installation](README.md#installation) + * [Usage](README.md#usage) + + [The Zen of SheetJS](README.md#the-zen-of-sheetjs) * [JS Ecosystem Demos](README.md#js-ecosystem-demos) - * [Optional Modules](README.md#optional-modules) - * [ECMAScript 5 Compatibility](README.md#ecmascript-5-compatibility) -- [Philosophy](README.md#philosophy) - [Parsing Workbooks](README.md#parsing-workbooks) * [Parsing Examples](README.md#parsing-examples) * [Streaming Read](README.md#streaming-read) @@ -36,8 +36,7 @@ + [Miscellaneous Workbook Properties](README.md#miscellaneous-workbook-properties) * [Document Features](README.md#document-features) + [Formulae](README.md#formulae) - + [Column Properties](README.md#column-properties) - + [Row Properties](README.md#row-properties) + + [Row and Column Properties](README.md#row-and-column-properties) + [Number Formats](README.md#number-formats) + [Hyperlinks](README.md#hyperlinks) + [Cell Comments](README.md#cell-comments) @@ -59,27 +58,6 @@ * [HTML Output](README.md#html-output) * [JSON](README.md#json) - [File 
Formats](README.md#file-formats) - * [Excel 2007+ XML (XLSX/XLSM)](README.md#excel-2007-xml-xlsxxlsm) - * [Excel 2.0-95 (BIFF2/BIFF3/BIFF4/BIFF5)](README.md#excel-20-95-biff2biff3biff4biff5) - * [Excel 97-2004 Binary (BIFF8)](README.md#excel-97-2004-binary-biff8) - * [Excel 2003-2004 (SpreadsheetML)](README.md#excel-2003-2004-spreadsheetml) - * [Excel 2007+ Binary (XLSB, BIFF12)](README.md#excel-2007-binary-xlsb-biff12) - * [Delimiter-Separated Values (CSV/TXT)](README.md#delimiter-separated-values-csvtxt) - * [Other Workbook Formats](README.md#other-workbook-formats) - + [Lotus 1-2-3 (WKS/WK1/WK2/WK3/WK4/123)](README.md#lotus-1-2-3-wkswk1wk2wk3wk4123) - + [Quattro Pro (WQ1/WQ2/WB1/WB2/WB3/QPW)](README.md#quattro-pro-wq1wq2wb1wb2wb3qpw) - + [Works for DOS / Windows Spreadsheet (WKS/XLR)](README.md#works-for-dos--windows-spreadsheet-wksxlr) - + [Numbers 3.0+ / iWork 2013+ Spreadsheet (NUMBERS)](README.md#numbers-30--iwork-2013-spreadsheet-numbers) - + [OpenDocument Spreadsheet (ODS/FODS)](README.md#opendocument-spreadsheet-odsfods) - + [Uniform Office Spreadsheet (UOS1/2)](README.md#uniform-office-spreadsheet-uos12) - * [Other Single-Worksheet Formats](README.md#other-single-worksheet-formats) - + [dBASE and Visual FoxPro (DBF)](README.md#dbase-and-visual-foxpro-dbf) - + [Symbolic Link (SYLK)](README.md#symbolic-link-sylk) - + [Lotus Formatted Text (PRN)](README.md#lotus-formatted-text-prn) - + [Data Interchange Format (DIF)](README.md#data-interchange-format-dif) - + [HTML](README.md#html) - + [Rich Text Format (RTF)](README.md#rich-text-format-rtf) - + [Ethercalc Record Format (ETH)](README.md#ethercalc-record-format-eth) - [Testing](README.md#testing) * [Node](README.md#node) * [Browser](README.md#browser) diff --git a/modules/83_numbers.js b/modules/83_numbers.js index d1f7272..d1fb749 100644 --- a/modules/83_numbers.js +++ b/modules/83_numbers.js @@ -59,6 +59,13 @@ var NUMBERS = (function() { x = (x & 858993459) + (x >> 2 & 858993459); return (x + (x >> 4) & 
252645135) * 16843009 >>> 24; }; + var readDecimal128LE = function(buf, offset) { + var exp = (buf[offset + 15] & 127) << 7 | buf[offset + 14] >> 1; + var mantissa = buf[offset + 14] & 1; + for (var j = offset + 13; j >= offset; --j) + mantissa = mantissa * 256 + buf[j]; + return (buf[offset + 15] & 128 ? -mantissa : mantissa) * Math.pow(10, exp - 6176); + }; // src/proto.ts function parse_varint49(buf, ptr) { @@ -279,10 +286,10 @@ var NUMBERS = (function() { return out; } - // src/prebnccell.ts - function parseit(buf, sst, rsst, version) { + // src/cell.ts + function parse_old_storage(buf, sst, rsst) { var dv = u8_to_dataview(buf); - var ctype = buf[version == 4 ? 1 : 2]; + var ctype = buf[buf[0] == 4 ? 1 : 2]; var flags = dv.getUint32(4, true); var data_offset = 12 + popcnt(flags & 3470) * 4; var ridx = -1, sidx = -1, ieee = NaN, dt = new Date(2001, 0, 1); @@ -342,14 +349,79 @@ var NUMBERS = (function() { } return ret; } + function parse_storage(buf, sst, rsst) { + var dv = u8_to_dataview(buf); + var ctype = buf[1]; + var flags = dv.getUint32(8, true); + var data_offset = 12; + var ridx = -1, sidx = -1, d128 = NaN, ieee = NaN, dt = new Date(2001, 0, 1); + if (flags & 1) { + d128 = readDecimal128LE(buf, data_offset); + data_offset += 16; + } + if (flags & 2) { + ieee = dv.getFloat64(data_offset, true); + data_offset += 8; + } + if (flags & 4) { + dt.setTime(dt.getTime() + dv.getFloat64(data_offset, true) * 1e3); + data_offset += 8; + } + if (flags & 8) { + sidx = dv.getUint32(data_offset, true); + data_offset += 4; + } + if (flags & 16) { + ridx = dv.getUint32(data_offset, true); + data_offset += 4; + } + var ret; + switch (ctype) { + case 0: + break; + case 2: + ret = { t: "n", v: d128 }; + break; + case 3: + ret = { t: "s", v: sst[sidx] }; + break; + case 5: + ret = { t: "d", v: dt }; + break; + case 6: + ret = { t: "b", v: ieee > 0 }; + break; + case 7: + ret = { t: "n", v: ieee }; + break; + case 8: + ret = { t: "e", v: 0 }; + break; + case 9: + { + if (ridx > 
-1) + ret = { t: "s", v: rsst[ridx] }; + else + throw new Error("Unsupported cell type ".concat(ctype, " : ").concat(flags & 31, " : ").concat(buf.slice(0, 4))); + } + break; + case 10: + ret = { t: "n", v: d128 }; + break; + default: + throw new Error("Unsupported cell type ".concat(ctype, " : ").concat(flags & 31, " : ").concat(buf.slice(0, 4))); + } + return ret; + } function parse(buf, sst, rsst) { - var version = buf[0]; - switch (version) { + switch (buf[0]) { case 3: case 4: - return parseit(buf, sst, rsst, version); + return parse_old_storage(buf, sst, rsst); + case 5: + return parse_storage(buf, sst, rsst); default: - throw new Error("Unsupported pre-BNC version ".concat(version)); + throw new Error("Unsupported payload version ".concat(buf[0])); } } @@ -387,6 +459,10 @@ var NUMBERS = (function() { }; function parse_numbers(cfb) { var out = []; + cfb.FullPaths.forEach(function(p) { + if (p.match(/\.iwpv2/)) + throw new Error("Unsupported password protection"); + }); cfb.FileIndex.forEach(function(s) { if (!s.name.match(/\.iwa$/)) return; @@ -460,16 +536,30 @@ var NUMBERS = (function() { return data; } function parse_TST_TileRowInfo(u8) { + var _a, _b, _c, _d, _e, _f, _g, _h, _i, _j; var pb = parse_shallow(u8); var R = varint_to_i32(pb[1][0].data) >>> 0; - var storage = pb[3][0].data; - var offsets = u8_to_dataview(pb[4][0].data); + var pre_bnc = (_b = (_a = pb[3]) == null ? void 0 : _a[0]) == null ? void 0 : _b.data; + var pre_bnc_offsets = ((_d = (_c = pb[4]) == null ? void 0 : _c[0]) == null ? void 0 : _d.data) && u8_to_dataview(pb[4][0].data); + var storage = (_f = (_e = pb[6]) == null ? void 0 : _e[0]) == null ? void 0 : _f.data; + var storage_offsets = ((_h = (_g = pb[7]) == null ? void 0 : _g[0]) == null ? void 0 : _h.data) && u8_to_dataview(pb[7][0].data); + var wide_offsets = ((_j = (_i = pb[8]) == null ? void 0 : _i[0]) == null ? void 0 : _j.data) && varint_to_i32(pb[8][0].data) > 0 || false; + var width = wide_offsets ? 
4 : 1; var cells = []; - for (var C = 0; C < offsets.byteLength / 2; ++C) { - var off = offsets.getUint16(C * 2, true); - if (off > storage.length) - continue; - cells[C] = storage.subarray(off, offsets.getUint16(C * 2 + 2, true)); + var off = 0; + for (var C = 0; C < pre_bnc_offsets.byteLength / 2; ++C) { + if (storage && storage_offsets) { + off = storage_offsets.getUint16(C * 2, true) * width; + if (off < storage.length) { + cells[C] = storage.subarray(off, storage_offsets.getUint16(C * 2 + 2, true) * width); + continue; + } + } + if (pre_bnc && pre_bnc_offsets) { + off = pre_bnc_offsets.getUint16(C * 2, true) * width; + if (off < pre_bnc.length) + cells[C] = pre_bnc.subarray(off, pre_bnc_offsets.getUint16(C * 2 + 2, true) * width); + } } return { R: R, cells: cells }; } diff --git a/modules/src/cell.ts b/modules/src/cell.ts new file mode 100644 index 0000000..66930c7 --- /dev/null +++ b/modules/src/cell.ts @@ -0,0 +1,88 @@ +/*! sheetjs (C) 2013-present SheetJS -- http://sheetjs.com */ +import { CellObject } from '../../'; +import { u8_to_dataview, popcnt, readDecimal128LE } from './util'; + +function parse_old_storage(buf: Uint8Array, sst: string[], rsst: string[]): CellObject { + var dv = u8_to_dataview(buf); + var ctype = buf[buf[0] == 4 ? 
1 : 2]; + + /* TODO: find the correct field position of number formats, formulae, etc */ + var flags = dv.getUint32(4, true); + var data_offset = 12 + popcnt(flags & 0x0D8E) * 4; + + var ridx = -1, sidx = -1, ieee = NaN, dt = new Date(2001, 0, 1); + if(flags & 0x0200) { ridx = dv.getUint32(data_offset, true); data_offset += 4; } + data_offset += popcnt(flags & 0x3000) * 4; + if(flags & 0x0010) { sidx = dv.getUint32(data_offset, true); data_offset += 4; } + if(flags & 0x0020) { ieee = dv.getFloat64(data_offset, true); data_offset += 8; } + if(flags & 0x0040) { dt.setTime(dt.getTime() + dv.getFloat64(data_offset, true) * 1000); data_offset += 8; } + + var ret: CellObject; + switch(ctype) { + case 0: break; // return { t: "z" }; // blank? + case 2: ret = { t: "n", v: ieee }; break; // number + case 3: ret = { t: "s", v: sst[sidx] }; break; // string + case 5: ret = { t: "d", v: dt }; break; // date-time + case 6: ret = { t: "b", v: ieee > 0 }; break; // boolean + case 7: ret = { t: "n", v: ieee }; break; // duration in seconds TODO: emit [hh]:[mm] style format with adjusted value + case 8: ret = { t: "e", v: 0}; break; // "formula error" TODO: enumerate and map errors to csf equivalents + case 9: { // "automatic"? 
+ if(ridx > -1) ret = { t: "s", v: rsst[ridx] }; + else if(sidx > -1) ret = { t: "s", v: sst[sidx] }; + else if(!isNaN(ieee)) ret = { t: "n", v: ieee }; + else throw new Error(`Unsupported cell type ${buf.slice(0,4)}`); + } break; + default: throw new Error(`Unsupported cell type ${buf.slice(0,4)}`); + } + /* TODO: Some fields appear after the cell data */ + + return ret; +} + +function parse_storage(buf: Uint8Array, sst: string[], rsst: string[]): CellObject { + var dv = u8_to_dataview(buf); + var ctype = buf[1]; + + /* TODO: find the correct field position of number formats, formulae, etc */ + var flags = dv.getUint32(8, true); + var data_offset = 12; + + var ridx = -1, sidx = -1, d128 = NaN, ieee = NaN, dt = new Date(2001, 0, 1); + + if(flags & 0x0001) { d128 = readDecimal128LE(buf, data_offset); data_offset += 16; } + if(flags & 0x0002) { ieee = dv.getFloat64(data_offset, true); data_offset += 8; } + if(flags & 0x0004) { dt.setTime(dt.getTime() + dv.getFloat64(data_offset, true) * 1000); data_offset += 8; } + if(flags & 0x0008) { sidx = dv.getUint32(data_offset, true); data_offset += 4; } + if(flags & 0x0010) { ridx = dv.getUint32(data_offset, true); data_offset += 4; } + + var ret: CellObject; + switch(ctype) { + case 0: break; // return { t: "z" }; // blank? + case 2: ret = { t: "n", v: d128 }; break; // number + case 3: ret = { t: "s", v: sst[sidx] }; break; // string + case 5: ret = { t: "d", v: dt }; break; // date-time + case 6: ret = { t: "b", v: ieee > 0 }; break; // boolean + case 7: ret = { t: "n", v: ieee }; break; // duration in seconds TODO: emit [hh]:[mm] style format with adjusted value + case 8: ret = { t: "e", v: 0}; break; // "formula error" TODO: enumerate and map errors to csf equivalents + case 9: { // "automatic"? 
+ if(ridx > -1) ret = { t: "s", v: rsst[ridx] }; + else throw new Error(`Unsupported cell type ${ctype} : ${flags & 0x1F} : ${buf.slice(0,4)}`); + } break; + case 10: ret = { t: "n", v: d128 }; break; // currency + default: throw new Error(`Unsupported cell type ${ctype} : ${flags & 0x1F} : ${buf.slice(0,4)}`); + } + /* TODO: All styling fields appear after the cell data */ + + return ret; +} + +function parse(buf: Uint8Array, sst: string[], rsst: string[]): CellObject { + switch(buf[0]) { + /* TODO: 0-2? */ + case 3: case 4: return parse_old_storage(buf, sst, rsst); + case 5: return parse_storage(buf, sst, rsst); + default: throw new Error(`Unsupported payload version ${buf[0]}`); + } +} + +export { parse }; \ No newline at end of file diff --git a/modules/src/numbers.ts b/modules/src/numbers.ts index 33db6a8..d36ac59 100644 --- a/modules/src/numbers.ts +++ b/modules/src/numbers.ts @@ -5,7 +5,7 @@ import { u8str, u8_to_dataview } from './util'; import { parse_shallow, varint_to_i32, parse_varint49, mappa } from './proto'; import { deframe } from './frame'; import { IWAArchiveInfo, IWAMessage, parse_iwa } from './iwa'; -import { parse as parse_bnc } from "./prebnccell"; +import { parse as parse_storage } from "./cell"; /* written here to avoid a full import of the 'xlsx' library */ var encode_col = (C: number): string => { @@ -27,6 +27,7 @@ var book_append_sheet = (wb: WorkBook, ws: WorkSheet, name?: string): void => { function parse_numbers(cfb: CFB$Container): WorkBook { var out: IWAMessage[][] = []; + cfb.FullPaths.forEach(p => { if(p.match(/\.iwpv2/)) throw new Error(`Unsupported password protection`); }); /* collect entire message space */ cfb.FileIndex.forEach(s => { if(!s.name.match(/\.iwa$/)) return; @@ -99,13 +100,24 @@ interface TileRowInfo { function parse_TST_TileRowInfo(u8: Uint8Array): TileRowInfo { var pb = parse_shallow(u8); var R = varint_to_i32(pb[1][0].data) >>> 0; - var storage = pb[3][0].data; - var offsets = u8_to_dataview(pb[4][0].data); + 
var pre_bnc = pb[3]?.[0]?.data; + var pre_bnc_offsets = pb[4]?.[0]?.data && u8_to_dataview(pb[4][0].data); + var storage = pb[6]?.[0]?.data; + var storage_offsets = pb[7]?.[0]?.data && u8_to_dataview(pb[7][0].data); + var wide_offsets = pb[8]?.[0]?.data && varint_to_i32(pb[8][0].data) > 0 || false; + var width = wide_offsets ? 4 : 1; var cells = []; - for(var C = 0; C < offsets.byteLength/2; ++C) { - var off = offsets.getUint16(C*2, true); - if(off > storage.length) continue; - cells[C] = storage.subarray(off, offsets.getUint16(C*2+2, true)); + var off = 0; + for(var C = 0; C < pre_bnc_offsets.byteLength/2; ++C) { + /* prefer storage if it is present, otherwise fall back on pre_bnc */ + if(storage && storage_offsets) { + off = storage_offsets.getUint16(C*2, true) * width; + if(off < storage.length) { cells[C] = storage.subarray(off, storage_offsets.getUint16(C*2+2, true) * width); continue; } + } + if(pre_bnc && pre_bnc_offsets) { + off = pre_bnc_offsets.getUint16(C*2, true) * width; + if(off < pre_bnc.length) cells[C] = pre_bnc.subarray(off, pre_bnc_offsets.getUint16(C*2+2, true) * width); + } } return { R, cells }; } @@ -155,7 +167,7 @@ function parse_TST_TableModelArchive(M: IWAMessage[][], root: IWAMessage, ws: Wo tile.ref.forEach((row, R) => { row.forEach((buf, C) => { var addr = encode_cell({r:R,c:C}); - var res = parse_bnc(buf, sst, rsst); + var res = parse_storage(buf, sst, rsst); if(res) ws[addr] = res; }); }); diff --git a/modules/src/prebnccell.ts b/modules/src/prebnccell.ts deleted file mode 100644 index 2b42b5f..0000000 --- a/modules/src/prebnccell.ts +++ /dev/null @@ -1,50 +0,0 @@ -/*! sheetjs (C) 2013-present SheetJS -- http://sheetjs.com */ -import { CellObject } from '../../'; -import { u8_to_dataview, popcnt } from './util'; - -function parseit(buf: Uint8Array, sst: string[], rsst: string[], version: number): CellObject { - var dv = u8_to_dataview(buf); - var ctype = buf[version == 4 ? 
1 : 2]; - - /* TODO: find the correct field position of number formats, formulae, etc */ - var flags = dv.getUint32(4, true); - var data_offset = 12 + popcnt(flags & 0x0D8E) * 4; - - var ridx = -1, sidx = -1, ieee = NaN, dt = new Date(2001, 0, 1); - if(flags & 0x0200) { ridx = dv.getUint32(data_offset, true); data_offset += 4; } - data_offset += popcnt(flags & 0x3000) * 4; - if(flags & 0x0010) { sidx = dv.getUint32(data_offset, true); data_offset += 4; } - if(flags & 0x0020) { ieee = dv.getFloat64(data_offset, true); data_offset += 8; } - if(flags & 0x0040) { dt.setTime(dt.getTime() + dv.getFloat64(data_offset, true) * 1000); data_offset += 8; } - - var ret: CellObject; - switch(ctype) { - case 0: break; // return { t: "z" }; // blank? - case 2: ret = { t: "n", v: ieee }; break; // number - case 3: ret = { t: "s", v: sst[sidx] }; break; // string - case 5: ret = { t: "d", v: dt }; break; // date-time - case 6: ret = { t: "b", v: ieee > 0 }; break; // boolean - case 7: ret = { t: "n", v: ieee }; break; // duration in seconds TODO: emit [hh]:[mm] style format with adjusted value - case 8: ret = { t: "e", v: 0}; break; // "formula error" TODO: enumerate and map errors to csf equivalents - case 9: { // "automatic"? 
- if(ridx > -1) ret = { t: "s", v: rsst[ridx] }; - else if(sidx > -1) ret = { t: "s", v: sst[sidx] }; - else if(!isNaN(ieee)) ret = { t: "n", v: ieee }; - else throw new Error(`Unsupported cell type ${buf.slice(0,4)}`); - } break; - default: throw new Error(`Unsupported cell type ${buf.slice(0,4)}`); - } - /* TODO: Some fields appear after the cell data */ - - return ret; -} - -function parse(buf: Uint8Array, sst: string[], rsst: string[]): CellObject { - var version = buf[0]; // numbers 3.5 uses "3", 6.x - 11.x use "4" - switch(version) { - case 3: case 4: return parseit(buf, sst, rsst, version); - default: throw new Error(`Unsupported pre-BNC version ${version}`); - } -} - -export { parse }; \ No newline at end of file diff --git a/modules/src/util.ts b/modules/src/util.ts index 8f3ecca..1929389 100644 --- a/modules/src/util.ts +++ b/modules/src/util.ts @@ -45,3 +45,12 @@ var popcnt = (x: number): number => { return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >>> 24; }; export { popcnt }; + +/* Used in the modern cell storage */ +var readDecimal128LE = (buf: Uint8Array, offset: number): number => { + var exp = ((buf[offset + 15] & 0x7F) << 7) | (buf[offset + 14] >> 1); + var mantissa = buf[offset + 14] & 1; + for(var j = offset + 13; j >= offset; --j) mantissa = mantissa * 256 + buf[j]; + return ((buf[offset+15] & 0x80) ? -mantissa : mantissa) * Math.pow(10, exp - 0x1820); +}; +export { readDecimal128LE };