From a77f991b86cf1abd473adafcd448862a42965ba8 Mon Sep 17 00:00:00 2001 From: SheetJS Date: Sun, 30 Oct 2022 20:58:49 -0400 Subject: [PATCH] stream --- .gitignore | 1 + .spelling | 5 + docz/docs/03-demos/07-worker.md | 131 +++++++- docz/docs/03-demos/08-stream.md | 354 ++++++++++++++++++++ docz/docs/03-demos/19-bundler.md | 4 +- docz/docs/03-demos/33-localfile.md | 3 +- docz/docs/06-solutions/05-output.md | 99 +----- docz/docs/07-csf/07-features/01-formulae.md | 8 +- docz/docs/08-api/05-parse-options.md | 2 +- docz/docs/09-miscellany/02-errors.md | 4 +- docz/package.json | 12 +- 11 files changed, 503 insertions(+), 120 deletions(-) create mode 100644 docz/docs/03-demos/08-stream.md diff --git a/.gitignore b/.gitignore index 60d61c8..5f4b662 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.*.sw* *.bak package-lock.json pnpm-lock.yaml diff --git a/.spelling b/.spelling index 2a16e2d..0513a27 100644 --- a/.spelling +++ b/.spelling @@ -76,6 +76,7 @@ WK3 WK4 WKS WK_ +WPS WQ WQ1 WQ2 @@ -97,6 +98,7 @@ macrosheets tooltip tooltips 标文通 +电子表格 # Other terms 1.x @@ -117,6 +119,7 @@ BOM Base64 Base64-encoded Big5 +BitBucket Booleans Browserify Bundlers @@ -151,6 +154,8 @@ ExpressJS ExtendScript Fastify FileReader +FileReaderSync +FileSaver GBK GatsbyJS GitLab diff --git a/docz/docs/03-demos/07-worker.md b/docz/docs/03-demos/07-worker.md index d7c7439..425c0ff 100644 --- a/docz/docs/03-demos/07-worker.md +++ b/docz/docs/03-demos/07-worker.md @@ -30,7 +30,7 @@ For production use, it is highly encouraged to download and host the script. :::note Browser Compatibility ESM is supported in Web Workers in the Chromium family of browsers (including -Chrome and Edge) as well as in Webkit-based browsers (including Safari). +Chrome and Edge) as well as in browsers powered by WebKit (including Safari). For support in legacy browsers like Firefox, `importScripts` should be used. 
@@ -102,7 +102,7 @@ self.addEventListener('message', async(e) => { const ab = await res.arrayBuffer(); /* Parse file */ - const wb = XLSX.read(ab); + const wb = XLSX.read(ab, {dense: true}); const ws = wb.Sheets[wb.SheetNames[0]]; /* Generate HTML */ @@ -162,7 +162,7 @@ self.addEventListener('message', async(e) => { SheetJS,in,Web,Workers வணக்கம்,สวัสดี,你好,가지마 1,2,3,4\`; - const wb = XLSX.read(csv, { type: "string" }); + const wb = XLSX.read(csv, { type: "string", dense: true }); /* Write XLSB data (Uint8Array) */ const u8 = XLSX.write(wb, { bookType: "xlsb", type: "buffer" }); @@ -234,7 +234,7 @@ self.addEventListener('message', (e) => { const ab = new FileReaderSync().readAsArrayBuffer(e.data.file); /* Parse file */ - const wb = XLSX.read(ab); + const wb = XLSX.read(ab, {dense: true}); const ws = wb.Sheets[wb.SheetNames[0]]; /* Generate HTML */ @@ -264,4 +264,125 @@ self.addEventListener('message', (e) => {
); } -``` \ No newline at end of file +``` + +## Streaming Write + +A more general discussion, including row-oriented processing demos, is included +in the ["Large Datasets"](/docs/demos/stream#browser) demo. + +#### File System Access API + +:::note + +At the time of writing, the File System Access API is only available in Chromium +and Chromium-based browsers like Chrome and Edge. + +::: + +The following live demo fetches and parses a file in a Web Worker. The script: + +- prompts user to save file (`window.showSaveFilePicker` in the main thread) +- passes the URL and the file object to the Web Worker +- loads the SheetJS library in the Web Worker +- fetches the requested URL and parses the workbook from the Worker +- creates a Writable Stream from the file object. +- uses `XLSX.stream.to_csv` to generate CSV rows of the first worksheet + + on each row, the data is written to the file stream + + every 10th row, a progress message is sent back to the main thread + + at the end, a completion message is sent back to the main thread + +The demo has a URL input box. Feel free to change the URL. For example, + +`https://raw.githubusercontent.com/SheetJS/test_files/master/large_strings.xls` +is an XLS file over 50 MB + +`https://raw.githubusercontent.com/SheetJS/libreoffice_test-files/master/calc/xlsx-import/perf/8-by-300000-cells.xlsx` +is an XLSX file with 300000 rows (approximately 20 MB) + +```jsx live +function SheetJSFetchCSVStreamFile() { + const [state, setState] = React.useState(""); + const [cnt, setCnt] = React.useState(0); + const [url, setUrl] = React.useState("https://oss.sheetjs.com/test_files/large_strings.xlsx"); + + return ( <> + URL: setUrl(e.target.value)} size="80"/> + +
State: {state}
Number of rows: {cnt}
+ ); +} +``` diff --git a/docz/docs/03-demos/08-stream.md b/docz/docs/03-demos/08-stream.md new file mode 100644 index 0000000..f8ee8b2 --- /dev/null +++ b/docz/docs/03-demos/08-stream.md @@ -0,0 +1,354 @@ +--- +title: Large Datasets +--- + +For maximal compatibility, the library reads entire files at once and generates +files at once. Browsers and other JS engines enforce tight memory limits. In +these cases, the library offers strategies to optimize for memory or space by +using platform-specific APIs. + +## Dense Mode + +The `dense` option (supported in `read`, `readFile` and `aoa_to_sheet`) creates +worksheet objects that use arrays of arrays under the hood: + +```js +var dense_wb = XLSX.read(ab, {dense: true}); + +var dense_sheet = XLSX.utils.aoa_to_sheet(aoa, {dense: true}); +``` + +
Historical Note (click to show) + +The earliest versions of the library aimed for IE6+ compatibility. In early +testing, both in Chrome 26 and in IE6, the most efficient worksheet storage for +small sheets was a large object whose keys were cell addresses. + +Over time, V8 (the engine behind Chrome and NodeJS) evolved in a way that made +the array of arrays approach more efficient but reduced the performance of the +large object approach. + +In the interest of preserving backwards compatibility, the library opts to make +the array of arrays approach available behind a special `dense` option. + +
+ +The various API functions will seamlessly handle dense and sparse worksheets. + +## Streaming Write + +The streaming write functions are available in the `XLSX.stream` object. They +take the same arguments as the normal write functions: + +- `XLSX.stream.to_csv` is the streaming version of `XLSX.utils.sheet_to_csv`. +- `XLSX.stream.to_html` is the streaming version of `XLSX.utils.sheet_to_html`. +- `XLSX.stream.to_json` is the streaming version of `XLSX.utils.sheet_to_json`. + +"Stream" refers to the NodeJS push streams API. + +
Historical Note (click to show) + +NodeJS push streams were introduced in 2012. + +The first streaming write function, `to_csv`, was introduced in April 2017. It +used and still uses the same NodeJS streaming API. + +Years later, browser vendors are settling on a different stream API. + +For maximal compatibility, the library uses NodeJS push streams. + +
+ +### NodeJS + +:::note + +In a CommonJS context, NodeJS Streams and `fs` immediately work with SheetJS: + +```js +const XLSX = require("xlsx"); // "just works" +``` + +In NodeJS ESM, the dependency must be loaded manually: + +```js +import * as XLSX from 'xlsx'; +import { Readable } from 'stream'; + +XLSX.stream.set_readable(Readable); // manually load stream helpers +``` + +Additionally, for file-related operations in NodeJS ESM, `fs` must be loaded: + +```js +import * as XLSX from 'xlsx'; +import * as fs from 'fs'; + +XLSX.set_fs(fs); // manually load fs helpers +``` + +**It is strongly encouraged to use CommonJS in NodeJS whenever possible.** + +::: + +This example reads a workbook passed as an argument to the script, pulls the +first worksheet, converts to CSV and writes to `out.csv`: + +```js +var XLSX = require("xlsx"), fs = require("fs"); +var workbook = XLSX.readFile(process.argv[2]); +var worksheet = workbook.Sheets[workbook.SheetNames[0]]; +// highlight-next-line +var stream = XLSX.stream.to_csv(worksheet); + +var output_file_name = "out.csv"; +// highlight-next-line +stream.pipe(fs.createWriteStream(output_file_name)); +``` + +`stream.to_json` uses Object-mode streams. A `Transform` stream can be used to +generate a normal stream for streaming to a file or the screen: + +```js +var XLSX = require("xlsx"), Transform = require("stream").Transform; +var workbook = XLSX.readFile(process.argv[2], {dense: true}); +var worksheet = workbook.Sheets[workbook.SheetNames[0]]; +/* to_json returns an object-mode stream */ +// highlight-next-line +var stream = XLSX.stream.to_json(worksheet, {raw:true}); + +/* this Transform stream converts JS objects to text and prints to screen */ +var conv = new Transform({writableObjectMode:true}); +conv._transform = function(obj, e, cb){ cb(null, JSON.stringify(obj) + "\n"); }; +conv.pipe(process.stdout); + +// highlight-next-line +stream.pipe(conv); +``` + +### Browser + +
Live Demo (click to show) + +The following live demo fetches and parses a file in a Web Worker. The `to_csv` +streaming function is used to generate CSV rows and pass back to the main thread +for further processing. + +:::note + +For Chromium browsers, the File System Access API provides a modern worker-only +approach. [The Web Workers demo](/docs/demos/worker#streaming-write) includes a +live example of CSV streaming write. + +::: + +The demo has a URL input box. Feel free to change the URL. For example, + +`https://raw.githubusercontent.com/SheetJS/test_files/master/large_strings.xls` +is an XLS file over 50 MB + +`https://raw.githubusercontent.com/SheetJS/libreoffice_test-files/master/calc/xlsx-import/perf/8-by-300000-cells.xlsx` +is an XLSX file with 300000 rows (approximately 20 MB) + +```jsx live +function SheetJSFetchCSVStreamWorker() { + const [__html, setHTML] = React.useState(""); + const [state, setState] = React.useState(""); + const [cnt, setCnt] = React.useState(0); + const [url, setUrl] = React.useState("https://oss.sheetjs.com/test_files/large_strings.xlsx"); + + return ( <> + URL: setUrl(e.target.value)} size="80"/> + +
State: {state}
Number of rows: {cnt}
+
+   );
+}
+```
+
+
+ +NodeJS streaming APIs are not available in the browser. The following function +supplies a pseudo stream object compatible with the `to_csv` function: + +```js +function sheet_to_csv_cb(ws, cb, opts, batch = 1000) { + XLSX.stream.set_readable(() => ({ + __done: false, + // this function will be assigned by the SheetJS stream methods + _read: function() { this.__done = true; }, + // this function is called by the stream methods + push: function(d) { if(!this.__done) cb(d); if(d == null) this.__done = true; }, + resume: function pump() { for(var i = 0; i < batch && !this.__done; ++i) this._read(); if(!this.__done) setTimeout(pump.bind(this), 0); } + })); + return XLSX.stream.to_csv(ws, opts); +} + +// assuming `workbook` is a workbook, stream the first sheet +const ws = workbook.Sheets[workbook.SheetNames[0]]; +const strm = sheet_to_csv_cb(ws, (csv)=>{ if(csv != null) console.log(csv); }); +strm.resume(); +``` + +#### Web Workers + +For processing large files in the browser, it is strongly encouraged to use Web +Workers. The [Worker demo](/docs/demos/worker#streaming-write) includes examples +using the File System Access API. + +Typically, the file and stream processing occurs in the Web Worker. 
CSV rows +can be sent back to the main thread in the callback: + +```js title="worker.js" +/* load standalone script from CDN */ +importScripts("https://cdn.sheetjs.com/xlsx-latest/package/dist/xlsx.full.min.js"); + +function sheet_to_csv_cb(ws, cb, opts, batch = 1000) { + XLSX.stream.set_readable(() => ({ + __done: false, + // this function will be assigned by the SheetJS stream methods + _read: function() { this.__done = true; }, + // this function is called by the stream methods + push: function(d) { if(!this.__done) cb(d); if(d == null) this.__done = true; }, + resume: function pump() { for(var i = 0; i < batch && !this.__done; ++i) this._read(); if(!this.__done) setTimeout(pump.bind(this), 0); } + })); + return XLSX.stream.to_csv(ws, opts); +} + +/* this callback will run once the main context sends a message */ +self.addEventListener('message', async(e) => { + try { + postMessage({state: "fetching " + e.data.url}); + /* Fetch file */ + const res = await fetch(e.data.url); + const ab = await res.arrayBuffer(); + + /* Parse file */ + postMessage({state: "parsing"}); + const wb = XLSX.read(ab, {dense: true}); + const ws = wb.Sheets[wb.SheetNames[0]]; + + /* Generate CSV rows */ + postMessage({state: "csv"}); + const strm = sheet_to_csv_cb(ws, (csv) => { + if(csv != null) postMessage({csv}); + else postMessage({state: "done"}); + }); + strm.resume(); + } catch(e) { + /* Pass the error message back */ + postMessage({error: String(e.message || e) }); + } +}, false); +``` + +The main thread will receive messages with CSV rows for further processing: + +```js +worker.onmessage = function(e) { + if(e.data.error) { console.error(e.data.error); /* show an error message */ } + else if(e.data.state) { console.info(e.data.state); /* current state */ } + else { + /* e.data.csv is the row generated by the stream */ + console.log(e.data.csv); + } +}; +``` + +### Deno + +Deno does not support NodeJS streams in normal execution, so a wrapper is used. 
+This example fetches and prints CSV rows: + +```ts title="sheet2csv.ts" +// @deno-types="https://cdn.sheetjs.com/xlsx-latest/package/types/index.d.ts" +import { stream, Sheet2CSVOpts, WorkSheet } from 'https://cdn.sheetjs.com/xlsx-latest/package/xlsx.mjs'; + +interface Resumable { resume:()=>void; }; +/* Generate row strings from a worksheet */ +function sheet_to_csv_cb(ws: WorkSheet, cb:(d:string|null)=>void, opts: Sheet2CSVOpts = {}, batch = 1000): Resumable { + stream.set_readable(() => ({ + __done: false, + // this function will be assigned by the SheetJS stream methods + _read: function() { this.__done = true; }, + // this function is called by the stream methods + push: function(d: any) { if(!this.__done) cb(d); if(d == null) this.__done = true; }, + resume: function pump() { for(var i = 0; i < batch && !this.__done; ++i) this._read(); if(!this.__done) setTimeout(pump.bind(this), 0); } + })); + return stream.to_csv(ws, opts) as Resumable; +} + +/* Callback invoked on each row (string) and at the end (null) */ +const csv_cb = (d:string|null) => { + if(d == null) return; + /* The strings include line endings, so raw write ops should be used */ + Deno.stdout.write(new TextEncoder().encode(d)); +}; + +/* Fetch https://sheetjs.com/pres.numbers, parse, and get first worksheet */ +import { read } from 'https://cdn.sheetjs.com/xlsx-latest/package/xlsx.mjs'; +const ab = await (await fetch("https://sheetjs.com/pres.numbers")).arrayBuffer(); +const wb = read(ab, { dense: true }); +const ws = wb.Sheets[wb.SheetNames[0]]; + +/* Create and start CSV stream */ +sheet_to_csv_cb(ws, csv_cb).resume(); +``` diff --git a/docz/docs/03-demos/19-bundler.md b/docz/docs/03-demos/19-bundler.md index 83fefa5..ab3f5a6 100644 --- a/docz/docs/03-demos/19-bundler.md +++ b/docz/docs/03-demos/19-bundler.md @@ -1061,8 +1061,8 @@ Access http://localhost:8080 in your web browser. 
:::note -The [Vite section of the Content demo](/docs/demos/content#vitejs) covers SheetJS-powered -asset loaders, suitable for static sites pulling data from fixed spreadsheets. +The [Vite section of the Content demo](/docs/demos/content#vitejs) covers asset +loaders. They are ideal for static sites pulling data from sheets at build time. ::: diff --git a/docz/docs/03-demos/33-localfile.md b/docz/docs/03-demos/33-localfile.md index 1a4759c..1915664 100644 --- a/docz/docs/03-demos/33-localfile.md +++ b/docz/docs/03-demos/33-localfile.md @@ -76,8 +76,7 @@ self.addEventListener('message', (e) => {
IE10 Binary Strings (click to show) -In IE10, binary strings are more performant than `ArrayBuffer`. `XLSX.read` -supports binary strings with `type: "binary"`: +`XLSX.read` supports binary strings with `type: "binary"`: ```js // usage: file_bs_to_wb(file, function(wb) { /* wb is a workbook object */ }); diff --git a/docz/docs/06-solutions/05-output.md b/docz/docs/06-solutions/05-output.md index 6106bd2..260d80f 100644 --- a/docz/docs/06-solutions/05-output.md +++ b/docz/docs/06-solutions/05-output.md @@ -840,103 +840,6 @@ Readable Stream. - `XLSX.stream.to_html` is the streaming version of `XLSX.utils.sheet_to_html`. - `XLSX.stream.to_json` is the streaming version of `XLSX.utils.sheet_to_json`. - - - -:::note - -In a CommonJS context, NodeJS Streams and `fs` immediately work with SheetJS: - -```js -const XLSX = require("xlsx"); // "just works" -``` - -In NodeJS ESM, the dependency must be loaded manually: - -```js -import * as XLSX from 'xlsx'; -import { Readable } from 'stream'; - -XLSX.stream.set_readable(Readable); // manually load stream helpers -``` - -Additionally, for file-related operations in NodeJS ESM, `fs` must be loaded: - -```js -import * as XLSX from 'xlsx'; -import * as fs from 'fs'; - -XLSX.set_fs(fs); // manually load fs helpers -``` - -**It is strongly encouraged to use CommonJS in NodeJS whenever possible.** - -::: - -This example reads a worksheet passed as an argument to the script, pulls the -first worksheet, converts to CSV and writes to `out.csv`: - -```js -const workbook = XLSX.readFile(process.argv[2]); -const worksheet = workbook.Sheets[workbook.SheetNames[0]]; -// highlight-next-line -const stream = XLSX.stream.to_csv(worksheet); - -const output_file_name = "out.csv"; -// highlight-next-line -stream.pipe(fs.createWriteStream(output_file_name)); -``` - -`stream.to_json` uses Object-mode streams. 
A `Transform` stream can be used to -generate a normal stream for streaming to a file or the screen: - -```js -/* to_json returns an object-mode stream */ -// highlight-next-line -var stream = XLSX.stream.to_json(worksheet, {raw:true}); - -/* this Transform stream converts JS objects to text and prints to screen */ -var conv = new Transform({writableObjectMode:true}); -conv._transform = function(obj, e, cb){ cb(null, JSON.stringify(obj) + "\n"); }; -conv.pipe(process.stdout); - -// highlight-next-line -stream.pipe(conv); -``` - - - - -Deno does not support NodeJS streams in normal execution, so a wrapper is used. -This demo converts a worksheet to CSV and prints each row to the screen: - -```ts -// @deno-types="https://cdn.sheetjs.com/xlsx-latest/package/types/index.d.ts" -import {utils, stream, set_cptable} from 'https://cdn.sheetjs.com/xlsx-latest/package/xlsx.mjs'; - -/* `Readable` will be compatible with how SheetJS uses `stream.Readable` */ -function NodeReadableCB(cb:(d:any)=>void) { - var rd = { - __done: false, - _read: function() {}, - push: function(d: any) { if(!this.__done) cb(d); if(d == null) this.__done = true; }, - resume: function pump() {for(var i = 0; i < 10000 && !this.__done; ++i) rd._read(); if(!rd.__done) setTimeout(pump, 0); } - }; - return rd; -} -function NodeReadable(rd: any) { return function() { return rd; }; } -/* The callback gets each CSV row. It will be `null` when the stream is drained */ -const rt = NodeReadableCB((d: any) => { if(d != null) console.log(d); }); -const Readable = NodeReadable(rt); -stream.set_readable(Readable); - -/* wire up and start the stream */ -const rd = stream.to_csv(worksheet); -rd.resume(); -``` - - - - +Examples are included in ["Large Datasets"](/docs/demos/stream#streaming-write) pipes write streams to nodejs response. 
diff --git a/docz/docs/07-csf/07-features/01-formulae.md b/docz/docs/07-csf/07-features/01-formulae.md index a50d5dd..bf4fed7 100644 --- a/docz/docs/07-csf/07-features/01-formulae.md +++ b/docz/docs/07-csf/07-features/01-formulae.md @@ -599,7 +599,7 @@ Z.TEST In some cases, seemingly valid formulae may be rejected by spreadsheet software. -`EVALUATE` unprefixed function is supported in WPS Office formulae. It is not -valid in a cell formula in Excel. It can be used in an Excel defined name when -exporting to XLSM format but not XLSX. This is a limitation of Excel. Since WPS -Office accepts files with `EVALUATE`, the writer does not warn or throw errors. \ No newline at end of file +`EVALUATE` is a supported function in WPS Office. It is not valid in a cell +formula in Excel. It can be used in an Excel defined name when exporting to XLSM +format but not XLSX. This is a limitation of Excel. Since WPS Office accepts +files with `EVALUATE`, the writer does not warn or throw errors. \ No newline at end of file diff --git a/docz/docs/08-api/05-parse-options.md b/docz/docs/08-api/05-parse-options.md index c2e3332..2c68d17 100644 --- a/docz/docs/08-api/05-parse-options.md +++ b/docz/docs/08-api/05-parse-options.md @@ -70,7 +70,7 @@ The read functions accept an options argument: errors on single worksheets, allowing you to read from the worksheets that do parse properly. Setting `WTF:true` forces those errors to be thrown. - By default, "sparse" mode worksheets are generated. Individual cells are - accessed by indexing the worksheet object with an A1-style address. "dense" + accessed by indexing the worksheet object with an A1-Style address. "dense" worksheets store cells in an array of arrays at `sheet["!data"]`. 
### Input Type diff --git a/docz/docs/09-miscellany/02-errors.md b/docz/docs/09-miscellany/02-errors.md index c268023..9570a49 100644 --- a/docz/docs/09-miscellany/02-errors.md +++ b/docz/docs/09-miscellany/02-errors.md @@ -37,8 +37,8 @@ manifest with error messages such as `Invalid string length`. There are memory bottlenecks associated with string addresses. A number of bugs have been reported to the V8 and Chromium projects on this subject. While those -bugs are being resolved, for sheets containing >100K rows, dense mode worksheets -should be used. +bugs are being resolved, for sheets containing hundreds of thousands of rows, +dense mode worksheets should be used.
diff --git a/docz/package.json b/docz/package.json index 8a23db2..7f230a7 100644 --- a/docz/package.json +++ b/docz/package.json @@ -15,11 +15,11 @@ }, "dependencies": { "@cmfcmf/docusaurus-search-local": "0.11.0", - "@docusaurus/core": "2.1.0", - "@docusaurus/plugin-client-redirects": "2.1.0", - "@docusaurus/preset-classic": "2.1.0", - "@docusaurus/theme-common": "2.1.0", - "@docusaurus/theme-live-codeblock": "2.1.0", + "@docusaurus/core": "2.2.0", + "@docusaurus/plugin-client-redirects": "2.2.0", + "@docusaurus/preset-classic": "2.2.0", + "@docusaurus/theme-common": "2.2.0", + "@docusaurus/theme-live-codeblock": "2.2.0", "@mdx-js/react": "1.6.22", "clsx": "1.2.1", "prism-react-renderer": "1.3.5", @@ -28,7 +28,7 @@ "xlsx": "https://cdn.sheetjs.com/xlsx-latest/xlsx-latest.tgz" }, "devDependencies": { - "@docusaurus/module-type-aliases": "2.1.0" + "@docusaurus/module-type-aliases": "2.2.0" }, "browserslist": { "production": [