This commit is contained in:
SheetJS 2023-10-22 21:20:18 -04:00
parent 35d76f9a62
commit 66c787222e
14 changed files with 483 additions and 119 deletions

@ -156,7 +156,7 @@ export class AppComponent {
:::note
This demo was last run on 2023-07-24 using Angular CLI `16.1.5`
This demo was last run on 2023-10-22 using Angular CLI `16.2.7`
:::
@ -169,7 +169,7 @@ npx @angular/cli analytics disable -g
1) Create a new project:
```bash
npx @angular/cli new --minimal --defaults --no-interactive sheetjs-angular
npx @angular/cli@16.2.7 new --minimal --defaults --no-interactive sheetjs-angular
```
2) Install the SheetJS dependency and start the dev server:
@ -184,10 +184,11 @@ npm start`}
3) Open a web browser and access the displayed URL (`http://localhost:4200`)
4) Replace `src/app/app.component.ts` with the code snippet.
4) Replace `src/app/app.component.ts` with the previous code snippet.
The page will refresh and show a table with an Export button. Click the button
and the page will attempt to download `SheetJSAngularAoO.xlsx`.
The page will refresh and show a table with an Export button. Click the button
and the page will attempt to download `SheetJSAngularAoO.xlsx`. Open the file
with a spreadsheet editor.
5) Stop the dev server and build the site:
@ -263,7 +264,7 @@ export class AppComponent {
:::note
This demo was last run on 2023-07-24 using Angular CLI `16.1.5`
This demo was last run on 2023-10-22 using Angular CLI `16.2.7`
:::
@ -276,7 +277,7 @@ npx @angular/cli analytics disable -g
1) Create a new project:
```bash
npx @angular/cli new --minimal --defaults --no-interactive sheetjs-angular
npx @angular/cli@16.2.7 new --minimal --defaults --no-interactive sheetjs-angular
```
2) Install the SheetJS dependency and start the dev server:
@ -291,10 +292,11 @@ npm start`}
3) Open a web browser and access the displayed URL (`http://localhost:4200`)
4) Replace `src/app/app.component.ts` with the code snippet.
4) Replace `src/app/app.component.ts` with the previous code snippet.
The page will refresh and show a table with an Export button. Click the button
and the page will attempt to download `SheetJSAngularHTML.xlsx`.
The page will refresh and show a table with an Export button. Click the button
and the page will attempt to download `SheetJSAngularHTML.xlsx`. Open the file
with a spreadsheet editor.
5) Stop the dev server and build the site:

@ -40,7 +40,7 @@ import { read, utils, writeFileXLSX } from 'xlsx';
:::warning Parcel Bug
Errors of the form `Could not statically evaluate fs call` stem from a Parcel
bug. Upgrade to Parcel version 1.5.0 or later.
bug[^1]. Upgrade to Parcel version 1.5.0 or later.
:::
@ -164,3 +164,5 @@ npx http-server dist
Access the displayed URL (typically `http://localhost:8080/`) in a web browser.
Click on "Click here to export" to generate a file.
[^1]: See [Issue 523 in the Parcel issue tracker](https://github.com/parcel-bundler/parcel/pull/523#issuecomment-357486164)

@ -2,7 +2,7 @@
title: Sheets in ExpressJS
sidebar_label: ExpressJS
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -2,7 +2,7 @@
title: Sheets in Drash
sidebar_label: Drash
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -2,7 +2,7 @@
title: Sheets in Elysia
sidebar_label: ElysiaJS
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -2,7 +2,7 @@
title: Sheets in NestJS
sidebar_label: NestJS
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -2,7 +2,7 @@
title: Sheets in FastifyJS
sidebar_label: FastifyJS
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -1,7 +1,7 @@
---
title: HTTP Server Processing
pagination_prev: demos/net/network
pagination_next: demos/net/email
pagination_next: demos/net/email/index
---
import current from '/version.js';

@ -0,0 +1,388 @@
---
title: Sheets in PST Mailboxes
sidebar_label: PST Mailboxes
pagination_prev: demos/net/server/index
pagination_next: demos/net/headless
---
import current from '/version.js';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';
<head>
<script src="/pst/pstextractor.js"></script>
</head>
PST (Personal Storage Table) is a common file format for storing messages.
Electronic discovery commonly involves extracting data from attached
spreadsheets in e-mail messages stored in PST archives.
`pst-extractor`[^1] is a NodeJS module designed for extracting objects from PST
files. It has been used to extract spreadsheets from the Enron Corpus[^2] and
other large mailboxes.
[SheetJS](https://sheetjs.com) is a JavaScript library for reading and writing
data from spreadsheets.
This demo uses `pst-extractor` and SheetJS to read spreadsheets. We'll explore
how to load SheetJS in a NodeJS script or website, extract spreadsheet files,
and generate HTML and CSV views of the underlying data.
The ["Live Demo"](#live-demo) reads PST files. Individual spreadsheets within
the file can be downloaded or previewed in the browser.
:::note
This demo was last tested on 2023 October 22 against `pst-extractor` 1.9.0
:::
## Overview
The [SheetJS NodeJS module](/docs/getting-started/installation/nodejs) can be
imported from scripts that use `pst-extractor`.
### Parsing PST Files
The `pst-extractor` module exposes a `PSTFile` class. The constructor requires a
proper NodeJS buffer.
The following snippet reads and parses `enron.pst` from the local filesystem.
`fs.readFileSync`[^3] accepts a filename and returns a Buffer:
```js
const fs = require("fs"), PSTExtractor = require("pst-extractor");
const file = fs.readFileSync("enron.pst");
const pst = new (PSTExtractor.PSTFile)(file);
```
### Walking the Tree
`pst-extractor` presents a tree-like structure to inspect the contents of the
PST file. It is recommended to use recursive functions to walk the tree.
The following tree walker will collect all XLSX and XLS attachments:
```js
/**
 * Walk the PST folder tree and append spreadsheet attachments to `arr`.
 *
 * Subfolders are visited first (depth-first), then the messages of the folder
 * itself. An attachment is collected when its file name ends with `.xls`,
 * `.xlsx`, `.xlsm` or `.xlsb`.
 *
 * @param {object} f   folder object (the root comes from `pst.getRootFolder()`)
 * @param {Array}  arr output array; matching attachments are pushed in place
 */
function walk(f,arr) {
  if(f.hasSubfolders) for(const sf of f.getSubFolders()) walk(sf,arr);
  if(f.contentCount <= 0) return;
  /* `getNextChild` acts as a cursor: keep pulling until it yields null */
  for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
    for(let i = 0; i < e.numberOfAttachments; ++i) {
      const a = e.getAttachment(i);
      /* XLS spreadsheet test by filename. The dot must be escaped: a bare `.`
         matches any character, so names like "notxls" would be collected */
      if(/\.xls[xmb]?$/.test(a.filename)) arr.push(a);
    }
  }
}
/* generate a list of attachments: start with an empty array and let `walk`
   append every matching attachment found under the root folder */
const files = [];
walk(pst.getRootFolder(), files);
```
### Generating Buffers
The `PSTAttachment` class holds attachment metadata. To avoid loading everything
in memory, the raw data is exposed as a custom stream object. Since the SheetJS
`read` function requires data in a `Buffer` or `Uint8Array`, a helper function
is used to collect the data:
```js
/**
 * Read the full contents of an attachment into a newly allocated Buffer.
 *
 * @param {object} file attachment object exposing a `fileInputStream`
 * @returns {Buffer} buffer that the stream's `readCompletely` has filled
 */
function collect(file) {
  const { fileInputStream: stream } = file;
  /* the stream reports its size through `_length.low` */
  const size = stream._length.low;
  const bytes = Buffer.alloc(size);
  stream.readCompletely(bytes);
  return bytes;
}
/* collect data from the first attachment */
const buf0 = collect(files[0]);
```
### Processing Attachments
Given a NodeJS Buffer, the SheetJS `read` method[^4] parses the data and returns
a workbook object[^5]. Individual worksheets can be extracted from the workbook
and converted to CSV[^6] or HTML[^7].
The following example prints the contents of each worksheet in CSV form:
```js
const XLSX = require("xlsx");
/* parse the workbook collected from the first attachment (`buf0`) */
const wb = XLSX.read(buf0);
/* print the CSV contents of each worksheet, preceded by a header line */
wb.SheetNames.forEach(n => {
  const ws = wb.Sheets[n];
  const csv = XLSX.utils.sheet_to_csv(ws);
  /* `buf0` was collected from `files[0]`, so report that attachment's name
     (`file` in the earlier snippet is the raw PST data, not an attachment) */
  console.log(`#### ${files[0].filename} ! ${n}`);
  console.log(csv);
});
```
### Browser Caveats
The [SheetJS Standalone scripts](/docs/getting-started/installation/standalone)
can be loaded through a `SCRIPT` tag.
This demo uses [a special `pst-extractor` build](#browser-build) for the web.
Compared to the NodeJS build, browser scripts require special Buffer wrappers.
For example, the following function will fail since the library does not support
`ArrayBuffer` objects:
```js
/* counter-example: the raw ArrayBuffer is handed directly to `PSTFile` */
async function error_fetch_and_parse_pst(url) {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // this will throw an error
  return new PSTExtractor.PSTFile(arrayBuffer);
}
```
The browser build exposes the `Buffer` object in the `PSTExtractor` global:
```js
/* download `url`, wrap the bytes in the exposed `Buffer`, then parse the PST */
async function correct_fetch_and_parse_pst(url) {
  const response = await fetch(url);
  const arrayBuffer = await response.arrayBuffer();
  // highlight-next-line
  const wrapped = new PSTExtractor.Buffer(arrayBuffer);
  return new PSTExtractor.PSTFile(wrapped);
}
```
### Browser Build
The `pst-extractor` library is designed for NodeJS. Parts of the library expect
a NodeJS `Buffer`, which does not exist in the browser. A fake `Buffer` can be
added and exposed in a script.
[`pstextractor.js`](pathname:///pst/pstextractor.js) is loaded in the demo page.
<details><summary><b>Build instructions</b> (click to show)</summary>
1) Initialize a new NodeJS project and install the dependency:
```bash
mkdir pstextract
cd pstextract
npm init -y
npm i --save pst-extractor@1.9.0
```
2) Save the following to `shim.js`:
```js title="shim.js"
const PSTExtractor = require("pst-extractor");
module.exports = PSTExtractor;
module.exports.Buffer = Buffer;
```
3) Build the script:
```bash
npx browserify@17.0.0 -s PSTExtractor -o pstextractor.js shim.js
```
</details>
## Demos
### NodeJS
This demo will fetch a [test PST](pathname:///pst/enron.pst) and extract all
embedded spreadsheets. The script can be adapted to read local PST files or pull
PST files from a different URL.
0) Initialize a new project:
```bash
mkdir sheetjs-pst
cd sheetjs-pst
npm init -y
```
1) Install the SheetJS NodeJS module and `pst-extractor`:
<Tabs groupId="pm">
<TabItem value="npm" label="npm">
<CodeBlock language="bash">{`\
npm i --save https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
</CodeBlock>
</TabItem>
<TabItem value="pnpm" label="pnpm">
<CodeBlock language="bash">{`\
pnpm install https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
</CodeBlock>
</TabItem>
<TabItem value="yarn" label="Yarn" default>
<CodeBlock language="bash">{`\
yarn add https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
</CodeBlock>
</TabItem>
</Tabs>
2) Download [`SheetJSPST.js`](pathname:///pst/SheetJSPST.js) into project folder:
```bash
curl -LO https://docs.sheetjs.com/pst/SheetJSPST.js
```
3) Run the script:
```bash
node SheetJSPST.js
```
The process will fetch [the test PST](pathname:///pst/enron.pst) and extract
the embedded spreadsheets. The terminal will display info on the exported files.
:::note pass
Lines starting with `saving file` show how attachments correspond to files. The
following line states that the first attachment (index `0`) was originally named
`RedRockA.xls` and was saved to `file0.xls` on the file system:
```
saving file 0 |RedRockA.xls| to file0.xls
```
Lines starting with `####` show the attachment file name and the worksheet name.
The following line explains that there is a worksheet named `"Oct 26, 2001"` in
the file `RedRockA.xls`:
```
#### RedRockA.xls ! Oct 26, 2001
```
Every other line is a CSV row from the named worksheet. For example, the first
four lines of worksheet `"Oct 26, 2001"` in `RedRockA.xls` are shown below:
```text
#### RedRockA.xls ! Oct 26, 2001
// highlight-start
RED ROCK EXPANSION PROJECT,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,
,,,, , , ,,,,,,,,,,,,
SHIPPER,CONTRACT #,Term,MMBtu/d,RECEIPT POINT,DELIVERY POINT,MMBtu/d,,,,,,,,,,,,
// highlight-end
```
:::
### Live Demo
This demo reads PST mailboxes. Due to browser limitations, PST files larger than
100 MB may crash the browser.
After parsing the PST file, the "Attachments" table will list attached XLSX and
XLS spreadsheets in the file. The "preview" link will display an HTML table with
the data in the spreadsheet. The "download" link will download the attachment.
The [test file](pathname:///pst/enron.pst) was based on the EDRM clean extract
from the "Enron Corpus" and includes a few XLS attachments.
:::caution pass
If the live demo shows a message
```
Please reload the page
```
please refresh the page. This is a known bug in the documentation generator.
:::
```jsx live
function SheetJSPreviewPSTSheets() {
const [ files, setFiles ] = React.useState([]);
const [ __html, setHTML ] = React.useState("");
/* recursively walk PST and collect attachments */
const walk = (f,arr) => {
if(f.hasSubfolders) for(let sf of f.getSubFolders()) walk(sf,arr);
if(f.contentCount <= 0) return;
for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
for(let i = 0; i < e.numberOfAttachments; ++i) {
var a = e.getAttachment(i);
/* XLS spreadsheet test by filename */
if(/.xls[xmb]?$/.test(a.filename)) arr.push(a);
}
}
}
/* collect data from the attachment into a "Buffer" */
const collect = (j) => {
const strm = files[j].fileInputStream;
const data = new PSTExtractor.Buffer(strm._length.low);
strm.readCompletely(data);
return data;
}
/* view selected attachment */
const view = (j) => {
const data = collect(j);
/* parse */
const wb = XLSX.read(data);
/* convert first sheet to HTML */
const ws = wb.Sheets[wb.SheetNames[0]];
setHTML(XLSX.utils.sheet_to_html(ws));
}
/* process array buffer */
const process_ab = (ab) => {
const pst = new (PSTExtractor.PSTFile)(new PSTExtractor.Buffer(ab));
const data = [];
walk(pst.getRootFolder(), data);
setFiles(data);
};
/* on click, fetch and process file */
const doit = async() => {
const ab = await (await fetch("/pst/enron.pst")).arrayBuffer();
process_ab(ab);
};
const chg = async(e) => process_ab(await e.target.files[0].arrayBuffer());
/* download selected attachment */
const dl = (j) => {
const a = document.createElement("a");
a.download = files[j].filename;
a.href = URL.createObjectURL(new Blob([collect(j)]));
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
if(typeof PSTExtractor == "undefined") return <b>Please reload the page</b>;
return ( <>
<p>Use the file input to select a file, or click "Use a Sample PST"</p>
<input type="file" accept=".pst" onChange={chg}/>
<button onClick={doit}>Use a Sample PST!</button><br/><br/>
<table><thead><th colspan="3">Attachments</th></thead>
<tbody>{files.map((f,j) => (
<tr key={j}><th>{f.filename}</th>
<td><a onClick={()=>view(j)}>(preview)</a></td>
<td><a onClick={()=>dl(j)}>(download)</a></td>
</tr>
))}</tbody>
</table>
<b>Preview of first worksheet</b><br/>
<div dangerouslySetInnerHTML={{__html}}></div>
</> );
}
```
[^1]: The project has no official website. The official [repository](https://github.com/epfromer/pst-extractor) is hosted on GitHub.
[^2]: Extracted spreadsheets are [available on GitHub](https://github.com/SheetJS/enron_xls)
[^3]: See [`fs.readFileSync`](https://nodejs.org/api/fs.html#fsreadfilesyncpath-options) in the NodeJS documentation
[^4]: See [`read` in "Reading Files"](/docs/api/parse-options)
[^5]: See ["Workbook Object"](/docs/csf/book)
[^6]: See [`sheet_to_csv` in "CSV and Text"](/docs/api/utilities/csv#delimiter-separated-output)
[^7]: See [`sheet_to_html` in "Utilities"](/docs/api/utilities/html#html-table-output)

@ -0,0 +1,4 @@
{
"label": "Electronic Mail",
"position": 4
}

@ -1,16 +1,12 @@
---
title: Electronic Mail
pagination_prev: demos/net/server/index
pagination_next: demos/net/headless
---
import current from '/version.js';
import CodeBlock from '@theme/CodeBlock';
<head>
<script src="/pst/pstextractor.js"></script>
</head>
Electronic mail ("email" or "e-mail") is an essential part of modern business
workflows. Spreadsheets are commonly passed around and processed.
@ -367,98 +363,4 @@ proprietary mail and email account file formats.
### PST
`PST` is a common file format. The `pst-extractor` library is designed for
extracting messages and attachments from `PST` files in NodeJS and the browser.
This demo uses [a special build](pathname:///pst/pstextractor.js) for the web.
<details><summary><b>Build details</b> (click to show)</summary>
1) Initialize a new NodeJS project and install the dependency:
```bash
mkdir pstextract
cd pstextract
npm init -y
npm i --save pst-extractor@1.9.0
```
2) Save the following to `shim.js`:
```js title="shim.js"
const PSTExtractor = require("pst-extractor");
module.exports = PSTExtractor;
module.exports.Buffer = Buffer;
```
3) Build the script:
```bash
npx browserify@17.0.0 -s PSTExtractor -o pstextractor.js shim.js
```
</details>
The [test file](pathname:///pst/enron.pst) was based on the EDRM clean extract
from the "Enron Corpus" and includes a few XLS attachments.
```jsx live
/* demo component: load a PST file, list `.xls` attachments, and render the
   first worksheet of a clicked attachment as an HTML table */
function SheetJSPreviewPSTSheets() {
/* `files` holds the collected attachments; `__html` holds the table markup */
const [ files, setFiles ] = React.useState([]);
const [ __html, setHTML ] = React.useState("");
/* recursively walk PST and collect attachments */
const walk = (f,arr) => {
/* visit subfolders first, then the messages in this folder */
if(f.hasSubfolders) for(let sf of f.getSubFolders()) walk(sf,arr);
if(f.contentCount > 0) for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
for(var i = 0; i < e.numberOfAttachments; ++i) {
var a = e.getAttachment(i);
/* XLS spreadsheet test by filename */
if(a.filename.endsWith(".xls")) arr.push(a);
}
}
}
/* view selected attachment */
const view = (j) => {
/* collect data into a "Buffer" */
const strm = files[j].fileInputStream;
const data = new PSTExtractor.Buffer(strm._length.low);
strm.readCompletely(data);
/* parse */
const wb = XLSX.read(data);
/* convert first sheet to HTML */
const ws = wb.Sheets[wb.SheetNames[0]];
setHTML(XLSX.utils.sheet_to_html(ws));
}
/* process array buffer */
const process_ab = (ab) => {
/* wrap the ArrayBuffer in the exposed Buffer before parsing */
const pst = new (PSTExtractor.PSTFile)(new PSTExtractor.Buffer(ab));
const data = [];
walk(pst.getRootFolder(), data);
setFiles(data);
};
/* on click, fetch and process file */
const doit = async() => {
const ab = await (await fetch("/pst/enron.pst")).arrayBuffer();
process_ab(ab);
};
/* on file selection, read the first chosen file and process it */
const chg = async(e) => process_ab(await e.target.files[0].arrayBuffer());
return ( <>
<p>Use the file input to select a file, or click "Use a Sample PST"</p>
<button onClick={doit}>Use a Sample PST!</button><br/><br/>
<input type="file" accept=".pst" onChange={chg}/><br/>
<b>Attachments</b>
<ul>{files.map((f,j) => (
<li key={j}><a onClick={()=>view(j)}>{f.filename} (click to view)</a></li>
))}</ul>
<b>Table View</b><br/>
<div dangerouslySetInnerHTML={{__html}}></div>
</> );
}
**[The exposition has been moved to a separate page.](/docs/demos/net/email/pst)**

@ -1,5 +1,6 @@
---
title: Browser Automation
pagination_prev: demos/net/email/index
---
import current from '/version.js';

@ -1,9 +1,9 @@
---
title: Hyperlinks and Tooltips
sidebar_label: Hyperlinks
sidebar_position: 3
---
# Hyperlinks
<details>
<summary><b>File Format Support</b> (click to show)</summary>
@ -254,6 +254,20 @@ XLSX documents. A workaround was added in library version 0.18.12.
:::
## Tooltips
Tooltips are attached to hyperlink information. There is no way to specify a
tooltip without assigning a cell link.
:::warning pass
**Excel has an undocumented tooltip length limit of 255 characters.**
Writing longer tooltips is currently permitted by the library but the generated
files will not open in Excel.
:::
## HTML
The HTML DOM parser[^1] will process `<a>` links in the table.

@ -0,0 +1,51 @@
const fs = require("fs");
const PSTExtractor = require("pst-extractor");
const XLSX = require("xlsx");
/**
 * Walk the PST folder tree and append spreadsheet attachments to `arr`.
 *
 * Subfolders are visited first (depth-first), then the messages of the folder
 * itself. An attachment is collected when its file name ends with `.xls`,
 * `.xlsx`, `.xlsm` or `.xlsb`.
 *
 * @param {object} f   folder object (the root comes from `pst.getRootFolder()`)
 * @param {Array}  arr output array; matching attachments are pushed in place
 */
function walk(f,arr) {
  if(f.hasSubfolders) for(const sf of f.getSubFolders()) walk(sf,arr);
  if(f.contentCount <= 0) return;
  /* `getNextChild` acts as a cursor: keep pulling until it yields null */
  for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
    for(let i = 0; i < e.numberOfAttachments; ++i) {
      const a = e.getAttachment(i);
      /* XLS spreadsheet test by filename. The dot must be escaped: a bare `.`
         matches any character, so names like "notxls" would be collected and
         later saved with a bogus extension */
      if(/\.xls[xmb]?$/.test(a.filename)) arr.push(a);
    }
  }
}
/**
 * Read the full contents of an attachment into a newly allocated Buffer.
 *
 * @param {object} file attachment object exposing a `fileInputStream`
 * @returns {Buffer} buffer that the stream's `readCompletely` has filled
 */
function collect(file) {
  const { fileInputStream: stream } = file;
  /* the stream reports its size through `_length.low` */
  const size = stream._length.low;
  const bytes = Buffer.alloc(size);
  stream.readCompletely(bytes);
  return bytes;
}
/* main script: download the test PST, save every spreadsheet attachment to the
   current directory, and print each worksheet as CSV */
(async() => {
  /* fetch https://docs.sheetjs.com/pst/enron.pst */
  const res = await fetch("https://docs.sheetjs.com/pst/enron.pst");
  /* `fetch` resolves even for HTTP error statuses; stop here instead of
     handing an error page to the PST parser */
  if(!res.ok) throw new Error(`fetch failed: HTTP ${res.status} ${res.statusText}`);
  const ab = await res.arrayBuffer();
  const pst = new (PSTExtractor.PSTFile)(Buffer.from(ab));

  /* generate a list of attachments */
  const files = [];
  walk(pst.getRootFolder(), files);

  files.forEach((file, idx) => {
    /* extract and save workbook to file */
    const ext = file.filename.slice(file.filename.lastIndexOf(".") + 1);
    console.log(`saving file ${idx} |${file.filename}| to file${idx}.${ext}`);
    const buf = collect(file);
    fs.writeFileSync(`file${idx}.${ext}`, buf);

    /* parse workbook and print CSV contents of each sheet */
    const wb = XLSX.read(buf);
    wb.SheetNames.forEach(n => {
      const ws = wb.Sheets[n];
      const csv = XLSX.utils.sheet_to_csv(ws);
      console.log(`#### ${file.filename} ! ${n}`);
      console.log(csv);
    });
  });
})().catch((err) => {
  /* report the failure explicitly and signal it through the exit code */
  console.error(err);
  process.exitCode = 1;
});