pst
This commit is contained in:
parent
35d76f9a62
commit
66c787222e
@ -156,7 +156,7 @@ export class AppComponent {
|
||||
|
||||
:::note
|
||||
|
||||
This demo was last run on 2023-07-24 using Angular CLI `16.1.5`
|
||||
This demo was last run on 2023-10-22 using Angular CLI `16.2.7`
|
||||
|
||||
:::
|
||||
|
||||
@ -169,7 +169,7 @@ npx @angular/cli analytics disable -g
|
||||
1) Create a new project:
|
||||
|
||||
```bash
|
||||
npx @angular/cli new --minimal --defaults --no-interactive sheetjs-angular
|
||||
npx @angular/cli@16.2.7 new --minimal --defaults --no-interactive sheetjs-angular
|
||||
```
|
||||
|
||||
2) Install the SheetJS dependency and start the dev server:
|
||||
@ -184,10 +184,11 @@ npm start`}
|
||||
|
||||
3) Open a web browser and access the displayed URL (`http://localhost:4200`)
|
||||
|
||||
4) Replace `src/app/app.component.ts` with the code snippet.
|
||||
4) Replace `src/app/app.component.ts` with the previous code snippet.
|
||||
|
||||
The page will refresh and show a table with an Export button. Click the button
|
||||
and the page will attempt to download `SheetJSAngularAoO.xlsx`.
|
||||
The page will refresh and show a table with an Export button. Click the button
|
||||
and the page will attempt to download `SheetJSAngularAoO.xlsx`. Open the file
|
||||
with a spreadsheet editor.
|
||||
|
||||
5) Stop the dev server and build the site:
|
||||
|
||||
@ -263,7 +264,7 @@ export class AppComponent {
|
||||
|
||||
:::note
|
||||
|
||||
This demo was last run on 2023-07-24 using Angular CLI `16.1.5`
|
||||
This demo was last run on 2023-10-22 using Angular CLI `16.2.7`
|
||||
|
||||
:::
|
||||
|
||||
@ -276,7 +277,7 @@ npx @angular/cli analytics disable -g
|
||||
1) Create a new project:
|
||||
|
||||
```bash
|
||||
npx @angular/cli new --minimal --defaults --no-interactive sheetjs-angular
|
||||
npx @angular/cli@16.2.7 new --minimal --defaults --no-interactive sheetjs-angular
|
||||
```
|
||||
|
||||
2) Install the SheetJS dependency and start the dev server:
|
||||
@ -291,10 +292,11 @@ npm start`}
|
||||
|
||||
3) Open a web browser and access the displayed URL (`http://localhost:4200`)
|
||||
|
||||
4) Replace `src/app/app.component.ts` with the code snippet.
|
||||
4) Replace `src/app/app.component.ts` with the previous code snippet.
|
||||
|
||||
The page will refresh and show a table with an Export button. Click the button
|
||||
and the page will attempt to download `SheetJSAngularHTML.xlsx`.
|
||||
The page will refresh and show a table with an Export button. Click the button
|
||||
and the page will attempt to download `SheetJSAngularHTML.xlsx`. Open the file
|
||||
with a spreadsheet editor.
|
||||
|
||||
5) Stop the dev server and build the site:
|
||||
|
||||
|
@ -40,7 +40,7 @@ import { read, utils, writeFileXLSX } from 'xlsx';
|
||||
:::warning Parcel Bug
|
||||
|
||||
Errors of the form `Could not statically evaluate fs call` stem from a Parcel
|
||||
bug. Upgrade to Parcel version 1.5.0 or later.
|
||||
bug[^1]. Upgrade to Parcel version 1.5.0 or later.
|
||||
|
||||
:::
|
||||
|
||||
@ -164,3 +164,5 @@ npx http-server dist
|
||||
|
||||
Access the displayed URL (typically `http://localhost:8080/`) in a web browser.
|
||||
Click on "Click here to export" to generate a file.
|
||||
|
||||
[^1]: See [Issue 523 in the Parcel issue tracker](https://github.com/parcel-bundler/parcel/pull/523#issuecomment-357486164)
|
@ -2,7 +2,7 @@
|
||||
title: Sheets in ExpressJS
|
||||
sidebar_label: ExpressJS
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: Sheets in Drash
|
||||
sidebar_label: Drash
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: Sheets in Elysia
|
||||
sidebar_label: ElysiaJS
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: Sheets in NestJS
|
||||
sidebar_label: NestJS
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: Sheets in FastifyJS
|
||||
sidebar_label: FastifyJS
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -1,7 +1,7 @@
|
||||
---
|
||||
title: HTTP Server Processing
|
||||
pagination_prev: demos/net/network
|
||||
pagination_next: demos/net/email
|
||||
pagination_next: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
388
docz/docs/03-demos/03-net/04-email/11-pst.md
Normal file
388
docz/docs/03-demos/03-net/04-email/11-pst.md
Normal file
@ -0,0 +1,388 @@
|
||||
---
|
||||
title: Sheets in PST Mailboxes
|
||||
sidebar_label: PST Mailboxes
|
||||
pagination_prev: demos/net/server/index
|
||||
pagination_next: demos/net/headless
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
import CodeBlock from '@theme/CodeBlock';
|
||||
|
||||
<head>
|
||||
<script src="/pst/pstextractor.js"></script>
|
||||
</head>
|
||||
|
||||
PST (Personal Storage Table) is a common file format for storing messages.
|
||||
Electronic discovery commonly involves extracting data from attached
|
||||
spreadsheets in e-mail messages stored in PST archives.
|
||||
|
||||
`pst-extractor`[^1] is a NodeJS module designed for extracting objects from PST
|
||||
files. It has been used to extract spreadsheets from the Enron Corpus[^2] and
|
||||
other large mailboxes.
|
||||
|
||||
[SheetJS](https://sheetjs.com) is a JavaScript library for reading and writing
|
||||
data from spreadsheets.
|
||||
|
||||
This demo uses `pst-extractor` and SheetJS to read spreadsheets. We'll explore
|
||||
how to load SheetJS in a NodeJS script or website, extract spreadsheet files,
|
||||
and generate HTML and CSV views of the underlying data.
|
||||
|
||||
The ["Live Demo"](#live-demo) reads PST files. Individual spreadsheets within
|
||||
the file can be downloaded or previewed in the browser.
|
||||
|
||||
:::note
|
||||
|
||||
This demo was last tested on 2023 October 22 against `pst-extractor` 1.9.0
|
||||
|
||||
:::
|
||||
|
||||
## Overview
|
||||
|
||||
The [SheetJS NodeJS module](/docs/getting-started/installation/nodejs) can be
|
||||
imported from scripts that use `pst-extractor`.
|
||||
|
||||
### Parsing PST Files
|
||||
|
||||
The `pst-extractor` module exposes a `PSTFile` class. The constructor requires a
|
||||
proper NodeJS buffer.
|
||||
|
||||
The following snippet reads and parses `enron.pst` from the local filesystem.
|
||||
`fs.readFileSync`[^3] accepts a filename and returns a Buffer:
|
||||
|
||||
```js
|
||||
const fs = require("fs"), PSTExtractor = require("pst-extractor");
|
||||
const file = fs.readFileSync("enron.pst");
|
||||
const pst = new (PSTExtractor.PSTFile)(file);
|
||||
```
|
||||
|
||||
### Walking the Tree
|
||||
|
||||
`pst-extractor` presents a tree-like structure to inspect the contents of the
|
||||
PST file. It is recommended to use recursive functions to walk the tree.
|
||||
|
||||
The following tree walker will collect all XLSX and XLS attachments:
|
||||
|
||||
```js
|
||||
/**
 * Walk a PST folder tree and add all spreadsheet attachments to the array.
 *
 * @param {object} f   folder object, e.g. the value of `pst.getRootFolder()`
 * @param {Array}  arr array that receives the matching attachment objects
 */
function walk(f, arr) {
  /* descend into subfolders before looking at this folder's own items */
  if(f.hasSubfolders) for(const sf of f.getSubFolders()) walk(sf, arr);
  if(f.contentCount <= 0) return;
  /* getNextChild hands back one item per call; the loop ends on null/undefined */
  for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
    for(let i = 0; i < e.numberOfAttachments; ++i) {
      const a = e.getAttachment(i);
      /* spreadsheet test by filename: the dot is escaped so only a real
         .xls / .xlsx / .xlsm / .xlsb extension matches, in any letter case */
      if(/\.xls[xmb]?$/i.test(a.filename)) arr.push(a);
    }
  }
}
|
||||
|
||||
/* generate a list of attachments */
|
||||
const files = [];
|
||||
walk(pst.getRootFolder(), files);
|
||||
```
|
||||
|
||||
### Generating Buffers
|
||||
|
||||
The `PSTAttachment` class holds attachment metadata. To avoid loading everything
|
||||
in memory, the raw data is exposed as a custom stream object. Since the SheetJS
|
||||
`read` function requires data in a `Buffer` or `Uint8Array`, a helper function
|
||||
is used to collect the data:
|
||||
|
||||
```js
|
||||
/**
 * Read the whole attachment stream into a freshly allocated "Buffer".
 *
 * @param {object} file attachment object exposing `fileInputStream`
 * @returns {Buffer} buffer that was passed to the stream's `readCompletely`
 */
function collect(file) {
  const { fileInputStream: stream } = file;
  /* size the buffer from the stream's recorded length */
  const size = stream._length.low;
  const out = Buffer.alloc(size);
  stream.readCompletely(out);
  return out;
}
|
||||
|
||||
/* collect data from the first attachment */
|
||||
const buf0 = collect(files[0]);
|
||||
```
|
||||
|
||||
### Processing Attachments
|
||||
|
||||
Given a NodeJS Buffer, the SheetJS `read` method[^4] parses the data and returns
|
||||
a workbook object[^5]. Individual worksheets can be extracted from the workbook
|
||||
and converted to CSV[^6] or HTML[^7].
|
||||
|
||||
The following example prints the contents of each worksheet in CSV form:
|
||||
|
||||
```js
|
||||
const XLSX = require("xlsx");

/* parse workbook and print CSV contents of each sheet */
const wb = XLSX.read(buf0);
wb.SheetNames.forEach(n => {
  const ws = wb.Sheets[n];
  const csv = XLSX.utils.sheet_to_csv(ws);
  /* `buf0` was collected from `files[0]`, so label the output with that
     attachment's name (`file` is the raw PST buffer and has no filename) */
  console.log(`#### ${files[0].filename} ! ${n}`);
  console.log(csv);
});
|
||||
```
|
||||
|
||||
### Browser Caveats
|
||||
|
||||
The [SheetJS Standalone scripts](/docs/getting-started/installation/standalone)
|
||||
can be loaded through a `SCRIPT` tag.
|
||||
|
||||
This demo uses [a special `pst-extractor` build](#browser-build) for the web.
|
||||
|
||||
Compared to the NodeJS build, browser scripts require special Buffer wrappers.
|
||||
For example, the following function will fail since the library does not support
|
||||
`ArrayBuffer` objects:
|
||||
|
||||
```js
|
||||
/* counter-example: hands the fetched ArrayBuffer directly to the PSTFile constructor */
async function error_fetch_and_parse_pst(url) {
  const response = await fetch(url);
  const raw = await response.arrayBuffer();
  // this will throw an error
  return new (PSTExtractor.PSTFile)(raw);
}
|
||||
```
|
||||
|
||||
The browser build exposes the `Buffer` object in the `PSTExtractor` global:
|
||||
|
||||
```js
|
||||
/* fetch a PST file and wrap the bytes in the Buffer exposed by the browser build before parsing */
async function correct_fetch_and_parse_pst(url) {
  const response = await fetch(url);
  const raw = await response.arrayBuffer();
  // highlight-next-line
  const wrapped = new PSTExtractor.Buffer(raw);
  return new (PSTExtractor.PSTFile)(wrapped);
}
|
||||
```
|
||||
|
||||
### Browser Build
|
||||
|
||||
The `pst-extractor` library is designed for NodeJS. Parts of the library expect
|
||||
a NodeJS `Buffer`, which does not exist in the browser. A fake `Buffer` can be
|
||||
added and exposed in a script.
|
||||
|
||||
[`pstextractor.js`](pathname:///pst/pstextractor.js) is loaded in the demo page.
|
||||
|
||||
<details><summary><b>Build instructions</b> (click to show)</summary>
|
||||
|
||||
1) Initialize a new NodeJS project and install the dependency:
|
||||
|
||||
```bash
|
||||
mkdir pstextract
|
||||
cd pstextract
|
||||
npm init -y
|
||||
npm i --save pst-extractor@1.9.0
|
||||
```
|
||||
|
||||
2) Save the following to `shim.js`:
|
||||
|
||||
```js title="shim.js"
|
||||
/* shim.js: re-export the pst-extractor module with `Buffer` attached */
const PSTExtractor = require("pst-extractor");
|
||||
module.exports = PSTExtractor;
|
||||
/* expose `Buffer` as a property of the exported module object */
module.exports.Buffer = Buffer;
|
||||
```
|
||||
|
||||
3) Build the script:
|
||||
|
||||
```bash
|
||||
npx browserify@17.0.0 -s PSTExtractor -o pstextractor.js shim.js
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## Demos
|
||||
|
||||
### NodeJS
|
||||
|
||||
This demo will fetch a [test PST](pathname:///pst/enron.pst) and extract all
|
||||
embedded spreadsheets. The script can be adapted to read local PST files or pull
|
||||
PST files from a different URL.
|
||||
|
||||
0) Initialize a new project:
|
||||
|
||||
```bash
|
||||
mkdir sheetjs-pst
|
||||
cd sheetjs-pst
|
||||
npm init -y
|
||||
```
|
||||
|
||||
1) Install the SheetJS NodeJS module and `pst-extractor`:
|
||||
|
||||
<Tabs groupId="pm">
|
||||
<TabItem value="npm" label="npm">
|
||||
<CodeBlock language="bash">{`\
|
||||
npm i --save https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
|
||||
</CodeBlock>
|
||||
</TabItem>
|
||||
<TabItem value="pnpm" label="pnpm">
|
||||
<CodeBlock language="bash">{`\
|
||||
pnpm install https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
|
||||
</CodeBlock>
|
||||
</TabItem>
|
||||
<TabItem value="yarn" label="Yarn" default>
|
||||
<CodeBlock language="bash">{`\
|
||||
yarn add https://cdn.sheetjs.com/xlsx-${current}/xlsx-${current}.tgz pst-extractor`}
|
||||
</CodeBlock>
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
2) Download [`SheetJSPST.js`](pathname:///pst/SheetJSPST.js) into the project folder:
|
||||
|
||||
```bash
|
||||
curl -LO https://docs.sheetjs.com/pst/SheetJSPST.js
|
||||
```
|
||||
|
||||
3) Run the script:
|
||||
|
||||
```bash
|
||||
node SheetJSPST.js
|
||||
```
|
||||
|
||||
The process will fetch [the test PST](pathname:///pst/enron.pst) and extract
|
||||
the embedded spreadsheets. The terminal will display info on the exported files.
|
||||
|
||||
:::note pass
|
||||
|
||||
Lines starting with `saving file` show how attachments correspond to files. The
|
||||
following line states that the first attachment (index `0`) was originally named
|
||||
`RedRockA.xls` and was saved to `file0.xls` on the file system:
|
||||
|
||||
```
|
||||
saving file 0 |RedRockA.xls| to file0.xls
|
||||
```
|
||||
|
||||
Lines starting with `####` show the attachment file name and the worksheet name.
|
||||
The following line explains that there is a worksheet named `"Oct 26, 2001"` in
|
||||
the file `RedRockA.xls`:
|
||||
|
||||
```
|
||||
#### RedRockA.xls ! Oct 26, 2001
|
||||
```
|
||||
|
||||
Every other line is a CSV row from the named worksheet. For example, the first
|
||||
four lines of worksheet `"Oct 26, 2001"` in `RedRockA.xls` are shown below:
|
||||
|
||||
```text
|
||||
#### RedRockA.xls ! Oct 26, 2001
|
||||
// highlight-start
|
||||
RED ROCK EXPANSION PROJECT,,,,,,,,,,,,,,,,,,
|
||||
,,,,,,,,,,,,,,,,,,
|
||||
,,,, , , ,,,,,,,,,,,,
|
||||
SHIPPER,CONTRACT #,Term,MMBtu/d,RECEIPT POINT,DELIVERY POINT,MMBtu/d,,,,,,,,,,,,
|
||||
// highlight-end
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
### Live Demo
|
||||
|
||||
This demo reads PST mailboxes. Due to browser limitations, PST files larger than
|
||||
100 MB may crash the browser.
|
||||
|
||||
After parsing the PST file, the "Attachments" table will list attached XLSX and
|
||||
XLS spreadsheets in the file. The "preview" link will display an HTML table with
|
||||
the data in the spreadsheet. The "download" link will download the attachment.
|
||||
|
||||
The [test file](pathname:///pst/enron.pst) was based on the EDRM clean extract
|
||||
from the "Enron Corpus" and includes a few XLS attachments.
|
||||
|
||||
:::caution pass
|
||||
|
||||
If the live demo shows a message
|
||||
|
||||
```
|
||||
Please reload the page
|
||||
```
|
||||
|
||||
please refresh the page. This is a known bug in the documentation generator.
|
||||
|
||||
:::
|
||||
|
||||
```jsx live
|
||||
function SheetJSPreviewPSTSheets() {
|
||||
const [ files, setFiles ] = React.useState([]);
|
||||
const [ __html, setHTML ] = React.useState("");
|
||||
|
||||
/* recursively walk PST and collect attachments */
|
||||
const walk = (f,arr) => {
|
||||
if(f.hasSubfolders) for(let sf of f.getSubFolders()) walk(sf,arr);
|
||||
if(f.contentCount <= 0) return;
|
||||
for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
|
||||
for(let i = 0; i < e.numberOfAttachments; ++i) {
|
||||
var a = e.getAttachment(i);
|
||||
/* XLS spreadsheet test by filename */
|
||||
if(/.xls[xmb]?$/.test(a.filename)) arr.push(a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* collect data from the attachment into a "Buffer" */
|
||||
const collect = (j) => {
|
||||
const strm = files[j].fileInputStream;
|
||||
const data = new PSTExtractor.Buffer(strm._length.low);
|
||||
strm.readCompletely(data);
|
||||
return data;
|
||||
}
|
||||
|
||||
/* view selected attachment */
|
||||
const view = (j) => {
|
||||
const data = collect(j);
|
||||
|
||||
/* parse */
|
||||
const wb = XLSX.read(data);
|
||||
|
||||
/* convert first sheet to HTML */
|
||||
const ws = wb.Sheets[wb.SheetNames[0]];
|
||||
setHTML(XLSX.utils.sheet_to_html(ws));
|
||||
}
|
||||
|
||||
/* process array buffer */
|
||||
const process_ab = (ab) => {
|
||||
const pst = new (PSTExtractor.PSTFile)(new PSTExtractor.Buffer(ab));
|
||||
const data = [];
|
||||
walk(pst.getRootFolder(), data);
|
||||
setFiles(data);
|
||||
};
|
||||
|
||||
/* on click, fetch and process file */
|
||||
const doit = async() => {
|
||||
const ab = await (await fetch("/pst/enron.pst")).arrayBuffer();
|
||||
process_ab(ab);
|
||||
};
|
||||
const chg = async(e) => process_ab(await e.target.files[0].arrayBuffer());
|
||||
|
||||
/* download selected attachment */
|
||||
const dl = (j) => {
|
||||
const a = document.createElement("a");
|
||||
a.download = files[j].filename;
|
||||
a.href = URL.createObjectURL(new Blob([collect(j)]));
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
}
|
||||
|
||||
if(typeof PSTExtractor == "undefined") return <b>Please reload the page</b>;
|
||||
return ( <>
|
||||
<p>Use the file input to select a file, or click "Use a Sample PST"</p>
|
||||
<input type="file" accept=".pst" onChange={chg}/>
|
||||
<button onClick={doit}>Use a Sample PST!</button><br/><br/>
|
||||
<table><thead><th colspan="3">Attachments</th></thead>
|
||||
<tbody>{files.map((f,j) => (
|
||||
<tr key={j}><th>{f.filename}</th>
|
||||
<td><a onClick={()=>view(j)}>(preview)</a></td>
|
||||
<td><a onClick={()=>dl(j)}>(download)</a></td>
|
||||
</tr>
|
||||
))}</tbody>
|
||||
</table>
|
||||
<b>Preview of first worksheet</b><br/>
|
||||
<div dangerouslySetInnerHTML={{__html}}></div>
|
||||
</> );
|
||||
}
|
||||
```
|
||||
|
||||
[^1]: The project has no official website. The official [repository](https://github.com/epfromer/pst-extractor) is hosted on GitHub.
|
||||
[^2]: Extracted spreadsheets are [available on GitHub](https://github.com/SheetJS/enron_xls)
|
||||
[^3]: See [`fs.readFileSync`](https://nodejs.org/api/fs.html#fsreadfilesyncpath-options) in the NodeJS documentation
|
||||
[^4]: See [`read` in "Reading Files"](/docs/api/parse-options)
|
||||
[^5]: See ["Workbook Object"](/docs/csf/book)
|
||||
[^6]: See [`sheet_to_csv` in "CSV and Text"](/docs/api/utilities/csv#delimiter-separated-output)
|
||||
[^7]: See [`sheet_to_html` in "Utilities"](/docs/api/utilities/html#html-table-output)
|
4
docz/docs/03-demos/03-net/04-email/_category_.json
Normal file
4
docz/docs/03-demos/03-net/04-email/_category_.json
Normal file
@ -0,0 +1,4 @@
|
||||
{
|
||||
"label": "Electronic Mail",
|
||||
"position": 4
|
||||
}
|
@ -1,16 +1,12 @@
|
||||
---
|
||||
title: Electronic Mail
|
||||
pagination_prev: demos/net/server/index
|
||||
pagination_next: demos/net/headless
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
import CodeBlock from '@theme/CodeBlock';
|
||||
|
||||
|
||||
<head>
|
||||
<script src="/pst/pstextractor.js"></script>
|
||||
</head>
|
||||
|
||||
Electronic mail ("email" or "e-mail") is an essential part of modern business
|
||||
workflows. Spreadsheets are commonly passed around and processed.
|
||||
|
||||
@ -367,98 +363,4 @@ proprietary mail and email account file formats.
|
||||
|
||||
### PST
|
||||
|
||||
`PST` is a common file format. The `pst-extractor` library is designed for
|
||||
extracting messages and attachments from `PST` files in NodeJS and the browser.
|
||||
|
||||
This demo uses [a special build](pathname:///pst/pstextractor.js) for the web.
|
||||
|
||||
<details><summary><b>Build details</b> (click to show)</summary>
|
||||
|
||||
1) Initialize a new NodeJS project and install the dependency:
|
||||
|
||||
```bash
|
||||
mkdir pstextract
|
||||
cd pstextract
|
||||
npm init -y
|
||||
npm i --save pst-extractor@1.9.0
|
||||
```
|
||||
|
||||
2) Save the following to `shim.js`:
|
||||
|
||||
```js title="shim.js"
|
||||
const PSTExtractor = require("pst-extractor");
|
||||
module.exports = PSTExtractor;
|
||||
module.exports.Buffer = Buffer;
|
||||
```
|
||||
|
||||
3) Build the script:
|
||||
|
||||
```bash
|
||||
npx browserify@17.0.0 -s PSTExtractor -o pstextractor.js shim.js
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
The [test file](pathname:///pst/enron.pst) was based on the EDRM clean extract
|
||||
from the "Enron Corpus" and includes a few XLS attachments.
|
||||
|
||||
```jsx live
|
||||
function SheetJSPreviewPSTSheets() {
|
||||
const [ files, setFiles ] = React.useState([]);
|
||||
const [ __html, setHTML ] = React.useState("");
|
||||
|
||||
/* recursively walk PST and collect attachments */
|
||||
const walk = (f,arr) => {
|
||||
if(f.hasSubfolders) for(let sf of f.getSubFolders()) walk(sf,arr);
|
||||
if(f.contentCount > 0) for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
|
||||
for(var i = 0; i < e.numberOfAttachments; ++i) {
|
||||
var a = e.getAttachment(i);
|
||||
/* XLS spreadsheet test by filename */
|
||||
if(a.filename.endsWith(".xls")) arr.push(a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* view selected attachment */
|
||||
const view = (j) => {
|
||||
/* collect data into a "Buffer" */
|
||||
const strm = files[j].fileInputStream;
|
||||
const data = new PSTExtractor.Buffer(strm._length.low);
|
||||
strm.readCompletely(data);
|
||||
|
||||
/* parse */
|
||||
const wb = XLSX.read(data);
|
||||
|
||||
/* convert first sheet to HTML */
|
||||
const ws = wb.Sheets[wb.SheetNames[0]];
|
||||
setHTML(XLSX.utils.sheet_to_html(ws));
|
||||
}
|
||||
|
||||
/* process array buffer */
|
||||
const process_ab = (ab) => {
|
||||
const pst = new (PSTExtractor.PSTFile)(new PSTExtractor.Buffer(ab));
|
||||
const data = [];
|
||||
walk(pst.getRootFolder(), data);
|
||||
setFiles(data);
|
||||
};
|
||||
|
||||
|
||||
/* on click, fetch and process file */
|
||||
const doit = async() => {
|
||||
const ab = await (await fetch("/pst/enron.pst")).arrayBuffer();
|
||||
process_ab(ab);
|
||||
};
|
||||
const chg = async(e) => process_ab(await e.target.files[0].arrayBuffer());
|
||||
|
||||
return ( <>
|
||||
<p>Use the file input to select a file, or click "Use a Sample PST"</p>
|
||||
<button onClick={doit}>Use a Sample PST!</button><br/><br/>
|
||||
<input type="file" accept=".pst" onChange={chg}/><br/>
|
||||
<b>Attachments</b>
|
||||
<ul>{files.map((f,j) => (
|
||||
<li key={j}><a onClick={()=>view(j)}>{f.filename} (click to view)</a></li>
|
||||
))}</ul>
|
||||
<b>Table View</b><br/>
|
||||
<div dangerouslySetInnerHTML={{__html}}></div>
|
||||
</> );
|
||||
}
|
||||
**[The exposition has been moved to a separate page.](/docs/demos/net/email/pst)**
|
@ -1,5 +1,6 @@
|
||||
---
|
||||
title: Browser Automation
|
||||
pagination_prev: demos/net/email/index
|
||||
---
|
||||
|
||||
import current from '/version.js';
|
||||
|
@ -1,9 +1,9 @@
|
||||
---
|
||||
title: Hyperlinks and Tooltips
|
||||
sidebar_label: Hyperlinks
|
||||
sidebar_position: 3
|
||||
---
|
||||
|
||||
# Hyperlinks
|
||||
|
||||
<details>
|
||||
<summary><b>File Format Support</b> (click to show)</summary>
|
||||
|
||||
@ -254,6 +254,20 @@ XLSX documents. A workaround was added in library version 0.18.12.
|
||||
|
||||
:::
|
||||
|
||||
## Tooltips
|
||||
|
||||
Tooltips are attached to hyperlink information. There is no way to specify a
|
||||
tooltip without assigning a cell link.
|
||||
|
||||
:::warning pass
|
||||
|
||||
**Excel has an undocumented tooltip length limit of 255 characters.**
|
||||
|
||||
Writing longer tooltips is currently permitted by the library but the generated
|
||||
files will not open in Excel.
|
||||
|
||||
:::
|
||||
|
||||
## HTML
|
||||
|
||||
The HTML DOM parser[^1] will process `<a>` links in the table.
|
||||
|
51
docz/static/pst/SheetJSPST.js
Normal file
51
docz/static/pst/SheetJSPST.js
Normal file
@ -0,0 +1,51 @@
|
||||
const fs = require("fs");
|
||||
const PSTExtractor = require("pst-extractor");
|
||||
const XLSX = require("xlsx");
|
||||
|
||||
/**
 * Walk a PST folder tree and add all spreadsheet attachments to the array.
 *
 * @param {object} f   folder object, e.g. the value of `pst.getRootFolder()`
 * @param {Array}  arr array that receives the matching attachment objects
 */
function walk(f, arr) {
  /* descend into subfolders before looking at this folder's own items */
  if(f.hasSubfolders) for(const sf of f.getSubFolders()) walk(sf, arr);
  if(f.contentCount <= 0) return;
  /* getNextChild hands back one item per call; the loop ends on null/undefined */
  for(let e = f.getNextChild(); e != null; e = f.getNextChild()) {
    for(let i = 0; i < e.numberOfAttachments; ++i) {
      const a = e.getAttachment(i);
      /* spreadsheet test by filename: the dot is escaped so only a real
         .xls / .xlsx / .xlsm / .xlsb extension matches, in any letter case */
      if(/\.xls[xmb]?$/i.test(a.filename)) arr.push(a);
    }
  }
}
|
||||
|
||||
/**
 * Read the whole attachment stream into a freshly allocated "Buffer".
 *
 * @param {object} file attachment object exposing `fileInputStream`
 * @returns {Buffer} buffer that was passed to the stream's `readCompletely`
 */
function collect(file) {
  const { fileInputStream: stream } = file;
  /* size the buffer from the stream's recorded length */
  const size = stream._length.low;
  const out = Buffer.alloc(size);
  stream.readCompletely(out);
  return out;
}
|
||||
|
||||
/* fetch the sample PST, save every spreadsheet attachment and print its sheets as CSV */
(async() => {
  /* fetch https://docs.sheetjs.com/pst/enron.pst */
  const res = await fetch("https://docs.sheetjs.com/pst/enron.pst");
  /* an HTTP error page is not a PST file; stop before handing it to the parser */
  if(!res.ok) throw new Error(`fetch failed with HTTP status ${res.status}`);
  const ab = await res.arrayBuffer();
  const pst = new (PSTExtractor.PSTFile)(Buffer.from(ab));

  /* generate a list of attachments */
  const files = [];
  walk(pst.getRootFolder(), files);

  files.forEach((file, idx) => {
    /* extract and save workbook to file, reusing the attachment's extension */
    const ext = file.filename.slice(file.filename.lastIndexOf(".") + 1);
    console.log(`saving file ${idx} |${file.filename}| to file${idx}.${ext}`);
    const buf = collect(file);
    fs.writeFileSync(`file${idx}.${ext}`, buf);

    /* parse workbook and print CSV contents of each sheet */
    const wb = XLSX.read(buf);
    wb.SheetNames.forEach(n => {
      const ws = wb.Sheets[n];
      const csv = XLSX.utils.sheet_to_csv(ws);
      console.log(`#### ${file.filename} ! ${n}`);
      console.log(csv);
    });
  });
})().catch(err => {
  /* report the failure and signal it through the exit code */
  console.error(err);
  process.exitCode = 1;
});
|
Loading…
Reference in New Issue
Block a user