From 611704e001a3f8b281c2e16ae4f05bfa57241d8b Mon Sep 17 00:00:00 2001 From: SheetJS Date: Sat, 29 Jul 2023 23:17:31 -0400 Subject: [PATCH] pandas --- .../02-examples/04-import.md | 2 +- docz/docs/03-demos/01-frontend/index.md | 6 +- docz/docs/03-demos/02-grid/12-vtl.md | 4 +- docz/docs/03-demos/05-mobile/index.md | 2 +- docz/docs/03-demos/06-desktop/02-nwjs.md | 2 +- .../10-extensions/{05-osa.md => 06-osa.md} | 0 .../{06-mathematica.md => 09-mathematica.md} | 0 docz/docs/03-demos/12-engines/05-pandas.md | 437 ++++++++++++++++++ docz/static/pandas/SheetJSPandas.py | 41 ++ docz/static/pandas/sheetjs.py | 136 ++++++ 10 files changed, 622 insertions(+), 8 deletions(-) rename docz/docs/03-demos/10-extensions/{05-osa.md => 06-osa.md} (100%) rename docz/docs/03-demos/10-extensions/{06-mathematica.md => 09-mathematica.md} (100%) create mode 100644 docz/docs/03-demos/12-engines/05-pandas.md create mode 100644 docz/static/pandas/SheetJSPandas.py create mode 100644 docz/static/pandas/sheetjs.py diff --git a/docz/docs/02-getting-started/02-examples/04-import.md b/docz/docs/02-getting-started/02-examples/04-import.md index 8e77dbd..14aad26 100644 --- a/docz/docs/02-getting-started/02-examples/04-import.md +++ b/docz/docs/02-getting-started/02-examples/04-import.md @@ -171,7 +171,7 @@ To determine how to process the data, it is best to inspect the file first. ### List Sheet Names As explained in the "Workbook Object"[^4] section, the `SheetNames` property is -a ordered list of the sheets in the workbook. +a ordered list of the sheet names in the workbook. 
The following live code block displays an ordered list of the sheet names: diff --git a/docz/docs/03-demos/01-frontend/index.md b/docz/docs/03-demos/01-frontend/index.md index 841110f..3b7eb3a 100644 --- a/docz/docs/03-demos/01-frontend/index.md +++ b/docz/docs/03-demos/01-frontend/index.md @@ -36,13 +36,13 @@ built without frameworks, the framework ecosystems have battle-tested solutions for organizing data, page updates / routing, and other common problems. It is strongly recommended to stick with familiar frameworks. Teams well-versed -in Angular should continue using Angular. Teams well-versed in React should -continue using React. For common problems, there are official or community +in Angular should continue using Angular. Teams well-versed in ReactJS should +continue using ReactJS. For common problems, there are official or community solutions using any framework. Greenfield projects can be built with any framework. The popular frameworks have large ecosystems and many talented developers for hire. At the time of -writing, React has the largest developer pool and module ecosystem. +writing, ReactJS has the largest developer pool and module ecosystem. 
::: diff --git a/docz/docs/03-demos/02-grid/12-vtl.md b/docz/docs/03-demos/02-grid/12-vtl.md index a3359b4..918abce 100644 --- a/docz/docs/03-demos/02-grid/12-vtl.md +++ b/docz/docs/03-demos/02-grid/12-vtl.md @@ -9,8 +9,8 @@ import CodeBlock from '@theme/CodeBlock'; :::note -This demo was tested against `vue3-table-lite 1.2.4`, VueJS `3.2.47`, ViteJS -4.3.1, and `@vitejs/plugin-vue` 4.1.0 on 2023 April 24 +This demo was tested against `vue3-table-lite 1.2.4`, VueJS `3.3.4`, ViteJS +4.4.7, and `@vitejs/plugin-vue` 4.2.3 on 2023 July 27 ::: diff --git a/docz/docs/03-demos/05-mobile/index.md b/docz/docs/03-demos/05-mobile/index.md index 7a80273..8319f38 100644 --- a/docz/docs/03-demos/05-mobile/index.md +++ b/docz/docs/03-demos/05-mobile/index.md @@ -12,7 +12,7 @@ extensions and libraries to create a hybrid development experience. Developers well-versed in web technologies can now build actual mobile applications that run on iOS and Android! -:::warning +:::warning pass **The ecosystem has broken backwards-compatibility many times!** diff --git a/docz/docs/03-demos/06-desktop/02-nwjs.md b/docz/docs/03-demos/06-desktop/02-nwjs.md index ea581b1..9733276 100644 --- a/docz/docs/03-demos/06-desktop/02-nwjs.md +++ b/docz/docs/03-demos/06-desktop/02-nwjs.md @@ -109,7 +109,7 @@ input.click(); :::note -This demo was tested against NW.js 0.73.0 on 2023 February 20. +This demo was tested against NW.js 0.78.0 on 2023 July 27. 
::: diff --git a/docz/docs/03-demos/10-extensions/05-osa.md b/docz/docs/03-demos/10-extensions/06-osa.md similarity index 100% rename from docz/docs/03-demos/10-extensions/05-osa.md rename to docz/docs/03-demos/10-extensions/06-osa.md diff --git a/docz/docs/03-demos/10-extensions/06-mathematica.md b/docz/docs/03-demos/10-extensions/09-mathematica.md similarity index 100% rename from docz/docs/03-demos/10-extensions/06-mathematica.md rename to docz/docs/03-demos/10-extensions/09-mathematica.md diff --git a/docz/docs/03-demos/12-engines/05-pandas.md b/docz/docs/03-demos/12-engines/05-pandas.md new file mode 100644 index 0000000..3b98742 --- /dev/null +++ b/docz/docs/03-demos/12-engines/05-pandas.md @@ -0,0 +1,437 @@ +--- +title: Spreadsheet Data in Pandas +sidebar_label: Python (Pandas) +description: Process structured data in Python with Pandas. Seamlessly integrate spreadsheets into your workflow with SheetJS. Analyze complex Excel spreadsheets with confidence. +pagination_prev: demos/cloud/index +pagination_next: demos/bigdata/index +--- + +import current from '/version.js'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; + +Pandas[^1] is a Python software library for data analysis. + +[SheetJS](https://sheetjs.com) is a JavaScript library for reading and writing +data from spreadsheets. + +This demo uses SheetJS to process data from a spreadsheet and translate to the +Pandas DataFrame format. We'll explore how to load SheetJS from Python scripts, +generate DataFrames from workbooks, and write DataFrames back to workbooks. 
+ +:::note + +This demo was tested in the following deployments: + +| Architecture | V8 version | Pandas | Python | Date | +|:-------------|:--------------|:-------|:-------|:-----------| +| `darwin-x64` | `11.5.150.16` | 2.0.3 | 3.11.4 | 2023-07-29 | + +::: + +:::info pass + +Pandas includes limited support for reading spreadsheets (`pandas.read_excel`) +and writing XLSX spreadsheets (`pandas.DataFrame.to_excel`). + +The SheetJS approach supports many common spreadsheet formats that are not +supported by the current set of Pandas codecs and offers greater flexibility in +processing complex worksheets. + +::: + +## Integration Details + +JS code cannot literally be run in the Python interpreter. To run JS code from +Python, JavaScript engines[^2] can be embedded in CPython modules. + +### Loading SheetJS + +This demo uses the `STPyV8` module[^3] to access the V8 JavaScript engine. + +_Initialize V8_ + +The engine library provides a convenient context manager `JSContext` for context +resource management. Within the context, the `eval` method can evaluate code: + +```py +from STPyV8 import JSContext + +# Initialize JS context +with JSContext() as ctxt: + # Run code + res = ctxt.eval("'Sheet' + 'JS'") + + # print result + print(res) +``` + +`STPyV8` handles data interchange for common types. Arrays and JS objects can be +translated to Python `list` and `dict` respectively. The following `convert` +function is used in the test suite[^4]: + +```py +# from `tests/test_Wrapper.py` in the STPyV8 library +# License: Apache 2.0 +def convert(obj): + if isinstance(obj, JSArray): + return [convert(v) for v in obj] + if isinstance(obj, JSObject): + return dict([[str(k), convert(obj.__getattr__(str(k)))] for k in obj.__dir__()]) + return obj +``` + +_Loading the Library_ + +The [Standalone scripts](/docs/getting-started/installation/standalone) can be +parsed and evaluated from the JS engine. Once evaluated, the `XLSX` variable is +available as a global. 
+ +Assuming the standalone library is in the same directory as the source file, +the script can be evaluated with `eval`: + +```py + # Within a JSContext, open `xlsx.full.min.js` and evaluate + with open("xlsx.full.min.js") as f: + ctxt.eval(f.read()) +``` + +### Reading Files + +The following diagram depicts the spreadsheet salsa: + +```mermaid +flowchart LR + file[(workbook\nfile)] + subgraph SheetJS operations + base64(Base64\nstring) + wb((SheetJS\nWorkbook)) + aoo(array of\nobjects) + end + subgraph Pandas operations + lod(list of\nrecords) + df[(Pandas\nDataFrame)] + end + file --> |`open`/`read`\nPython ops| base64 + base64 --> |`XLSX.read`\nParse Bytes| wb + wb --> |`sheet_to_json`\nExtract Data| aoo + aoo --> |`convert`\nPython ops|lod + lod --> |`from_records`\nPandas ops| df +``` + +At a high level: + +1) Pure Python operations read the file and generate a Base64 string + +2) SheetJS libraries parse the string and generate JS records + +3) JS engine operations translate the rows to Python `list` of `dict`s + +4) Pandas operations translate the Python data to a DataFrame + +#### Read files + +The safest format for data interchange is Base64-encoded strings: + +```py +from base64 import b64encode + +with open(path, mode="rb") as f: + file_bytes = f.read() + b64 = b64encode(file_bytes) +``` + +#### Parse bytes + +From JS code, `XLSX.read`[^5] parses the Base64 string: + +```py +wb = ctxt.eval("(b64 => XLSX.read(b64, {type: 'base64', dense: true}))")(b64) +``` + +The `wb` object follows the "Common Spreadsheet Format"[^6], an in-memory format +for representing workbooks, worksheets, cells, and spreadsheet features. + +#### Get First Worksheet + +As explained in the "Workbook Object"[^7] section: +- the `SheetNames` property is an ordered list of the sheet names in the workbook +- the `Sheets` property of the workbook object is an object whose keys are sheet + names and whose values are sheet objects. 
+ +For use in Python, the `SheetNames` array must be converted to a `list`: + +```py +sheet_names = convert(wb.SheetNames) +first_sheet_name = sheet_names[0] +``` + +Since utility functions will process the worksheet object from JavaScript, it is +preferable not to convert the object: + +```py +first_sheet = wb.Sheets[first_sheet_name] # do not convert +``` + +#### Generate List of Records + +In JavaScript, the equivalent of the "`list` of `dict`s" or "`list` of records" +is "array of objects". They can be created with `XLSX.utils.sheet_to_json`[^8]: + +```py +rows = convert(ctxt.eval("(ws => XLSX.utils.sheet_to_json(ws))")(first_sheet)) +``` + +#### Generate Pandas DataFrame + +`rows` is a `list` of `dict` objects. `from_records`[^9] understands this data +shape and generates a proper DataFrame: + +```py +df = pd.DataFrame.from_records(rows) +``` + +### Writing Files + +The writing process looks similar to the reading process in reverse: + +```mermaid +flowchart LR + subgraph Pandas operations + df[(Pandas\nDataFrame)] + json(JSON\nString) + end + subgraph SheetJS operations + aoo(array of\nobjects) + wb((SheetJS\nWorkbook)) + base64(Base64\nstring) + end + file[(workbook\nfile)] + df --> |`to_json`\nPandas ops| json + json --> |`JSON.parse`\nJS Engine| aoo + aoo --> |`json_to_sheet`\nSheetJS Ops| wb + wb --> |`XLSX.write`\nBase64| base64 + base64 --> |`open`/`write`\nPython ops| file +``` + +At a high level: + +1) Pandas operations translate the Python data to JSON string + +2) JS engine operations translate the JSON string to an array of objects + +3) SheetJS libraries parse the array and generate a Base64-encoded workbook + +4) Pure Python operations decode the Base64 string and write the bytes to file. 
+ +#### Generate JSON + +`DataFrame#to_json`[^10] with the option `orient="records"` generates a JSON +string that encodes an array of objects: + +```py +json = df.to_json(orient="records") +``` + +#### Generate Worksheet + +In JavaScript, `JSON.parse` will interpret the string as an array of objects. +`XLSX.utils.json_to_sheet`[^11] generates a SheetJS worksheet object: + +```py +sheet = ctxt.eval("(json => XLSX.utils.json_to_sheet(JSON.parse(json)) )")(json) +``` + +#### Export Enhancements + +At this point, there are many options for improving the appearance of the sheet. +For example, the "Export Tutorial"[^12] shows how to adjust column widths. + +:::tip pass + +[SheetJS Pro](https://sheetjs.com/pro) offers additional styling options such as +cell styling and frozen rows. + +"Pro Edit" offers a special approach for inserting data into an existing file. + +::: + +#### Generate Workbook + +`XLSX.utils.book_new`[^13] creates a new workbook and `XLSX.utils.book_append_sheet`[^14] +appends a worksheet to the workbook. The new worksheet will be called "Export": + +:::note pass + +The code in the string literal is reproduced below: + +```js +(ws, name) => { + const wb = XLSX.utils.book_new(); + XLSX.utils.book_append_sheet(wb, ws, name); + return wb; +} +``` + +::: + +```py +book = ctxt.eval("""((ws, name) => { + const wb = XLSX.utils.book_new(); + XLSX.utils.book_append_sheet(wb, ws, name); + return wb; +})""")(sheet, "Export") +``` + +#### Generate File + +`XLSX.write`[^15] with the option `type: "base64"` attempts to create a file and +generate a Base64 string: + +```py +b64 = ctxt.eval("(wb => XLSX.write(wb, {type:'base64', bookType:'xls'}))")(book) +``` + +With the Base64 string, standard Python operations can create a file: + +```py +from base64 import b64decode + +raw = b64decode(b64) +with open("export.xls", mode="wb") as f: + f.write(raw) +``` + +## Complete Demo + +This example will extract data from an Apple Numbers spreadsheet and generate a +DataFrame. 
The DataFrame will be exported to a legacy XLS spreadsheet. + +### Engine Setup + +0) Follow the official installation instructions[^16]. + +
<details><summary><b>Instructions for macOS 12</b> (click to show)</summary> + +- Install `boost-python3` package using `brew`: + +```bash +brew install boost-python3 +``` + +- Identify python version: + +```bash +python3 --version +``` + +:::note pass + +When the demo was last tested, the version was `3.11.4` + +::: + +- [Download latest release](https://github.com/cloudflare/stpyv8/releases) + +```bash +curl -LO https://github.com/cloudflare/stpyv8/releases/download/v11.5.150.16/stpyv8-macos-12-python-3.11.zip +``` + +- Extract ZIP file and enter folder + +```bash +unzip stpyv8-macos-12-python-3.11.zip +cd stpyv8-macos-12-3.11 +``` + +- Move `icudtl.dat` to `/Library/Application Support/STPyV8/`: + +```bash +sudo mkdir -p /Library/Application\ Support/STPyV8 +sudo mv icudtl.dat /Library/Application\ Support/STPyV8/ +``` + +- Install wheel: + +```bash +sudo python3 -m pip install --upgrade *.whl +cd .. +``` + +</details>
+ +### Demo + +1) Follow the [standalone script](/docs/getting-started/installation/standalone) + instructions to download the script: + +<CodeBlock language="bash">{`\ +curl -LO https://cdn.sheetjs.com/xlsx-${current}/package/dist/xlsx.full.min.js`} +</CodeBlock> + +2) Install Pandas. On macOS: + +```bash +sudo python3 -m pip install pandas +``` + +3) Download the following test scripts and files: + +- [`pres.numbers` test file](https://sheetjs.com/pres.numbers) +- [`sheetjs.py` wrapper](pathname:///pandas/sheetjs.py) +- [`SheetJSPandas.py` script](pathname:///pandas/SheetJSPandas.py) + +```bash +curl -LO https://sheetjs.com/pres.numbers +curl -LO https://docs.sheetjs.com/pandas/sheetjs.py +curl -LO https://docs.sheetjs.com/pandas/SheetJSPandas.py +``` + +4) Run the script: + +```bash +python3 SheetJSPandas.py pres.numbers +``` + +If successful, it will display data rows in the file: + +``` +Reading from sheet Sheet1 +{'Name': 'Bill Clinton', 'Index': 42} +{'Name': 'GeorgeW Bush', 'Index': 43} +{'Name': 'Barack Obama', 'Index': 44} +{'Name': 'Donald Trump', 'Index': 45} +{'Name': 'Joseph Biden', 'Index': 46} +``` + +If Pandas is installed, the script will display DataFrame metadata: + +``` +RangeIndex: 5 entries, 0 to 4 +Data columns (total 2 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 Name 5 non-null object + 1 Index 5 non-null int64 +dtypes: int64(1), object(1) +``` + +It will also export to `pres.xls`. The file can be read in a spreadsheet editor. + +[^1]: The official documentation site is <https://pandas.pydata.org/> and the official distribution point is <https://pypi.org/project/pandas/> +[^2]: See ["Other Languages"](/docs/demos/engines/) for more examples. +[^3]: [`STPyV8`](https://github.com/cloudflare/stpyv8) is a fork of the original [`PyV8` project](https://pypi.org/project/PyV8/). It is available under the permissive Apache 2.0 License. Special thanks to Flier Lu and CloudFlare! 
+[^4]: See [`tests/test_Wrapper.py`](https://github.com/cloudflare/stpyv8/blob/410b31abe7a103b408d362cb872ce81604281c48/tests/test_Wrapper.py#L15) in the `STPyV8` code repository. +[^5]: See [`read` in "Reading Files"](/docs/api/parse-options) +[^6]: See ["SheetJS Data Model"](/docs/csf/) +[^7]: See ["Workbook Object"](/docs/csf/book) +[^8]: See [`sheet_to_json` in "Utilities"](/docs/api/utilities/array#array-output) +[^9]: See [`pandas.DataFrame.from_records`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_records.html) in the Pandas documentation. +[^10]: See [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html) in the Pandas documentation. +[^11]: See [`json_to_sheet` in "Utilities"](/docs/api/utilities/array#array-of-objects-input) +[^12]: See ["Clean up Workbook"](/docs/getting-started/examples/export#clean-up-workbook) in "Export Tutorial". +[^13]: See [`book_new` in "Utilities"](/docs/api/utilities/wb) +[^14]: See [`book_append_sheet` in "Utilities"](/docs/api/utilities/wb) +[^15]: See [`write` in "Writing Files"](/docs/api/write-options) +[^16]: See ["Installing"](https://github.com/cloudflare/stpyv8#installing) in the `STPyV8` project documentation \ No newline at end of file diff --git a/docz/static/pandas/SheetJSPandas.py b/docz/static/pandas/SheetJSPandas.py new file mode 100644 index 0000000..6652c8b --- /dev/null +++ b/docz/static/pandas/SheetJSPandas.py @@ -0,0 +1,41 @@ +from sheetjs import SheetJS +from sys import argv, exit + +test_pandas = True +try: + import pandas as pd +except: + test_pandas = False + +# Parse file and generate row objects +with SheetJS() as sheetjs: + # Print library version number + print(f"SheetJS Version {sheetjs.version()}") + + # Read and parse data from file + wb = sheetjs.read_file(argv[1]) + + # Get first worksheet name + wsname = wb.sheet_names()[0] + print(f"Reading from sheet {wsname}") + + # Get data from first sheet + ws = wb.get_sheet(wsname) 
+ rows = ws.get_rows() + for row in rows: print(row) + +if not test_pandas: + print("Pandas could not be loaded, skipping tests") + exit() + +print("\n## Pandas DataFrame\n") + +# generate dataframe +df = pd.DataFrame.from_records(rows) +print(df.info()) + +outf="pres.xls" +print(f"\n## Writing to {outf}\n") +# write JSON string to XLS worksheet +with SheetJS() as sheetjs: + sheetjs.book_from_df(df).to_file(outf) diff --git a/docz/static/pandas/sheetjs.py b/docz/static/pandas/sheetjs.py new file mode 100644 index 0000000..8dc265f --- /dev/null +++ b/docz/static/pandas/sheetjs.py @@ -0,0 +1,136 @@ +from base64 import b64encode, b64decode +from contextlib import contextmanager +from STPyV8 import JSContext, JSArray, JSObject +from functools import wraps +from os.path import splitext + +def to_py(method): + # `convert` from STPyV8 tests/test_Wrapper.py + def convert(obj): + if isinstance(obj, JSArray): + return [convert(v) for v in obj] + if isinstance(obj, JSObject): + return dict([[str(k), convert(obj.__getattr__(str(k)))] for k in obj.__dir__()]) + return obj + + @wraps(method) + def func(self, *args, **kwargs): + res = method(self, *args, **kwargs) + return convert(res) + return func + +class SheetJSWorksheet: + ws = None + ctxt = None + + def __init__(self, ctxt, ws): + self.ctxt = ctxt + self.ws = ws + + def js(self): return self.ws + + @to_py + def get_rows(self): + return self.ctxt.eval("(ws => XLSX.utils.sheet_to_json(ws))")(self.ws) + +class SheetJSWorkbook: + wb = None + ctxt = None + + def __init__(self, ctxt, wb): + self.ctxt = ctxt + self.wb = wb + + def js(self): return self.wb + + @to_py + def sheet_names(self): + return self.wb.SheetNames + + def get_sheet(self, name): + return SheetJSWorksheet(self.ctxt, self.wb.Sheets[name]) + + def to_file(self, path, book_type=""): + b64ify = self.ctxt.eval("((wb, bT) => XLSX.write(wb, {type:'base64', bookType:bT}))") + if not book_type: book_type = splitext(path)[1][1:] + b64 = b64ify(self.wb, book_type) + raw = 
b64decode(b64) + with open(path, mode="wb") as f: + f.write(raw) + +class SheetJSWrapper: + ctxt = None + + def __init__(self, ctx): + self.ctxt = ctx + with open("xlsx.full.min.js") as f: self.ctxt.eval(f.read()) + + def version(self): + return self.ctxt.eval("XLSX.version") + + def read_binary(self, data): + read = self.ctxt.eval("(b64 => XLSX.read(b64, {type: 'base64', dense: true}))") + return SheetJSWorkbook(self.ctxt, read(b64encode(data))) + + def read_file(self, path): + with open(path, mode="rb") as f: + return self.read_binary(f.read()) + + def sheet_from_json(self, json): + jsonify = self.ctxt.eval("(json => XLSX.utils.json_to_sheet(JSON.parse(json)) )") + return SheetJSWorksheet(self.ctxt, jsonify(json)) + + def book_new(self): + booknew = self.ctxt.eval("XLSX.utils.book_new()") + return SheetJSWorkbook(self.ctxt, booknew) + + def book_append_sheet(self, book, sheet, wsname): + bas = self.ctxt.eval("((wb, ws, wsname) => XLSX.utils.book_append_sheet(wb, ws, wsname))") + bas(book.js(), sheet.js(), wsname) + + def book_from_json(self, json, wsname = "Sheet1"): + booknew = self.book_new() + sheet = self.sheet_from_json(json) + self.book_append_sheet(booknew, sheet, wsname) + return booknew + + def book_from_df(self, df): + # convert from dataframe to JSON string + json = df.to_json(orient="records") + return self.book_from_json(json) + +@contextmanager +def SheetJS(): + """ + SheetJS Library context manager + + Returns an instance of the SheetJSWrapper class + + Reading data from file to Pandas DataFrame: + + ```py + with SheetJS() as sheetjs: + # read data from file + wb = sheetjs.read_file(argv[1]) + + # get first worksheet + first_ws_name = wb.sheet_names()[0] + ws = wb.get_sheet(wsname) + + # get data from first worksheet (list of dicts) + rows = ws.get_rows() + + # generate pandas DataFrame + df = pd.DataFrame.from_records(rows) + ``` + + Writing data from Pandas DataFrame to file: + + ```py + with SheetJS() as sheetjs: + 
sheetjs.book_from_df(df).to_file(outf) + ``` + + """ + with JSContext() as ctxt: + yield SheetJSWrapper(ctxt)