bnc
This commit is contained in:
parent
efbb38f80e
commit
ca3b497fe7
201
iwa/README.md
201
iwa/README.md
@ -94,8 +94,8 @@ references do not include message type info, so readers and writers must be
|
||||
aware of the message types and their interpretations.
|
||||
|
||||
Each framework is responsible for registering message types with the master
|
||||
registry by sending a message to the `TSPRegistry`. The actual types can be
|
||||
discovered from the frameworks. Some common message types are listed below:
|
||||
registry by sending specific messages like `setMessageType`. Some relevant
|
||||
message types for Numbers files are listed below:
|
||||
|
||||
| type | message |
|
||||
|-----:|:-------------------------|
|
||||
@ -125,10 +125,30 @@ message .TST.TableModelArchive {
|
||||
The reference in field 2 from `.TST.TableInfoArchive` is expected to be of type
|
||||
`.TST.TableModelArchive` so the latter must be registered.
|
||||
|
||||
`.TST.DataStore` is the type of field 4 from `.TST.TableModelArchive`. Since it
|
||||
`.TST.DataStore` is the type of field 4 from `.TST.TableModelArchive`. Since it
|
||||
is not referenced indirectly, the message type does not have to be registered.
|
||||
|
||||
## Data Storage in Numbers files
|
||||
_Discovering the registry mapping_
|
||||
|
||||
There are two approaches to discovering the registry mapping:
|
||||
|
||||
1) Search in the frameworks for places where `setMessageType` is sent.
|
||||
|
||||
2) Run the program in a debugger and inspect `[TSPRegistry sharedRegistry]`:
|
||||
|
||||
The latter approach was publicly documented by Sean Patrick O'Brien. A fully
|
||||
automated script runs on Intel Macs:
|
||||
|
||||
```bash
|
||||
deno run -A https://oss.sheetjs.com/notes/iwa/dump_registry.ts
|
||||
```
|
||||
|
||||
## Document Structure
|
||||
|
||||
The iWork apps start from the `DocumentArchive` message, whose definition
|
||||
varies between formats.
|
||||
|
||||
### Numbers
|
||||
|
||||
The root message (type 1) has the following structure:
|
||||
|
||||
@ -148,7 +168,9 @@ message .TN.SheetArchive {
|
||||
`name` is the name of the worksheet. Each worksheet can contain multiple tables.
|
||||
The messages referenced in field 2 (type 6000) refer to `.TST.TableInfoArchive`
|
||||
|
||||
### Table Storage in iWork
|
||||
## Table Storage in iWork
|
||||
|
||||
_Protobuf Structure_
|
||||
|
||||
Table structure is shared across iWork apps. The protobuf definitions are
|
||||
identical. The root element for tables is the `.TST.TableInfoArchive`:
|
||||
@ -170,9 +192,7 @@ message .TST.TableModelArchive {
|
||||
|
||||
message .TST.DataStore {
|
||||
required .TST.TileStorage tiles = 3;
|
||||
required .TSP.Reference stringTable = 4;
|
||||
optional .TSP.Reference formulaErrorTable = 12;
|
||||
optional .TSP.Reference rich_text_table = 17;
|
||||
required .TST.TableRBTree rowTileTree = 9;
|
||||
// ...
|
||||
}
|
||||
|
||||
@ -186,9 +206,6 @@ message .TST.TileStorage {
|
||||
}
|
||||
```
|
||||
|
||||
Numbers uses a "shared string table" like Excel. Excel stores both plaintext and
|
||||
rich strings in the same table, while Numbers has two separate tables.
|
||||
|
||||
The message referenced in the tiles (type 6002) has the following structure:
|
||||
|
||||
```proto
|
||||
@ -202,6 +219,8 @@ message .TST.TileRowInfo {
|
||||
required uint32 cell_count = 2;
|
||||
required bytes cell_storage_buffer_pre_bnc = 3;
|
||||
required bytes cell_offsets_pre_bnc = 4;
|
||||
optional bytes cell_storage_buffer = 6;
|
||||
optional bytes cell_offsets = 7;
|
||||
// ...
|
||||
}
|
||||
```
|
||||
@ -209,11 +228,55 @@ message .TST.TileRowInfo {
|
||||
Each `.TST.TileRowInfo` message holds the data and property references for a
|
||||
single row in the table.
|
||||
|
||||
The placement of rows in the table is governed by the tree in field 9:
|
||||
|
||||
```proto
|
||||
message .TST.TableRBTree {
|
||||
message Node {
|
||||
required uint32 key = 1;
|
||||
required uint32 value = 2;
|
||||
}
|
||||
repeated .TST.TableRBTree.Node nodes = 1;
|
||||
}
|
||||
```
|
||||
|
||||
Each node `key` is a row offset and `value` is an index into the tile array.
|
||||
For larger tables, tiles are generally expected to hold 256 rows.
|
||||
|
||||
### Data Storage
|
||||
|
||||
Non-numeric data values are stored in lists referenced from `.TST.DataStore`:
|
||||
|
||||
```proto
|
||||
message .TST.DataStore {
|
||||
required .TSP.Reference stringTable = 4;
|
||||
optional .TSP.Reference formulaErrorTable = 12;
|
||||
optional .TSP.Reference rich_text_table = 17;
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
iWork uses a "shared string table" like Excel. Excel stores both plaintext and
|
||||
rich strings in the same table, while iWork has two separate tables.
|
||||
|
||||
### Cell Storage
|
||||
|
||||
The cell offset fields are an array of 16-bit integers that describe offsets
|
||||
within the respective storage buffers. `0xFFFF` indicates that the column index
|
||||
for the given row is not included.
|
||||
|
||||
A 32-bit flag is stored at offset 4, describing which fields are in the cell:
|
||||
#### Old Storage
|
||||
|
||||
The "pre-BNC" storages are specified in fields 3 and 4.
|
||||
|
||||
_Versions_
|
||||
|
||||
The first byte of the storage is the version number. There are three known
|
||||
versions for the old storage: `V1` (0-1), `V3` (2-3), and `V4` (4).
|
||||
|
||||
_Field Mask_
|
||||
|
||||
A bitmask is stored at offset 4, describing which fields are in the cell:
|
||||
|
||||
| field description | bit mask | size | notes |
|
||||
|:------------------|---------:|-----:|-------------------------------------|
|
||||
@ -223,18 +286,11 @@ A 32-bit flag is stored at offset 4, describing which fields are in the cell:
|
||||
| Double value | `0x0020` | 8 | raw value (IEEE754 double) |
|
||||
| Datetime value | `0x0040` | 8 | number of seconds since 1/1/2001 |
|
||||
|
||||
The starting offset depends on the cell storage version (`0-1` or `2-3`), which
|
||||
is stored in the first byte of each cell:
|
||||
The size of the bitmask is 2 bytes in `V1` and 4 bytes in `V3` and `V4`.
|
||||
|
||||
| description | v1 offset | v3 offset |
|
||||
|:----------------|---------------------------:|----------------------------:|
|
||||
| Error index |`8 + POPCNT(f & 0x008E) * 4`|`12 + POPCNT(f & 0x0C8E) * 4`|
|
||||
| Rich text index |`8 + POPCNT(f & 0x018E) * 4`|`12 + POPCNT(f & 0x0D8E) * 4`|
|
||||
| Plaintext index |`8 + POPCNT(f & 0x138E) * 4`|`12 + POPCNT(f & 0x3F8E) * 4`|
|
||||
| Double value |`8 + POPCNT(f & 0x139E) * 4`|`12 + POPCNT(f & 0x3F9E) * 4`|
|
||||
| Datetime value |`8 + POPCNT(f & 0x13BE) * 4`|`12 + POPCNT(f & 0x3FBE) * 4`|
|
||||
_Cell Type_
|
||||
|
||||
The cell type is stored at byte offset 2:
|
||||
The cell type is stored at byte offset 2 in `V1` / `V3` and offset 1 in `V4`:
|
||||
|
||||
| type | value |
|
||||
|-----:|:-----------------------------------------------------------------|
|
||||
@ -247,12 +303,105 @@ The cell type is stored at byte offset 2:
|
||||
| `8` | get error from formula error table at "Error index" |
|
||||
| `9` | get value from rich shared string table at "Rich text index" |
|
||||
|
||||
_Fields_
|
||||
|
||||
`V1` fields start at offset 8, while `V3` and `V4` start at offset 12. Fields
|
||||
are enumerated in the order that the data appears in the storage.
|
||||
|
||||
| field mask | versions | size | description |
|
||||
|-----------:|:---------|-----:|:-------------------------------|
|
||||
| `0x000002` | all | 4 | cell style ID |
|
||||
| `0x000080` | all | 4 | text style ID |
|
||||
| `0x000400` | V3 / V4 | 4 | conditional style ID |
|
||||
| `0x000800` | V3 / V4 | 4 | conditional style applied rule |
|
||||
| `0x000004` | all | 4 | current format ID |
|
||||
| `0x000008` | all | 4 | formula ID |
|
||||
| `0x000100` | all | 4 | formula syntax error ID |
|
||||
| `0x000200` | all | 4 | rich text ID |
|
||||
| `0x001000` | all | 4 | comment storage ID |
|
||||
| `0x002000` | V3 / V4 | 4 | import warning set ID |
|
||||
| `0x000010` | all | 4 | string ID |
|
||||
| `0x000020` | all | 8 | double value |
|
||||
| `0x000040` | all | 8 | date time value |
|
||||
| `0x010000` | V3 / V4 | 4 | number format ID |
|
||||
| `0x080000` | V3 / V4 | 4 | currency format ID |
|
||||
| `0x020000` | V3 / V4 | 4 | date format ID |
|
||||
| `0x040000` | V3 / V4 | 4 | duration format ID |
|
||||
| `0x100000` | V3 / V4 | 4 | control format ID |
|
||||
| `0x200000` | V3 / V4 | 4 | custom format ID |
|
||||
| `0x400000` | V3 / V4 | 4 | base format ID |
|
||||
| `0x800000` | V3 / V4 | 4 | multiple choice list format ID |
|
||||
|
||||
#### New Storage
|
||||
|
||||
The "BNC" ("post-BNC"?) storages are specified in fields 6 and 7.
|
||||
|
||||
_Versions_
|
||||
|
||||
The first byte of the storage is the version number. At the time of writing,
|
||||
the only version is `V5` (5).
|
||||
|
||||
_Field Mask_
|
||||
|
||||
A bitmask is stored at offset 8, describing which fields are in the cell:
|
||||
|
||||
| field description | bit mask | size | notes |
|
||||
|:------------------|---------:|-----:|-------------------------------------|
|
||||
| Error index | `0x0800` | 4 | index into formula error table |
|
||||
| Rich text index | `0x0010` | 4 | index into rich shared string table |
|
||||
| Plaintext index | `0x0008` | 4 | index into shared string table |
|
||||
| Double value | `0x0002` | 8 | raw value (IEEE754 double) |
|
||||
| Datetime value | `0x0004` | 8 | number of seconds since 1/1/2001 |
|
||||
| Decimal128 value | `0x0001` | 16 | raw value (128-bit floating point) |
|
||||
|
||||
_Cell Type_
|
||||
|
||||
The cell type is stored at byte offset 1:
|
||||
|
||||
| type | value |
|
||||
|-----:|:-----------------------------------------------------------------|
|
||||
| `0` | "blank cell" (no value) |
|
||||
| `2` | "Decimal value" (generally converted back to float) |
|
||||
| `3` | get value from shared string table at "Plaintext index" |
|
||||
| `5` | interpret "Datetime value" as number of seconds since 1/1/2001 |
|
||||
| `6` | `true` if "Double value" is greater than zero, `false` otherwise |
|
||||
| `7` | interpret "Double value" as number of seconds (Duration) |
|
||||
| `8` | get error from formula error table at "Error index" |
|
||||
| `9` | get value from rich shared string table at "Rich text index" |
|
||||
|
||||
_Fields_
|
||||
|
||||
Fields start at offset 12. The fields are in the same order as the bit flags.
|
||||
|
||||
| field mask | size | description |
|
||||
|-----------:|-----:|:-------------------------------|
|
||||
| `0x000001` | 16 | Decimal128 value |
|
||||
| `0x000002` | 8 | double value |
|
||||
| `0x000004` | 8 | date time value |
|
||||
| `0x000008` | 4 | string ID |
|
||||
| `0x000010` | 4 | rich text ID |
|
||||
| `0x000020` | 4 | cell style ID |
|
||||
| `0x000040` | 4 | text style ID |
|
||||
| `0x000080` | 4 | conditional style ID |
|
||||
| `0x000100` | 4 | conditional style applied rule |
|
||||
| `0x000200` | 4 | formula ID |
|
||||
| `0x000400` | 4 | control cell spec ID |
|
||||
| `0x000800` | 4 | formula syntax error ID |
|
||||
| `0x001000` | 4 | suggest cell format kind |
|
||||
| `0x002000` | 4 | number format ID |
|
||||
| `0x004000` | 4 | currency format ID |
|
||||
| `0x008000` | 4 | date format ID |
|
||||
| `0x010000` | 4 | duration format ID |
|
||||
| `0x020000` | 4 | text format ID |
|
||||
| `0x040000` | 4 | boolean format ID |
|
||||
| `0x080000` | 4 | comment storage ID |
|
||||
| `0x100000` | 4 | import warning set ID |
|
||||
|
||||
## Misc
|
||||
|
||||
### Determining File Type
|
||||
|
||||
All three file types use the same message tag (1) for the root `DocumentArchive`
|
||||
message. However, the required fields vary between formats.
|
||||
The root `DocumentArchive` message fields vary between formats.
|
||||
|
||||
In the 12.1 apps, the required fields are:
|
||||
|
||||
@ -271,13 +420,13 @@ message .TN.DocumentArchive {
|
||||
required .TSP.Reference theme = 6;
|
||||
}
|
||||
|
||||
// Pages optional fields 2 - 7, 11 - 14, 16, 17, 20, 21, 30 - 50
|
||||
// Pages optional fields 2 - 7, 11 - 14, 16, 17, 20, 21, 30 - 49
|
||||
message .TP.DocumentArchive {
|
||||
required .TSA.DocumentArchive super = 15;
|
||||
}
|
||||
```
|
||||
|
||||
Pages is the only format to use and require field 15. Keynote requires field 2,
|
||||
Pages is the only format to use and require field 15. Keynote requires field 2,
|
||||
a field that does not appear in Numbers.
|
||||
|
||||
|
||||
|
42
iwa/dump_registry.ts
Normal file
42
iwa/dump_registry.ts
Normal file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env -S deno run -A
|
||||
/*! dump_registry.ts (C) 2022-present SheetJS LLC -- https://sheetjs.com */
|
||||
|
||||
/*
|
||||
NOTE: this script requires an Intel Mac, Numbers, LLDB, and Deno
|
||||
|
||||
USAGE: deno run -A https://oss.sheetjs.com/notes/iwa/dump_registry.ts
|
||||
*/
|
||||
|
||||
/* Refuse to run anywhere but macOS on x86_64; throw real Error objects so the failure carries a stack */
if(Deno.build.os !== "darwin") throw new Error(`Must run in macOS!`);
if(Deno.build.arch !== "x86_64") throw new Error(`Must run on Intel Mac (Apple Silicon currently unsupported)`);

/* Launch LLDB against the Numbers binary with piped stdin / stdout so the session can be scripted */
const p = Deno.run({
  cmd: ["lldb", "/Applications/Numbers.app/Contents/MacOS/Numbers", "-a", "x86_64"],
  stdin: "piped",
  stdout: "piped"
});
|
||||
|
||||
/**
 * Send text to the LLDB process over its piped stdin.
 *
 * A single `write` call may accept fewer bytes than requested, so keep
 * writing the remainder until the whole buffer has been consumed.
 * Does nothing if the process has no stdin handle.
 *
 * @param x text to send (callers append the trailing newline)
 */
const doit = async (x: string): Promise<void> => {
  const stdin = p?.stdin;
  if(!stdin) return;
  const buf = new TextEncoder().encode(x);
  let off = 0;
  while(off < buf.length) off += await stdin.write(buf.subarray(off));
};
|
||||
|
||||
/* LLDB commands, written to the debugger one line at a time in this exact order */
const cmds = [
  /* break when the application signals that it finished launching */
  "b -[NSApplication _sendFinishLaunchingNotification]",
  /* answer LLDB confirmation prompts automatically */
  "settings set auto-confirm 1",
  /* the lines up to "DONE" are attached to breakpoint 1.1 and run when it is hit */
  "breakpoint command add 1.1",
  "po [TSPRegistry sharedRegistry]",
  "process kill",
  "exit",
  "DONE",
  /* start the process so the breakpoint can trigger */
  "run",
];
for(const line of cmds.map(c => c + "\n")) await doit(line);
|
||||
|
||||
/* LLDB does not exit normally, setTimeout workaround */
const killTimer = setTimeout(() => {
  /* the process may already be gone by the time the timer fires */
  try { p.kill("SIGKILL"); } catch(_e) { /* nothing left to kill */ }
}, 30000);

/* Wait for exit and collect the transcript; always cancel the timer so it cannot keep the script alive */
const [status, stdout] = await Promise.all([ p.status(), p.output() ]).finally(() => clearTimeout(killTimer));

/* Release the remaining handles: stdin is only closed after exit so LLDB never sees an early EOF */
p.stdin?.close();
p.close();
|
||||
|
||||
/* Pull the body of the `_messageTypeToPrototypeMap = { ... }` section out of the LLDB transcript */
const data = new TextDecoder().decode(stdout);
const res = data.match(/_messageTypeToPrototypeMap = {([^]*?)}/m)?.[1];
if(!res) throw new Error(`Could not find map!`);

/* Split each line on whitespace and drop lines with fewer than two columns */
const rows = res.split(/[\r\n]+/).map(r => r.trim().split(/\s+/)).filter(x => x.length > 1);

/* Column 0 is ordered numerically (message type); column 3 is presumably the message name -- confirm against real LLDB output */
rows.sort((l, r) => +l[0] - +r[0]);
console.log(Object.fromEntries(rows.map(r => [r[0], r[3]]).filter(r => r[1] !== "null")));
|
Loading…
Reference in New Issue
Block a user