CSV read Issue: Empty Content Returned When First Column Name Starts with "TABL" #3230

Closed
opened 2024-09-29 12:16:51 +00:00 by lowkeyfish · 1 comment

When attempting to read a CSV file, if the first column name begins with "TABL", the result of the read operation is an empty content.

When attempting to read a CSV file, if the first column name begins with "TABL", the result of the read operation is an empty content.
Owner

Thanks for sharing!

When a file starts with TABL, currently the parser attempts to interpret the worksheet as if it were a DIF spreadsheet

This logic is in the core switch statement in the readSync function https://git.sheetjs.com/sheetjs/sheetjs/src/branch/master/bits/87_read.js#L102

		case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return DIF.to_workbook(d, o); break;

The current implementation only looks at the first 4 bytes and the DIF parser does not currently check for errors.


The SYLK file format starts with ID and there is a special workaround to handle that case. It makes sense to implement a similar workaround here.

Currently there is no real DIF error checking, so the simplest workaround is to check if the worksheet is missing a !ref property. This exploits a quirk in the aoa_to_sheet function so we will definitely need to add a test and ensure the logic changes when aoa_to_sheet is eventually changed.

You can follow the "Contributing" section of the docs then apply the patch. If you would like a pre-patched NodeJS package or browser script, join the SheetJS chat

diff --git a/bits/40_harb.js b/bits/40_harb.js
index 4ce011b..9be637a 100644
--- a/bits/40_harb.js
+++ b/bits/40_harb.js
@@ -1138,3 +1138,17 @@ function read_wb_ID(d, opts) {
 	}
 }
 
+function read_wb_TABL(d, opts) {
+	var o = opts || {}, OLD_WTF = !!o.WTF; o.WTF = true;
+	try {
+		var out = DIF.to_workbook(d, o);
+		if(!out || !out.Sheets) throw "DIF bad workbook";
+		var ws = out.Sheets[out.SheetNames[0]];
+		if(!ws || !ws["!ref"]) throw "DIF empty worksheet";
+		o.WTF = OLD_WTF;
+		return out;
+	} catch(e) {
+		o.WTF = OLD_WTF;
+		return PRN.to_workbook(d, opts);
+	}
+}
diff --git a/bits/87_read.js b/bits/87_read.js
index 4933c13..fe0e8eb 100644
--- a/bits/87_read.js
+++ b/bits/87_read.js
@@ -99,7 +99,7 @@ function readSync(data/*:RawData*/, opts/*:?ParseOpts*/)/*:Workbook*/ {
 			if(n[1] === 0x49 && n[2] === 0x2a && n[3] === 0x00) throw new Error("TIFF Image File is not a spreadsheet");
 			if(n[1] === 0x44) return read_wb_ID(d, o);
 			break;
-		case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return DIF.to_workbook(d, o); break;
+		case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return read_wb_TABL(d, o); break;
 		case 0x50: return (n[1] === 0x4B && n[2] < 0x09 && n[3] < 0x09) ? read_zip(d, o) : read_prn(data, d, o, str);
 		case 0xEF: return n[3] === 0x3C ? parse_xlml(d, o) : read_prn(data, d, o, str);
 		case 0xFF:
Thanks for sharing! When a file starts with `TABL`, currently the parser attempts to interpret the worksheet as if it were a [DIF spreadsheet](https://en.wikipedia.org/wiki/Data_Interchange_Format) This logic is in the core switch statement in the `readSync` function https://git.sheetjs.com/sheetjs/sheetjs/src/branch/master/bits/87_read.js#L102 ```js case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return DIF.to_workbook(d, o); break; ``` The current implementation only looks at the first 4 bytes and the `DIF` parser does not currently check for errors. --- The `SYLK` file format starts with `ID` and there is a special workaround to handle that case. It makes sense to implement a similar workaround here. Currently there is no real DIF error checking, so the simplest workaround is to check if the worksheet is missing a `!ref` property. This exploits a quirk in the `aoa_to_sheet` function so we will definitely need to add a test and ensure the logic changes when `aoa_to_sheet` is eventually changed. You can follow the ["Contributing" section of the docs](https://docs.sheetjs.com/docs/miscellany/contributing) then apply the patch. If you would like a pre-patched NodeJS package or browser script, [join the SheetJS chat](https://sheetjs.com/chat) ```diff diff --git a/bits/40_harb.js b/bits/40_harb.js index 4ce011b..9be637a 100644 --- a/bits/40_harb.js +++ b/bits/40_harb.js @@ -1138,3 +1138,17 @@ function read_wb_ID(d, opts) { } } +function read_wb_TABL(d, opts) { + var o = opts || {}, OLD_WTF = !!o.WTF; o.WTF = true; + try { + var out = DIF.to_workbook(d, o); + if(!out || !out.Sheets) throw "DIF bad workbook"; + var ws = out.Sheets[out.SheetNames[0]]; + if(!ws || !ws["!ref"]) throw "DIF empty worksheet"; + o.WTF = OLD_WTF; + return out; + } catch(e) { + o.WTF = OLD_WTF; + return PRN.to_workbook(d, opts); + } +} diff --git a/bits/87_read.js b/bits/87_read.js index 4933c13..fe0e8eb 100644 --- a/bits/87_read.js +++ b/bits/87_read.js @@ -99,7 +99,7 @@ function readSync(data/*:RawData*/, opts/*:?ParseOpts*/)/*:Workbook*/ { if(n[1] === 0x49 && n[2] === 0x2a && n[3] === 0x00) throw new Error("TIFF Image File is not a spreadsheet"); if(n[1] === 0x44) return read_wb_ID(d, o); break; - case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return DIF.to_workbook(d, o); break; + case 0x54: if(n[1] === 0x41 && n[2] === 0x42 && n[3] === 0x4C) return read_wb_TABL(d, o); break; case 0x50: return (n[1] === 0x4B && n[2] < 0x09 && n[3] < 0x09) ? read_zip(d, o) : read_prn(data, d, o, str); case 0xEF: return n[3] === 0x3C ? parse_xlml(d, o) : read_prn(data, d, o, str); case 0xFF: ```
Sign in to join this conversation.
No Milestone
No Assignees
2 Participants
Notifications
Due Date
The due date is invalid or out of range. Please use the format 'yyyy-mm-dd'.

No due date set.

Dependencies

No dependencies set.

Reference: sheetjs/sheetjs#3230
No description provided.