From 9199c2600cfb2037336e09753f24d998f216201c Mon Sep 17 00:00:00 2001 From: SheetJS Date: Mon, 13 Nov 2023 06:03:35 -0500 Subject: [PATCH] dta initial --- .gitignore | 1 + bits/10_ssf.js | 37 ++- packages/dta/.eslintrc | 26 ++ packages/dta/Makefile | 22 ++ packages/dta/README.md | 9 + packages/dta/bin/dta2csv.njs | 19 ++ packages/dta/dist/dta.js | 542 +++++++++++++++++++++++++++++++++ packages/dta/dist/dta.min.js | 2 + packages/dta/dta.ts | 543 ++++++++++++++++++++++++++++++++++ packages/dta/package.json | 36 +++ packages/dta/test.js | 32 ++ packages/dta/types/index.d.ts | 23 ++ 12 files changed, 1285 insertions(+), 7 deletions(-) create mode 100644 packages/dta/.eslintrc create mode 100644 packages/dta/Makefile create mode 100644 packages/dta/README.md create mode 100755 packages/dta/bin/dta2csv.njs create mode 100644 packages/dta/dist/dta.js create mode 100644 packages/dta/dist/dta.min.js create mode 100644 packages/dta/dta.ts create mode 100644 packages/dta/package.json create mode 100644 packages/dta/test.js create mode 100644 packages/dta/types/index.d.ts diff --git a/.gitignore b/.gitignore index dd85dc5..078e82a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ tmp *.[eE][tT][hH] *.[nN][uU][mM][bB][eE][rR][sS] *.[mM][oO][dD] +*.[dD][tT][aA] *.123 *.htm *.html diff --git a/bits/10_ssf.js b/bits/10_ssf.js index 28d95b8..c3f1784 100644 --- a/bits/10_ssf.js +++ b/bits/10_ssf.js @@ -173,8 +173,20 @@ function SSF_frac(x/*:number*/, D/*:number*/, mixed/*:?boolean*/)/*:Array -1) { + var m = s.slice(0, s.indexOf("e")); + m = m.indexOf(".") > -1 ? m.slice(0, (m.slice(0,2) == "0." ? 17 : 16)) : (m.slice(0,15) + fill("0", m.length - 15)); + return m + s.slice(s.indexOf("e")); + } + var n = s.indexOf(".") > -1 ? s.slice(0, (s.slice(0,2) == "0." ? 
17 : 16)) : (s.slice(0,15) + fill("0", s.length - 15)); + return Number(n); +} + function SSF_parse_date_code(v/*:number*/,opts/*:?any*/,b2/*:?boolean*/) { if(v > 2958465 || v < 0) return null; + v = SSF_normalize_xl_unsafe(v); var date = (v|0), time = Math.floor(86400 * (v - date)), dow=0; var dout=[]; var out={D:date, T:time, u:86400*(v-date)-time,y:0,m:0,d:0,H:0,M:0,S:0,q:0}; @@ -328,7 +340,7 @@ function SSF_write_date(type/*:number*/, fmt/*:string*/, val, ss0/*:?number*/)/* switch(fmt) { case '[h]': case '[hh]': out = val.D*24+val.H; break; case '[m]': case '[mm]': out = (val.D*24+val.H)*60+val.M; break; - case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+Math.round(val.S+val.u); break; + case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+(ss0 == 0 ? Math.round(val.S+val.u) : val.S); break; default: throw 'bad abstime format: ' + fmt; } outl = fmt.length === 3 ? 1 : 2; break; case 101: /* 'e' era */ @@ -776,10 +788,11 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) { switch(out[i].t) { case 'h': case 'H': out[i].t = hr; lst='h'; if(bt < 1) bt = 1; break; case 's': - if((ssm=out[i].v.match(/\.0+$/))) ss0=Math.max(ss0,ssm[0].length-1); + if((ssm=out[i].v.match(/\.0+$/))) { ss0=Math.max(ss0,ssm[0].length-1); bt = 4;} if(bt < 3) bt = 3; /* falls through */ - case 'd': case 'y': case 'M': case 'e': lst=out[i].t; break; + case 'd': case 'y': case 'e': lst=out[i].t; break; + case 'M': lst=out[i].t; if(bt < 2) bt = 2; break; case 'm': if(lst === 's') { out[i].t = 'M'; if(bt < 2) bt = 2; } break; case 'X': /*if(out[i].v === "B2");*/ break; @@ -789,19 +802,29 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) { if(bt < 3 && out[i].v.match(/[Ss]/)) bt = 3; } } + /* time rounding depends on presence of minute / second / usec fields */ + var _dt; switch(bt) { case 0: break; case 1: - /*::if(!dt) break;*/ + case 2: + case 3: if(dt.u >= 0.5) { dt.u = 0; ++dt.S; } if(dt.S >= 60) { dt.S = 0; 
++dt.M; } if(dt.M >= 60) { dt.M = 0; ++dt.H; } + if(dt.H >= 24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; } break; - case 2: - /*::if(!dt) break;*/ - if(dt.u >= 0.5) { dt.u = 0; ++dt.S; } + case 4: + switch(ss0) { + case 1: dt.u = Math.round(dt.u * 10)/10; break; + case 2: dt.u = Math.round(dt.u * 100)/100; break; + case 3: dt.u = Math.round(dt.u * 1000)/1000; break; + } + if(dt.u >= 1) { dt.u = 0; ++dt.S; } if(dt.S >= 60) { dt.S = 0; ++dt.M; } + if(dt.M >= 60) { dt.M = 0; ++dt.H; } + if(dt.H >= 24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; } break; } diff --git a/packages/dta/.eslintrc b/packages/dta/.eslintrc new file mode 100644 index 0000000..8462dfb --- /dev/null +++ b/packages/dta/.eslintrc @@ -0,0 +1,26 @@ +{ + "env": { "shared-node-browser":true }, + "globals": {}, + "parserOptions": { + "ecmaVersion": 6 + }, + "plugins": [ "html", "json" ], + "extends": "eslint:recommended", + "rules": { + "comma-style": [ 2, "last" ], + "comma-dangle": [ 2, "never" ], + "curly": 0, + "no-bitwise": 0, + "no-cond-assign": 1, + "no-console": 0, + "no-control-regex": 0, + "no-unused-vars": 1, + "no-empty": 0, + "no-trailing-spaces": 2, + "no-use-before-define": [ 1, { + "functions":false, "classes":true, "variables":false + }], + "no-useless-escape": 0, + "semi": [ 2, "always" ] + } +} diff --git a/packages/dta/Makefile b/packages/dta/Makefile new file mode 100644 index 0000000..cf54114 --- /dev/null +++ b/packages/dta/Makefile @@ -0,0 +1,22 @@ +.PHONY: build +build: node browser + +## NodeJS target + +.PHONY: node +node: dist/dta.js + +dist/dta.js: dta.ts + npx esbuild@0.14.14 dta.ts --bundle --outdir=dist --platform=node + +.PHONY: test-node +test-node: dist/dta.js test.js + npx mocha@2.5.3 test.js + +## Browser target +.PHONY: browser +browser: dist/dta.min.js + +dist/dta.min.js: dta.ts + npx esbuild@0.14.14 dta.ts 
--bundle --outfile=dist/dta.min.js --minify --sourcemap --global-name=DTA + diff --git a/packages/dta/README.md b/packages/dta/README.md new file mode 100644 index 0000000..ff0b3a4 --- /dev/null +++ b/packages/dta/README.md @@ -0,0 +1,9 @@ +# DTA Data File Codec + +Codec for reading Stata .DTA files and generating CSF workbook objects +compatible with the [SheetJS](https://sheetjs.com) library constellation. + +DTA datasets can support millions of observations and over 32767 variables. +The codec will truncate data to 1048576 observations and 16384 variables. + + includes a live demo. \ No newline at end of file diff --git a/packages/dta/bin/dta2csv.njs b/packages/dta/bin/dta2csv.njs new file mode 100755 index 0000000..7ffd3f5 --- /dev/null +++ b/packages/dta/bin/dta2csv.njs @@ -0,0 +1,19 @@ +#!/usr/bin/env node +/* eslint-env node, es6 */ +const DTA = require("../"); +const XLSX = (() => { + try { + const XLSX = require("xlsx"); + DTA.set_utils(XLSX.utils); + return XLSX; + } catch(e) { + throw new Error("Must install the SheetJS file processing library! 
See https://docs.sheetjs.com/docs/getting-started/installation/nodejs for more details"); + } +})(); +const fs = require("fs"); + +const buf = fs.readFileSync(process.argv[2]); +const wb = DTA.parse(buf); +// translate stub cells to single blanks +wb.Sheets[wb.SheetNames[0]]["!data"].forEach(row => row.forEach(cell => {if(cell.t == "z") {cell.t = "s"; cell.v = " ";}})); +console.log(XLSX.utils.sheet_to_csv(wb.Sheets[wb.SheetNames[0]])); \ No newline at end of file diff --git a/packages/dta/dist/dta.js b/packages/dta/dist/dta.js new file mode 100644 index 0000000..c0dc382 --- /dev/null +++ b/packages/dta/dist/dta.js @@ -0,0 +1,542 @@ +var __defProp = Object.defineProperty; +var __getOwnPropDesc = Object.getOwnPropertyDescriptor; +var __getOwnPropNames = Object.getOwnPropertyNames; +var __hasOwnProp = Object.prototype.hasOwnProperty; +var __markAsModule = (target) => __defProp(target, "__esModule", { value: true }); +var __export = (target, all) => { + for (var name in all) + __defProp(target, name, { get: all[name], enumerable: true }); +}; +var __reExport = (target, module2, copyDefault, desc) => { + if (module2 && typeof module2 === "object" || typeof module2 === "function") { + for (let key of __getOwnPropNames(module2)) + if (!__hasOwnProp.call(target, key) && (copyDefault || key !== "default")) + __defProp(target, key, { get: () => module2[key], enumerable: !(desc = __getOwnPropDesc(module2, key)) || desc.enumerable }); + } + return target; +}; +var __toCommonJS = /* @__PURE__ */ ((cache) => { + return (module2, temp) => { + return cache && cache.get(module2) || (temp = __reExport(__markAsModule({}), module2, 1), cache && cache.set(module2, temp), temp); + }; +})(typeof WeakMap !== "undefined" ? 
/* @__PURE__ */ new WeakMap() : 0); + +// dta.ts +var dta_exports = {}; +__export(dta_exports, { + parse: () => parse, + set_utils: () => set_utils +}); +var _utils; +function set_utils(utils) { + _utils = utils; +} +function u8_to_dataview(array) { + return new DataView(array.buffer, array.byteOffset, array.byteLength); +} +function valid_inc(p, n) { + if (p.str.slice(p.ptr, p.ptr + n.length) != n) + return false; + p.ptr += n.length; + return true; +} +function skip_end(p, n) { + const idx = p.str.indexOf(n, p.ptr); + if (idx == -1) + throw new Error(`Expected ${n} after offset ${p.ptr}`); + p.ptr = idx + n.length; +} +function slice_end(p, n) { + const idx = p.str.indexOf(n, p.ptr); + if (idx == -1) + throw new Error(`Expected ${n} after offset ${p.ptr}`); + const raw = p.raw.slice(p.ptr, idx); + const res = { + ptr: 0, + raw, + str: p.str.slice(p.ptr, idx), + dv: u8_to_dataview(raw) + }; + p.ptr = idx + n.length; + return res; +} +function read_f64(p, LE) { + p.ptr += 8; + const d = p.dv.getFloat64(p.ptr - 8, LE); + return d > 8988e304 ? null : d; +} +function read_f32(p, LE) { + p.ptr += 4; + const d = p.dv.getFloat32(p.ptr - 4, LE); + return d > 1701e35 ? null : d; +} +function read_u32(p, LE) { + p.ptr += 4; + return p.dv.getUint32(p.ptr - 4, LE); +} +function read_i32(p, LE) { + p.ptr += 4; + const u = p.dv.getInt32(p.ptr - 4, LE); + return u > 2147483620 ? null : u; +} +function read_u16(p, LE) { + p.ptr += 2; + return p.dv.getUint16(p.ptr - 2, LE); +} +function read_i16(p, LE) { + p.ptr += 2; + const u = p.dv.getInt16(p.ptr - 2, LE); + return u > 32740 ? null : u; +} +function read_u8(p) { + return p.raw[p.ptr++]; +} +function read_i8(p) { + let u = p.raw[p.ptr++]; + u = u < 128 ? u : u - 256; + return u > 100 ? 
null : u; +} +var SUPPORTED_VERSIONS_TAGGED = [ + "117", + "118" +]; +function parse_tagged(raw) { + const err = "Not a DTA file"; + const str = new TextDecoder("latin1").decode(raw); + const d = { + ptr: 0, + raw, + str, + dv: u8_to_dataview(raw) + }; + let vers = 118; + let LE = true; + let nvar = 0, nobs = 0, nobs_lo = 0, nobs_hi = 0; + let label = "", timestamp = ""; + const var_types = []; + const var_names = []; + const formats = []; + if (!valid_inc(d, "")) + throw err; + { + if (!valid_inc(d, "
")) + throw err; + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + if (SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1) + throw `Unsupported DTA ${res.str} file`; + vers = +res.str; + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + switch (res.str) { + case "MSF": + LE = false; + break; + case "LSF": + LE = true; + break; + default: + throw `Unsupported byteorder ${res.str}`; + } + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + nvar = read_u16(res, LE); + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + if (vers == 117) + nobs = nobs_lo = read_u32(res, LE); + else { + const lo = read_u32(res, LE), hi = read_u32(res, LE); + nobs = LE ? (nobs_lo = lo) + (nobs_hi = hi) * Math.pow(2, 32) : (nobs_lo = hi) + (nobs_hi = lo) * Math.pow(2, 32); + } + if (nobs > 1e6) + console.error(`More than 1 million observations -- extra rows will be dropped`); + } + { + if (!valid_inc(d, ""); + const w = vers >= 118 ? 2 : 1; + const strlen = w == 1 ? read_u8(res) : read_u16(res, LE); + if (strlen + w != res.str.length) + throw `Expected string length ${strlen} but actual length was ${res.str.length - w}`; + if (strlen > 0) + label = new TextDecoder().decode(res.raw.slice(w)); + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + const strlen = read_u8(res); + if (strlen + 1 != res.str.length) + throw `Expected string length ${strlen} but actual length was ${res.str.length - 1}`; + if (strlen > 0) + timestamp = res.str.slice(1); + } + if (!valid_inc(d, "
")) + throw err; + } + { + if (!valid_inc(d, "")) + throw err; + skip_end(d, ""); + } + let stride = 0; + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + if (res.raw.length != 2 * nvar) + throw `Expected variable_types length ${nvar * 2}, found ${res.raw.length}`; + while (res.ptr < res.raw.length) { + const type = read_u16(res, LE); + var_types.push(type); + if (type >= 1 && type <= 2045) + stride += type; + else + switch (type) { + case 32768: + stride += 8; + break; + case 65526: + stride += 8; + break; + case 65527: + stride += 4; + break; + case 65528: + stride += 4; + break; + case 65529: + stride += 2; + break; + case 65530: + stride += 1; + break; + default: + throw `Unsupported field type ${type}`; + } + } + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + const w = vers >= 118 ? 129 : 33; + if (res.raw.length != w * nvar) + throw `Expected variable_types length ${nvar * w}, found ${res.raw.length}`; + while (res.ptr < res.raw.length) { + const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); + res.ptr += w; + var_names.push(name.replace(/\x00[\s\S]*/, "")); + } + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + if (res.raw.length != 2 * nvar + 2) + throw `Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`; + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + const w = vers >= 118 ? 57 : 49; + if (res.raw.length != w * nvar) + throw `Expected formats length ${nvar * w}, found ${res.raw.length}`; + while (res.ptr < res.raw.length) { + const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); + res.ptr += w; + formats.push(name.replace(/\x00[\s\S]*/, "")); + } + } + { + if (!valid_inc(d, "")) + throw err; + const w = vers >= 118 ? 129 : 33; + const res = slice_end(d, ""); + } + { + if (!valid_inc(d, "")) + throw err; + const w = vers >= 118 ? 
321 : 81; + const res = slice_end(d, ""); + } + { + if (!valid_inc(d, "")) + throw err; + while (d.str.slice(d.ptr, d.ptr + 4) == "") { + d.ptr += 4; + const len = read_u32(d, LE); + d.ptr += len; + if (!valid_inc(d, "")) + throw err; + } + if (!valid_inc(d, "")) + throw err; + } + const ws = _utils.aoa_to_sheet([var_names], { dense: true }); + var ptrs = []; + { + if (!valid_inc(d, "")) + throw err; + for (let R = 0; R < nobs; ++R) { + const row = []; + for (let C = 0; C < nvar; ++C) { + let t = var_types[C]; + if (t >= 1 && t <= 2045) { + let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t)); + s = s.replace(/\x00[\s\S]*/, ""); + row[C] = s; + d.ptr += t; + } else + switch (t) { + case 65526: + row[C] = read_f64(d, LE); + break; + case 65527: + row[C] = read_f32(d, LE); + break; + case 65528: + row[C] = read_i32(d, LE); + break; + case 65529: + row[C] = read_i16(d, LE); + break; + case 65530: + row[C] = read_i8(d); + break; + case 32768: + { + row[C] = "##SheetJStrL##"; + ptrs.push([R + 1, C, d.raw.slice(d.ptr, d.ptr + 8)]); + d.ptr += 8; + } + break; + default: + throw `Unsupported field type ${t} for ${var_names[C]}`; + } + } + _utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true }); + } + if (!valid_inc(d, "")) + throw err; + } + { + if (!valid_inc(d, "")) + throw err; + const strl_tbl = []; + while (d.raw[d.ptr] == 71) { + if (!valid_inc(d, "GSO")) + throw err; + const v = read_u32(d, LE); + let o = 0; + if (vers == 117) + o = read_u32(d, LE); + else { + const lo = read_u32(d, LE), hi = read_u32(d, LE); + o = LE ? 
lo + hi * Math.pow(2, 32) : hi + lo * Math.pow(2, 32); + if (o > 1e6) + console.error(`More than 1 million observations -- data will be dropped`); + } + const t = read_u8(d); + const len = read_u32(d, LE); + if (!strl_tbl[o]) + strl_tbl[o] = []; + let str2 = ""; + if (t == 129) { + str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)); + d.ptr += len; + } else { + str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)).replace(/\x00$/, ""); + d.ptr += len; + } + strl_tbl[o][v] = str2; + } + if (!valid_inc(d, "")) + throw err; + ptrs.forEach(([R, C, buf]) => { + const dv = u8_to_dataview(buf); + let v = 0, o = 0; + switch (vers) { + case 117: + { + v = dv.getUint32(0, LE); + o = dv.getUint32(4, LE); + } + break; + case 118: + case 120: + { + v = dv.getUint16(0, LE); + const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE); + o = LE ? o1 + o2 * 65536 : o2 + o1 * 2 ** 32; + } + break; + case 119: + case 121: { + const v1 = dv.getUint16(0, LE), v2 = buf[2]; + v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8); + const o1 = buf[3], o2 = dv.getUint32(4, LE); + o = LE ? o1 + o2 * 256 : o2 + o1 * 2 ** 32; + } + } + ws["!data"][R][C].v = strl_tbl[o][v]; + }); + } + { + if (!valid_inc(d, "")) + throw err; + const res = slice_end(d, ""); + } + if (!valid_inc(d, "
")) + throw err; + const wb = _utils.book_new(); + _utils.book_append_sheet(wb, ws, "Sheet1"); + return wb; +} +function parse_legacy(raw) { + let vers = raw[0]; + switch (vers) { + case 102: + case 112: + throw `Unsupported DTA ${vers} file`; + case 103: + case 104: + case 105: + case 108: + case 110: + case 111: + case 113: + case 114: + case 115: + break; + default: + throw new Error("Not a DTA file"); + } + const d = { + ptr: 1, + raw, + str: "", + dv: u8_to_dataview(raw) + }; + let LE = true; + let nvar = 0, nobs = 0; + let label = "", timestamp = ""; + const var_types = []; + const var_names = []; + const formats = []; + { + const byteorder = read_u8(d); + switch (byteorder) { + case 1: + LE = false; + break; + case 2: + LE = true; + break; + default: + throw `DTA ${vers} Unexpected byteorder ${byteorder}`; + } + let byte = read_u8(d); + if (byte != 1) + throw `DTA ${vers} Unexpected filetype ${byte}`; + d.ptr++; + nvar = read_u16(d, LE); + nobs = read_u32(d, LE); + d.ptr += vers >= 108 ? 81 : 32; + if (vers >= 105) + d.ptr += 18; + } + { + let C = 0; + for (C = 0; C < nvar; ++C) + var_types.push(read_u8(d)); + const w = vers >= 110 ? 33 : 9; + for (C = 0; C < nvar; ++C) { + var_names.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + w)).replace(/\x00[\s\S]*$/, "")); + d.ptr += w; + } + d.ptr += 2 * (nvar + 1); + const fw = vers >= 114 ? 49 : vers >= 105 ? 12 : 7; + for (C = 0; C < nvar; ++C) { + formats.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + fw)).replace(/\x00[\s\S]*$/, "")); + d.ptr += fw; + } + d.ptr += (vers >= 110 ? 33 : 9) * nvar; + } + d.ptr += (vers >= 106 ? 81 : 32) * nvar; + if (vers >= 105) + while (d.ptr < d.raw.length) { + const dt = read_u8(d), len = (vers >= 111 ? 
read_u32 : read_u16)(d, LE); + if (dt == 0 && len == 0) + break; + d.ptr += len; + } + const ws = _utils.aoa_to_sheet([var_names], { dense: true }); + for (let R = 0; R < nobs; ++R) { + const row = []; + for (let C = 0; C < nvar; ++C) { + let t = var_types[C]; + if (vers >= 111 && t >= 1 && t <= 244) { + let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t)); + s = s.replace(/\x00[\s\S]*/, ""); + row[C] = s; + d.ptr += t; + } else + switch (t) { + case 251: + case 98: + row[C] = read_i8(d); + break; + case 252: + case 105: + row[C] = read_i16(d, LE); + break; + case 253: + case 108: + row[C] = read_i32(d, LE); + break; + case 254: + case 102: + row[C] = read_f32(d, LE); + break; + case 255: + case 100: + row[C] = read_f64(d, LE); + break; + default: + throw `Unsupported field type ${t} for ${var_names[C]}`; + } + } + _utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true }); + } + const wb = _utils.book_new(); + _utils.book_append_sheet(wb, ws, "Sheet1"); + return wb; +} +function parse(data) { + if (data[0] >= 102 && data[0] <= 115) + return parse_legacy(data); + if (data[0] === 60) + return parse_tagged(data); + throw new Error("Not a DTA file"); +} +module.exports = __toCommonJS(dta_exports); +// Annotate the CommonJS export names for ESM import in node: +0 && (module.exports = { + parse, + set_utils +}); diff --git a/packages/dta/dist/dta.min.js b/packages/dta/dist/dta.min.js new file mode 100644 index 0000000..6d47997 --- /dev/null +++ b/packages/dta/dist/dta.min.js @@ -0,0 +1,2 @@ +var DTA=(()=>{var O=Object.defineProperty;var I=Object.getOwnPropertyDescriptor;var K=Object.getOwnPropertyNames;var J=Object.prototype.hasOwnProperty;var j=r=>O(r,"__esModule",{value:!0});var q=(r,t)=>{for(var s in t)O(r,s,{get:t[s],enumerable:!0})},z=(r,t,s,e)=>{if(t&&typeof t=="object"||typeof t=="function")for(let c of K(t))!J.call(r,c)&&(s||c!=="default")&&O(r,c,{get:()=>t[c],enumerable:!(e=I(t,c))||e.enumerable});return r};var 
H=(r=>(t,s)=>r&&r.get(t)||(s=z(j({}),t,1),r&&r.set(t,s),s))(typeof WeakMap!="undefined"?new WeakMap:0);var re={};q(re,{parse:()=>te,set_utils:()=>Q});var k;function Q(r){k=r}function W(r){return new DataView(r.buffer,r.byteOffset,r.byteLength)}function i(r,t){return r.str.slice(r.ptr,r.ptr+t.length)!=t?!1:(r.ptr+=t.length,!0)}function X(r,t){let s=r.str.indexOf(t,r.ptr);if(s==-1)throw new Error(`Expected ${t} after offset ${r.ptr}`);r.ptr=s+t.length}function p(r,t){let s=r.str.indexOf(t,r.ptr);if(s==-1)throw new Error(`Expected ${t} after offset ${r.ptr}`);let e=r.raw.slice(r.ptr,s),c={ptr:0,raw:e,str:r.str.slice(r.ptr,s),dv:W(e)};return r.ptr=s+t.length,c}function F(r,t){r.ptr+=8;let s=r.dv.getFloat64(r.ptr-8,t);return s>8988e304?null:s}function R(r,t){r.ptr+=4;let s=r.dv.getFloat32(r.ptr-4,t);return s>1701e35?null:s}function g(r,t){return r.ptr+=4,r.dv.getUint32(r.ptr-4,t)}function V(r,t){r.ptr+=4;let s=r.dv.getInt32(r.ptr-4,t);return s>2147483620?null:s}function P(r,t){return r.ptr+=2,r.dv.getUint16(r.ptr-2,t)}function C(r,t){r.ptr+=2;let s=r.dv.getInt16(r.ptr-2,t);return s>32740?null:s}function x(r){return r.raw[r.ptr++]}function G(r){let t=r.raw[r.ptr++];return t=t<128?t:t-256,t>100?null:t}var Y=["117","118"];function Z(r){let t="Not a DTA file",s=new TextDecoder("latin1").decode(r),e={ptr:0,raw:r,str:s,dv:W(r)},c=118,n=!0,b=0,S=0,$=0,D=0,M="",A="",T=[],d=[],h=[];if(!i(e,""))throw t;{if(!i(e,"
"))throw t;{if(!i(e,""))throw t;let o=p(e,"");if(Y.indexOf(o.str)==-1)throw`Unsupported DTA ${o.str} file`;c=+o.str}{if(!i(e,""))throw t;let o=p(e,"");switch(o.str){case"MSF":n=!1;break;case"LSF":n=!0;break;default:throw`Unsupported byteorder ${o.str}`}}{if(!i(e,""))throw t;let o=p(e,"");b=P(o,n)}{if(!i(e,""))throw t;let o=p(e,"");if(c==117)S=$=g(o,n);else{let a=g(o,n),l=g(o,n);S=n?($=a)+(D=l)*Math.pow(2,32):($=l)+(D=a)*Math.pow(2,32)}S>1e6&&console.error("More than 1 million observations -- extra rows will be dropped")}{if(!i(e,""),a=c>=118?2:1,l=a==1?x(o):P(o,n);if(l+a!=o.str.length)throw`Expected string length ${l} but actual length was ${o.str.length-a}`;l>0&&(M=new TextDecoder().decode(o.raw.slice(a)))}{if(!i(e,""))throw t;let o=p(e,""),a=x(o);if(a+1!=o.str.length)throw`Expected string length ${a} but actual length was ${o.str.length-1}`;a>0&&(A=o.str.slice(1))}if(!i(e,"
"))throw t}{if(!i(e,""))throw t;X(e,"")}let f=0;{if(!i(e,""))throw t;let o=p(e,"");if(o.raw.length!=2*b)throw`Expected variable_types length ${b*2}, found ${o.raw.length}`;for(;o.ptr=1&&a<=2045)f+=a;else switch(a){case 32768:f+=8;break;case 65526:f+=8;break;case 65527:f+=4;break;case 65528:f+=4;break;case 65529:f+=2;break;case 65530:f+=1;break;default:throw`Unsupported field type ${a}`}}}{if(!i(e,""))throw t;let o=p(e,""),a=c>=118?129:33;if(o.raw.length!=a*b)throw`Expected variable_types length ${b*a}, found ${o.raw.length}`;for(;o.ptr"))throw t;let o=p(e,"");if(o.raw.length!=2*b+2)throw`Expected sortlist length ${b*2+2}, found ${o.raw.length}`}{if(!i(e,""))throw t;let o=p(e,""),a=c>=118?57:49;if(o.raw.length!=a*b)throw`Expected formats length ${b*a}, found ${o.raw.length}`;for(;o.ptr"))throw t;let o=c>=118?129:33,a=p(e,"")}{if(!i(e,""))throw t;let o=c>=118?321:81,a=p(e,"")}{if(!i(e,""))throw t;for(;e.str.slice(e.ptr,e.ptr+4)=="";){e.ptr+=4;let o=g(e,n);if(e.ptr+=o,!i(e,""))throw t}if(!i(e,""))throw t}let _=k.aoa_to_sheet([d],{dense:!0});var U=[];{if(!i(e,""))throw t;for(let o=0;o=1&&u<=2045){let w=new TextDecoder().decode(e.raw.slice(e.ptr,e.ptr+u));w=w.replace(/\x00[\s\S]*/,""),a[l]=w,e.ptr+=u}else switch(u){case 65526:a[l]=F(e,n);break;case 65527:a[l]=R(e,n);break;case 65528:a[l]=V(e,n);break;case 65529:a[l]=C(e,n);break;case 65530:a[l]=G(e);break;case 32768:a[l]="##SheetJStrL##",U.push([o+1,l,e.raw.slice(e.ptr,e.ptr+8)]),e.ptr+=8;break;default:throw`Unsupported field type ${u} for ${d[l]}`}}k.sheet_add_aoa(_,[a],{origin:-1,sheetStubs:!0})}if(!i(e,""))throw t}{if(!i(e,""))throw t;let o=[];for(;e.raw[e.ptr]==71;){if(!i(e,"GSO"))throw t;let a=g(e,n),l=0;if(c==117)l=g(e,n);else{let v=g(e,n),m=g(e,n);l=n?v+m*Math.pow(2,32):m+v*Math.pow(2,32),l>1e6&&console.error("More than 1 million observations -- data will be dropped")}let u=x(e),w=g(e,n);o[l]||(o[l]=[]);let y="";u==129?(y=new TextDecoder("latin1").decode(e.raw.slice(e.ptr,e.ptr+w)),e.ptr+=w):(y=new 
TextDecoder("latin1").decode(e.raw.slice(e.ptr,e.ptr+w)).replace(/\x00$/,""),e.ptr+=w),o[l][a]=y}if(!i(e,""))throw t;U.forEach(([a,l,u])=>{let w=W(u),y=0,v=0;switch(c){case 117:y=w.getUint32(0,n),v=w.getUint32(4,n);break;case 118:case 120:{y=w.getUint16(0,n);let m=w.getUint16(2,n),E=w.getUint32(4,n);v=n?m+E*65536:E+m*2**32}break;case 119:case 121:{let m=w.getUint16(0,n),E=u[2];y=n?m+(E<<16):E+(m<<8);let B=u[3],L=w.getUint32(4,n);v=n?B+L*256:L+B*2**32}}_["!data"][a][l].v=o[v][y]})}{if(!i(e,""))throw t;let o=p(e,"")}if(!i(e,"
"))throw t;let N=k.book_new();return k.book_append_sheet(N,_,"Sheet1"),N}function ee(r){let t=r[0];switch(t){case 102:case 112:throw`Unsupported DTA ${t} file`;case 103:case 104:case 105:case 108:case 110:case 111:case 113:case 114:case 115:break;default:throw new Error("Not a DTA file")}let s={ptr:1,raw:r,str:"",dv:W(r)},e=!0,c=0,n=0,b="",S="",$=[],D=[],M=[];{let d=x(s);switch(d){case 1:e=!1;break;case 2:e=!0;break;default:throw`DTA ${t} Unexpected byteorder ${d}`}let h=x(s);if(h!=1)throw`DTA ${t} Unexpected filetype ${h}`;s.ptr++,c=P(s,e),n=g(s,e),s.ptr+=t>=108?81:32,t>=105&&(s.ptr+=18)}{let d=0;for(d=0;d=110?33:9;for(d=0;d=114?49:t>=105?12:7;for(d=0;d=110?33:9)*c}if(s.ptr+=(t>=106?81:32)*c,t>=105)for(;s.ptr=111?g:P)(s,e);if(d==0&&h==0)break;s.ptr+=h}let A=k.aoa_to_sheet([D],{dense:!0});for(let d=0;d=111&&_>=1&&_<=244){let U=new TextDecoder().decode(s.raw.slice(s.ptr,s.ptr+_));U=U.replace(/\x00[\s\S]*/,""),h[f]=U,s.ptr+=_}else switch(_){case 251:case 98:h[f]=G(s);break;case 252:case 105:h[f]=C(s,e);break;case 253:case 108:h[f]=V(s,e);break;case 254:case 102:h[f]=R(s,e);break;case 255:case 100:h[f]=F(s,e);break;default:throw`Unsupported field type ${_} for ${D[f]}`}}k.sheet_add_aoa(A,[h],{origin:-1,sheetStubs:!0})}let T=k.book_new();return k.book_append_sheet(T,A,"Sheet1"),T}function te(r){if(r[0]>=102&&r[0]<=115)return ee(r);if(r[0]===60)return Z(r);throw new Error("Not a DTA file")}return H(re);})(); +//# sourceMappingURL=dta.min.js.map diff --git a/packages/dta/dta.ts b/packages/dta/dta.ts new file mode 100644 index 0000000..e43c49a --- /dev/null +++ b/packages/dta/dta.ts @@ -0,0 +1,543 @@ +import { DenseWorkSheet, WorkBook, type utils } from 'xlsx'; +export { parse, set_utils }; + +let _utils: typeof utils; +/** Set internal instance of `utils` + * + * Usage: + * + * ```js + * const XLSX = require("xlsx"); + * const DTA = require("dta"); + * DTA.set_utils(XLSX.utils); + * ``` + * + * @param utils utils object + */ +function set_utils(utils: any): void { + 
_utils = utils; +} + +interface Payload { + /** Offset */ + ptr: number; + + /** Raw data */ + raw: Uint8Array; + + /** Latin-1 encoded */ + str: string; + + /** DataView */ + dv: DataView; +} + +function u8_to_dataview(array: Uint8Array): DataView { return new DataView(array.buffer, array.byteOffset, array.byteLength); } +function valid_inc(p: Payload, n: string): boolean { + if(p.str.slice(p.ptr, p.ptr + n.length) != n) return false; + p.ptr += n.length; + return true; +} + +function skip_end(p: Payload, n: string): void { + const idx = p.str.indexOf(n, p.ptr); + if(idx == -1) throw new Error(`Expected ${n} after offset ${p.ptr}`); + p.ptr = idx + n.length; +} +function slice_end(p: Payload, n: string): Payload { + const idx = p.str.indexOf(n, p.ptr); + if(idx == -1) throw new Error(`Expected ${n} after offset ${p.ptr}`); + const raw = p.raw.slice(p.ptr, idx); + const res = { + ptr: 0, + raw, + str: p.str.slice(p.ptr, idx), + dv: u8_to_dataview(raw) + }; + p.ptr = idx + n.length; + return res; +} + +function read_f64(p: Payload, LE: boolean): number | null { + p.ptr += 8; + const d = p.dv.getFloat64(p.ptr - 8, LE); + return d > 8.988e+307 ? null : d; +} +function read_f32(p: Payload, LE: boolean): number | null { + p.ptr += 4; + const d = p.dv.getFloat32(p.ptr - 4, LE); + return d > 1.701e+38 ? null : d; + +} +function read_u32(p: Payload, LE: boolean) { + p.ptr += 4; + return p.dv.getUint32(p.ptr - 4, LE); +} +function read_i32(p: Payload, LE: boolean): number | null { + p.ptr += 4; + const u = p.dv.getInt32(p.ptr - 4, LE); + return u > 0x7fffffe4 ? null : u; +} +function read_u16(p: Payload, LE: boolean) { + p.ptr += 2; + return p.dv.getUint16(p.ptr - 2, LE); +} +function read_i16(p: Payload, LE: boolean): number | null { + p.ptr += 2; + const u = p.dv.getInt16(p.ptr - 2, LE); + return u > 32740 ? null : u; +} +function read_u8(p: Payload) { + return p.raw[p.ptr++]; +} +function read_i8(p: Payload): number | null { + let u = p.raw[p.ptr++]; + u = u < 128 ? 
u : u - 256; + return u > 100 ? null : u; +} + +const SUPPORTED_VERSIONS_TAGGED = [ + "117", // stata 13 + "118", // stata 14-18 + // "119", // stata 15/16/17/18 (> 32767 variables) + // "120", // stata 18 (<= 32767, with aliases) + // "121", // stata 18 (> 32767, with aliases) +]; + +function parse_tagged(raw: Uint8Array): WorkBook { + const err = ("Not a DTA file"); + /* sadly the web zealots decided to abandon binary strings */ + const str = new TextDecoder('latin1').decode(raw); + + const d: Payload = { + ptr: 0, + raw, + str, + dv: u8_to_dataview(raw) + } + + let vers: number = 118; + let LE: boolean = true; + let nvar: number = 0, nobs: number = 0, nobs_lo = 0, nobs_hi = 0; + let label: string = "", timestamp: string = ""; + const var_types: number[] = []; + const var_names: string[] = []; + const formats: string[] = []; + + /* 5. Dataset format definition */ + if(!valid_inc(d, "")) throw err; + + /* 5.1 Header
*/ + { + if(!valid_inc(d, "
")) throw err; + + /* */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + if(SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1) throw (`Unsupported DTA ${res.str} file`); + vers = +res.str; + } + + /* */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + switch(res.str) { + case "MSF": LE = false; break; + case "LSF": LE = true; break; + default: throw (`Unsupported byteorder ${res.str}`); + } + } + + /* */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + nvar = read_u16(res, LE); + } + + /* */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + if(vers == 117) nobs = nobs_lo = read_u32(res, LE); + else { + const lo = read_u32(res, LE), hi = read_u32(res, LE); + nobs = LE ? ((nobs_lo = lo) + (nobs_hi = hi) * Math.pow(2,32)) : ((nobs_lo = hi) + (nobs_hi = lo) * Math.pow(2,32)); + } + if(nobs > 1e6) console.error(`More than 1 million observations -- extra rows will be dropped`); + } + + /*
")) throw err; + } + + /* 5.2 Map */ + { + /* TODO: validate map? */ + if(!valid_inc(d, "")) throw err; + /* 14 8-byte offsets for: + + + + + + + + + + + + + + EOF + */ + skip_end(d, ""); + } + + let stride = 0; + /* 5.3 Variable types */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + if(res.raw.length != 2 * nvar) throw (`Expected variable_types length ${nvar * 2}, found ${res.raw.length}`); + while(res.ptr < res.raw.length) { + const type = read_u16(res, LE); + var_types.push(type); + if(type >= 1 && type <= 2045) stride += type; + else switch(type) { + case 32768: stride += 8; break; + case 65526: stride += 8; break; + case 65527: stride += 4; break; + case 65528: stride += 4; break; + case 65529: stride += 2; break; + case 65530: stride += 1; break; + default: throw (`Unsupported field type ${type}`); + } + } + } + + /* 5.4 Variable names */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + const w = vers >= 118 ? 129 : 33; + if(res.raw.length != w * nvar) throw (`Expected variable_types length ${nvar * w}, found ${res.raw.length}`); + while(res.ptr < res.raw.length) { + const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); + res.ptr += w; + var_names.push(name.replace(/\x00[\s\S]*/,"")); + } + } + + /* 5.5 Sort order of observations */ + { + /* TODO: check sort list? */ + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + if(res.raw.length != 2 * nvar + 2) throw (`Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`); + } + + /* 5.6 Display formats */ + { + if(!valid_inc(d, "")) throw err; + const res = slice_end(d, ""); + const w = vers >= 118 ? 
57 : 49; + if(res.raw.length != w * nvar) throw (`Expected formats length ${nvar * w}, found ${res.raw.length}`); + while(res.ptr < res.raw.length) { + const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); + res.ptr += w; + formats.push(name.replace(/\x00[\s\S]*/,"")); + } + } + + /* TODO: */ + { + if(!valid_inc(d, "")) throw err; + const w = vers >= 118 ? 129 : 33; + const res = slice_end(d, ""); + } + + /* TODO: */ + { + if(!valid_inc(d, "")) throw err; + const w = vers >= 118 ? 321 : 81; + const res = slice_end(d, ""); + } + + /* 5.9 Characteristics */ + { + if(!valid_inc(d, "")) throw err; + while(d.str.slice(d.ptr, d.ptr + 4) == "") { + d.ptr += 4; + const len = read_u32(d, LE); + d.ptr += len; + if(!valid_inc(d, "")) throw err; + } + if(!valid_inc(d, "")) throw err; + } + + const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet); + + var ptrs: Array<[number, number, Uint8Array]> = [] + /* 5.10 Data */ + { + if(!valid_inc(d, "")) throw err; + for(let R = 0; R < nobs; ++R) { + const row: any[] = []; + for(let C = 0; C < nvar; ++C) { + let t = var_types[C]; + // TODO: formats, dta_12{0,1} aliases? 
+ if(t >= 1 && t <= 2045) { + /* NOTE: dta_117 restricts strf to ASCII */ + let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t)); + s = s.replace(/\x00[\s\S]*/,""); + row[C] = s; + d.ptr += t; + } else switch(t) { + case 65526: row[C] = read_f64(d, LE); break; + case 65527: row[C] = read_f32(d, LE); break; + case 65528: row[C] = read_i32(d, LE); break; + case 65529: row[C] = read_i16(d, LE); break; + case 65530: row[C] = read_i8(d); break; + case 32768: { + row[C] = "##SheetJStrL##"; + ptrs.push([R+1,C, d.raw.slice(d.ptr, d.ptr + 8)]); + d.ptr += 8; + } break; + default: throw (`Unsupported field type ${t} for ${var_names[C]}`); + } + } + _utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true}); + } + if(!valid_inc(d, "")) throw err; + } + + /* 5.11 StrLs */ + { + if(!valid_inc(d, "")) throw err; + + const strl_tbl: string[][] = []; + while(d.raw[d.ptr] == 71 /* G */) { + if(!valid_inc(d, "GSO")) throw err; + const v = read_u32(d, LE); + let o = 0; + if(vers == 117) o = read_u32(d, LE); + else { + const lo = read_u32(d, LE), hi = read_u32(d, LE); + o = LE ? 
(lo + hi * Math.pow(2,32)) : (hi + lo * Math.pow(2,32)); + if(o > 1e6) console.error(`More than 1 million observations -- data will be dropped`); + } + const t = read_u8(d); + const len = read_u32(d, LE); + if(!strl_tbl[o]) strl_tbl[o] = []; + let str = ""; + if(t == 129) { + // TODO: codepage + str = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)); + d.ptr += len; + } else { + str = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)).replace(/\x00$/,""); + d.ptr += len; + } + strl_tbl[o][v] = str; + } + if(!valid_inc(d, "")) throw err; + + ptrs.forEach(([R,C,buf]) => { + const dv = u8_to_dataview(buf); + let v = 0, o = 0; + switch(vers) { + case 117: { // v(4) o(4) + v = dv.getUint32(0, LE); + o = dv.getUint32(4, LE); + } break; + + case 118: case 120: { // v(2) o(6) + v = dv.getUint16(0, LE); + const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE); + o = LE ? o1 + o2 * 65536 : o2 + o1 * (2**32); + } break; + + case 119: case 121: { // v(3) o(5) + const v1 = dv.getUint16(0, LE), v2 = buf[2]; + v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8); + const o1 = buf[3], o2 = dv.getUint32(4, LE); + o = LE ? 
/**
 * Parse a legacy (untagged binary) Stata DTA file into a workbook.
 *
 * The first byte of the file is the format version. Versions 103, 104, 105,
 * 108, 110, 111, 113, 114 and 115 are parsed; 102 and 112 are recognized but
 * rejected as unsupported.
 *
 * The returned workbook has a single worksheet named "Sheet1". The first row
 * holds the variable names and each following row holds one observation.
 *
 * NOTE: the data label, time stamp, sort list, value-label names, variable
 * labels and expansion fields are skipped. Display formats are decoded but
 * not applied. Value labels after the data section are not read.
 *
 * @param raw File data
 * @returns Workbook holding the observations
 * @throws Error if the version byte is unknown or unsupported, if the
 * byteorder or filetype byte is unexpected, or if a variable uses an
 * unsupported type code
 */
function parse_legacy(raw: Uint8Array): WorkBook {
	const vers: number = raw[0];
	switch(vers) {
		case 102: // stata 1
		case 112: // stata 8/9
			throw new Error(`Unsupported DTA ${vers} file`);

		case 103: // stata 2/3
		case 104: // stata 4
		case 105: // stata 5
		case 108: // stata 6
		case 110: // stata 7
		case 111: // stata 7
		case 113: // stata 8/9
		case 114: // stata 10/11
		case 115: // stata 12
			break;

		default: throw new Error("Not a DTA file");
	}

	const d: Payload = {
		ptr: 1,
		raw,
		str: "",
		dv: u8_to_dataview(raw)
	};

	/* one decoder and one pattern are shared by every fixed-width text field */
	const decoder = new TextDecoder();
	const nul_tail = /\x00[\s\S]*$/;
	/* read a `width`-byte field at the cursor, dropping the first NUL byte and everything after it */
	const read_str = (width: number): string => {
		const text = decoder.decode(d.raw.subarray(d.ptr, d.ptr + width));
		d.ptr += width;
		return text.replace(nul_tail, "");
	};

	let LE = true;
	let nvar = 0, nobs = 0;
	const var_types: number[] = [];
	const var_names: string[] = [];
	const formats: string[] = [];

	/* 5.1 Header */
	{
		const byteorder = read_u8(d);
		switch(byteorder) {
			case 1: LE = false; break;
			case 2: LE = true; break;
			default: throw new Error(`DTA ${vers} Unexpected byteorder ${byteorder}`);
		}

		const filetype = read_u8(d);
		if(filetype != 1) throw new Error(`DTA ${vers} Unexpected filetype ${filetype}`);
		// NOTE: dta_105 technically supports filetype 2

		d.ptr++; // "unused"
		nvar = read_u16(d, LE);
		nobs = read_u32(d, LE);
		d.ptr += (vers >= 108 ? 81 : 32); // TODO: data_label
		if(vers >= 105) d.ptr += 18; // TODO: time_stamp
	}

	/* 5.2 Descriptors */
	{
		let C = 0;

		// typlist
		for(C = 0; C < nvar; ++C) var_types.push(read_u8(d));

		// varlist
		const w = vers >= 110 ? 33 : 9;
		for(C = 0; C < nvar; ++C) var_names.push(read_str(w));

		// srtlist
		d.ptr += 2*(nvar + 1);

		// fmtlist
		const fw = (vers >= 114 ? 49 : vers >= 105 ? 12 : 7);
		for(C = 0; C < nvar; ++C) formats.push(read_str(fw));

		// lbllist
		d.ptr += (vers >= 110 ? 33 : 9) * nvar;
	}

	/* 5.3 Variable labels */
	// TODO: should these names be used in the worksheet?
	d.ptr += (vers >= 106 ? 81 : 32) * nvar;

	/* 5.4 Expansion fields */
	if(vers >= 105) while(d.ptr < d.raw.length) {
		/* each record is a type byte and a length; a record with both zero ends the list */
		const dt = read_u8(d), len = (vers >= 111 ? read_u32 : read_u16)(d, LE);
		if(dt == 0 && len == 0) break;
		d.ptr += len;
	}

	const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet);

	/* Byte width of each string variable, 0 for every other type.
	 * Formats 111+ use type codes 1..244 directly as the string length.
	 * Older formats store strings as 0x7f + length; codes 251..255 are left to
	 * the numeric cases below so previously accepted files parse unchanged. */
	const str_widths: number[] = var_types.map((t) => {
		if(vers >= 111) return (t >= 1 && t <= 244) ? t : 0;
		return (t > 0x7f && t < 251) ? t - 0x7f : 0;
	});

	/* 5.5 Data */
	for(let R = 0; R < nobs; ++R) {
		const row: unknown[] = [];
		for(let C = 0; C < nvar; ++C) {
			// TODO: data type processing
			const width = str_widths[C];
			if(width > 0) { row[C] = read_str(width); continue; }
			const t = var_types[C];
			switch(t) {
				case 251: case 0x62: row[C] = read_i8(d); break; // byte
				case 252: case 0x69: row[C] = read_i16(d, LE); break; // int
				case 253: case 0x6c: row[C] = read_i32(d, LE); break; // long
				case 254: case 0x66: row[C] = read_f32(d, LE); break; // float
				case 255: case 0x64: row[C] = read_f64(d, LE); break; // double
				default: throw new Error(`Unsupported field type ${t} for ${var_names[C]}`);
			}
		}
		_utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true});
	}

	/* 5.6 Value labels */
	// EOF or labels

	const wb: WorkBook = _utils.book_new();
	_utils.book_append_sheet(wb, ws, "Sheet1");
	return wb;
}
/**
 * Parse a DTA file, dispatching on the first byte of the data.
 *
 * A first byte of 60 (`<`) is handed to the tagged parser; a first byte in
 * the range 102..115 is handed to the legacy parser.
 *
 * NOTE: In NodeJS, `Buffer` extends `Uint8Array`
 *
 * @param data File data
 * @returns Workbook produced by the selected parser
 * @throws Error "Not a DTA file" when the first byte matches neither layout
 */
function parse(data: Uint8Array): WorkBook {
	const lead = data[0];
	if(lead === 60) return parse_tagged(data);
	const is_legacy = lead >= 102 && lead <= 115;
	if(is_legacy) return parse_legacy(data);
	throw new Error("Not a DTA file");
}
wb.Sheets[wb.SheetNames[0]]["!data"].forEach(row => row.forEach(cell => {if(cell.t == "z") {cell.t = "s"; cell.v = " ";}})); + const csvstr = XLSX.utils.sheet_to_csv(wb.Sheets[wb.SheetNames[0]]); + const baseline = fs.readFileSync(`${tF}/${tf}`, "utf8").replace(/[\r\n]+/g,"\n"); + assert.equal(csvstr.trim(), baseline.trim()); + }); + if(!tf.endsWith("dta")) continue; + it(tf, () => { + const buf = fs.readFileSync(`${tF}/${tf}`); + const wb = DTA.parse(buf); + assert(wb.SheetNames.length > 0); + }); + } +}); + diff --git a/packages/dta/types/index.d.ts b/packages/dta/types/index.d.ts new file mode 100644 index 0000000..f69e4e7 --- /dev/null +++ b/packages/dta/types/index.d.ts @@ -0,0 +1,23 @@ +import type { WorkBook } from "xlsx"; + +/** Set internal instance of `utils` + * + * Usage: + * + * ```js + * const XLSX = require("xlsx"); + * const DTA = require("dta"); + * DTA.set_utils(XLSX.utils); + * ``` + * + * @param utils utils object + */ +export function set_utils(utils: any): void; + +/** Parse DTA file + * + * NOTE: In NodeJS, `Buffer` extends `Uint8Array` + * + * @param {Uint8Array} data File data + */ +export function parse(data: Uint8Array): WorkBook