dta initial

This commit is contained in:
SheetJS 2023-11-13 06:03:35 -05:00
parent cd5fafda32
commit 9199c2600c
12 changed files with 1285 additions and 7 deletions

1
.gitignore vendored

@ -32,6 +32,7 @@ tmp
*.[eE][tT][hH]
*.[nN][uU][mM][bB][eE][rR][sS]
*.[mM][oO][dD]
*.[dD][tT][aA]
*.123
*.htm
*.html

@ -173,8 +173,20 @@ function SSF_frac(x/*:number*/, D/*:number*/, mixed/*:?boolean*/)/*:Array<number
var q = Math.floor(sgn * P/Q);
return [q, sgn*P - q*Q, Q];
}
function SSF_normalize_xl_unsafe(v/*:number*/)/*:number*/ {
var s = v.toPrecision(16);
if(s.indexOf("e") > -1) {
var m = s.slice(0, s.indexOf("e"));
m = m.indexOf(".") > -1 ? m.slice(0, (m.slice(0,2) == "0." ? 17 : 16)) : (m.slice(0,15) + fill("0", m.length - 15));
return m + s.slice(s.indexOf("e"));
}
var n = s.indexOf(".") > -1 ? s.slice(0, (s.slice(0,2) == "0." ? 17 : 16)) : (s.slice(0,15) + fill("0", s.length - 15));
return Number(n);
}
function SSF_parse_date_code(v/*:number*/,opts/*:?any*/,b2/*:?boolean*/) {
if(v > 2958465 || v < 0) return null;
v = SSF_normalize_xl_unsafe(v);
var date = (v|0), time = Math.floor(86400 * (v - date)), dow=0;
var dout=[];
var out={D:date, T:time, u:86400*(v-date)-time,y:0,m:0,d:0,H:0,M:0,S:0,q:0};
@ -328,7 +340,7 @@ function SSF_write_date(type/*:number*/, fmt/*:string*/, val, ss0/*:?number*/)/*
switch(fmt) {
case '[h]': case '[hh]': out = val.D*24+val.H; break;
case '[m]': case '[mm]': out = (val.D*24+val.H)*60+val.M; break;
case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+Math.round(val.S+val.u); break;
case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+(ss0 == 0 ? Math.round(val.S+val.u) : val.S); break;
default: throw 'bad abstime format: ' + fmt;
} outl = fmt.length === 3 ? 1 : 2; break;
case 101: /* 'e' era */
@ -776,10 +788,11 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) {
switch(out[i].t) {
case 'h': case 'H': out[i].t = hr; lst='h'; if(bt < 1) bt = 1; break;
case 's':
if((ssm=out[i].v.match(/\.0+$/))) ss0=Math.max(ss0,ssm[0].length-1);
if((ssm=out[i].v.match(/\.0+$/))) { ss0=Math.max(ss0,ssm[0].length-1); bt = 4;}
if(bt < 3) bt = 3;
/* falls through */
case 'd': case 'y': case 'M': case 'e': lst=out[i].t; break;
case 'd': case 'y': case 'e': lst=out[i].t; break;
case 'M': lst=out[i].t; if(bt < 2) bt = 2; break;
case 'm': if(lst === 's') { out[i].t = 'M'; if(bt < 2) bt = 2; } break;
case 'X': /*if(out[i].v === "B2");*/
break;
@ -789,19 +802,29 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) {
if(bt < 3 && out[i].v.match(/[Ss]/)) bt = 3;
}
}
/* time rounding depends on presence of minute / second / usec fields */
var _dt;
switch(bt) {
case 0: break;
case 1:
/*::if(!dt) break;*/
case 2:
case 3:
if(dt.u >= 0.5) { dt.u = 0; ++dt.S; }
if(dt.S >= 60) { dt.S = 0; ++dt.M; }
if(dt.M >= 60) { dt.M = 0; ++dt.H; }
if(dt.H >= 24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; }
break;
case 2:
/*::if(!dt) break;*/
if(dt.u >= 0.5) { dt.u = 0; ++dt.S; }
case 4:
switch(ss0) {
case 1: dt.u = Math.round(dt.u * 10)/10; break;
case 2: dt.u = Math.round(dt.u * 100)/100; break;
case 3: dt.u = Math.round(dt.u * 1000)/1000; break;
}
if(dt.u >= 1) { dt.u = 0; ++dt.S; }
if(dt.S >= 60) { dt.S = 0; ++dt.M; }
if(dt.M >= 60) { dt.M = 0; ++dt.H; }
if(dt.H >= 24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; }
break;
}

26
packages/dta/.eslintrc Normal file

@ -0,0 +1,26 @@
{
"env": { "shared-node-browser":true },
"globals": {},
"parserOptions": {
"ecmaVersion": 6
},
"plugins": [ "html", "json" ],
"extends": "eslint:recommended",
"rules": {
"comma-style": [ 2, "last" ],
"comma-dangle": [ 2, "never" ],
"curly": 0,
"no-bitwise": 0,
"no-cond-assign": 1,
"no-console": 0,
"no-control-regex": 0,
"no-unused-vars": 1,
"no-empty": 0,
"no-trailing-spaces": 2,
"no-use-before-define": [ 1, {
"functions":false, "classes":true, "variables":false
}],
"no-useless-escape": 0,
"semi": [ 2, "always" ]
}
}

22
packages/dta/Makefile Normal file

@ -0,0 +1,22 @@
## Default target: build both the NodeJS and the browser bundles
.PHONY: build
build: node browser

## NodeJS target (CommonJS bundle written to dist/dta.js)
.PHONY: node
node: dist/dta.js
dist/dta.js: dta.ts
	npx esbuild@0.14.14 dta.ts --bundle --outdir=dist --platform=node

## Tests: `npm test` runs `make test`, which delegates to the NodeJS suite
.PHONY: test
test: test-node

.PHONY: test-node
test-node: dist/dta.js test.js
	npx mocha@2.5.3 test.js

## Browser target (minified bundle exposing the global `DTA`)
.PHONY: browser
browser: dist/dta.min.js
dist/dta.min.js: dta.ts
	npx esbuild@0.14.14 dta.ts --bundle --outfile=dist/dta.min.js --minify --sourcemap --global-name=DTA

9
packages/dta/README.md Normal file

@ -0,0 +1,9 @@
# DTA Data File Codec
Codec for reading Stata .DTA files and generating CSF workbook objects
compatible with the [SheetJS](https://sheetjs.com) library constellation.
DTA datasets can support millions of observations and over 32767 variables.
The codec will truncate data to 1048576 observations and 16384 variables.
<https://docs.sheetjs.com/docs/constellation/dta> includes a live demo.

19
packages/dta/bin/dta2csv.njs Executable file

@ -0,0 +1,19 @@
#!/usr/bin/env node
/* eslint-env node, es6 */
/* dta2csv: parse the DTA file named by the first argument and print the first
 * worksheet as CSV on standard output. */
const DTA = require("../");
const fs = require("fs");

/* Load SheetJS and hand its `utils` to the codec; fail with an actionable
 * message when the library cannot be loaded. */
const XLSX = (() => {
	try {
		const XLSX = require("xlsx");
		DTA.set_utils(XLSX.utils);
		return XLSX;
	} catch(e) {
		throw new Error("Must install the SheetJS file processing library! See https://docs.sheetjs.com/docs/getting-started/installation/nodejs for more details");
	}
})();

/* A missing argument used to surface as an opaque TypeError from readFileSync */
const filename = process.argv[2];
if(!filename) {
	console.error("usage: dta2csv <file.dta>");
	process.exit(1);
}

const buf = fs.readFileSync(filename);
const wb = DTA.parse(buf);
const ws = wb.Sheets[wb.SheetNames[0]];
// translate stub cells to single blanks
ws["!data"].forEach(row => row.forEach(cell => {if(cell.t == "z") {cell.t = "s"; cell.v = " ";}}));
console.log(XLSX.utils.sheet_to_csv(ws));

542
packages/dta/dist/dta.js vendored Normal file

@ -0,0 +1,542 @@
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __markAsModule = (target) => __defProp(target, "__esModule", { value: true });
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __reExport = (target, module2, copyDefault, desc) => {
if (module2 && typeof module2 === "object" || typeof module2 === "function") {
for (let key of __getOwnPropNames(module2))
if (!__hasOwnProp.call(target, key) && (copyDefault || key !== "default"))
__defProp(target, key, { get: () => module2[key], enumerable: !(desc = __getOwnPropDesc(module2, key)) || desc.enumerable });
}
return target;
};
var __toCommonJS = /* @__PURE__ */ ((cache) => {
return (module2, temp) => {
return cache && cache.get(module2) || (temp = __reExport(__markAsModule({}), module2, 1), cache && cache.set(module2, temp), temp);
};
})(typeof WeakMap !== "undefined" ? /* @__PURE__ */ new WeakMap() : 0);
// dta.ts
var dta_exports = {};
__export(dta_exports, {
parse: () => parse,
set_utils: () => set_utils
});
var _utils;
function set_utils(utils) {
_utils = utils;
}
function u8_to_dataview(array) {
return new DataView(array.buffer, array.byteOffset, array.byteLength);
}
function valid_inc(p, n) {
if (p.str.slice(p.ptr, p.ptr + n.length) != n)
return false;
p.ptr += n.length;
return true;
}
function skip_end(p, n) {
const idx = p.str.indexOf(n, p.ptr);
if (idx == -1)
throw new Error(`Expected ${n} after offset ${p.ptr}`);
p.ptr = idx + n.length;
}
function slice_end(p, n) {
const idx = p.str.indexOf(n, p.ptr);
if (idx == -1)
throw new Error(`Expected ${n} after offset ${p.ptr}`);
const raw = p.raw.slice(p.ptr, idx);
const res = {
ptr: 0,
raw,
str: p.str.slice(p.ptr, idx),
dv: u8_to_dataview(raw)
};
p.ptr = idx + n.length;
return res;
}
function read_f64(p, LE) {
p.ptr += 8;
const d = p.dv.getFloat64(p.ptr - 8, LE);
return d > 8988e304 ? null : d;
}
function read_f32(p, LE) {
p.ptr += 4;
const d = p.dv.getFloat32(p.ptr - 4, LE);
return d > 1701e35 ? null : d;
}
function read_u32(p, LE) {
p.ptr += 4;
return p.dv.getUint32(p.ptr - 4, LE);
}
function read_i32(p, LE) {
p.ptr += 4;
const u = p.dv.getInt32(p.ptr - 4, LE);
return u > 2147483620 ? null : u;
}
function read_u16(p, LE) {
p.ptr += 2;
return p.dv.getUint16(p.ptr - 2, LE);
}
function read_i16(p, LE) {
p.ptr += 2;
const u = p.dv.getInt16(p.ptr - 2, LE);
return u > 32740 ? null : u;
}
function read_u8(p) {
return p.raw[p.ptr++];
}
function read_i8(p) {
let u = p.raw[p.ptr++];
u = u < 128 ? u : u - 256;
return u > 100 ? null : u;
}
var SUPPORTED_VERSIONS_TAGGED = [
"117",
"118"
];
function parse_tagged(raw) {
const err = "Not a DTA file";
const str = new TextDecoder("latin1").decode(raw);
const d = {
ptr: 0,
raw,
str,
dv: u8_to_dataview(raw)
};
let vers = 118;
let LE = true;
let nvar = 0, nobs = 0, nobs_lo = 0, nobs_hi = 0;
let label = "", timestamp = "";
const var_types = [];
const var_names = [];
const formats = [];
if (!valid_inc(d, "<stata_dta>"))
throw err;
{
if (!valid_inc(d, "<header>"))
throw err;
{
if (!valid_inc(d, "<release>"))
throw err;
const res = slice_end(d, "</release>");
if (SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1)
throw `Unsupported DTA ${res.str} file`;
vers = +res.str;
}
{
if (!valid_inc(d, "<byteorder>"))
throw err;
const res = slice_end(d, "</byteorder>");
switch (res.str) {
case "MSF":
LE = false;
break;
case "LSF":
LE = true;
break;
default:
throw `Unsupported byteorder ${res.str}`;
}
}
{
if (!valid_inc(d, "<K>"))
throw err;
const res = slice_end(d, "</K>");
nvar = read_u16(res, LE);
}
{
if (!valid_inc(d, "<N>"))
throw err;
const res = slice_end(d, "</N>");
if (vers == 117)
nobs = nobs_lo = read_u32(res, LE);
else {
const lo = read_u32(res, LE), hi = read_u32(res, LE);
nobs = LE ? (nobs_lo = lo) + (nobs_hi = hi) * Math.pow(2, 32) : (nobs_lo = hi) + (nobs_hi = lo) * Math.pow(2, 32);
}
if (nobs > 1e6)
console.error(`More than 1 million observations -- extra rows will be dropped`);
}
{
if (!valid_inc(d, "<label>"))
throw err;
const res = slice_end(d, "</label>");
const w = vers >= 118 ? 2 : 1;
const strlen = w == 1 ? read_u8(res) : read_u16(res, LE);
if (strlen + w != res.str.length)
throw `Expected string length ${strlen} but actual length was ${res.str.length - w}`;
if (strlen > 0)
label = new TextDecoder().decode(res.raw.slice(w));
}
{
if (!valid_inc(d, "<timestamp>"))
throw err;
const res = slice_end(d, "</timestamp>");
const strlen = read_u8(res);
if (strlen + 1 != res.str.length)
throw `Expected string length ${strlen} but actual length was ${res.str.length - 1}`;
if (strlen > 0)
timestamp = res.str.slice(1);
}
if (!valid_inc(d, "</header>"))
throw err;
}
{
if (!valid_inc(d, "<map>"))
throw err;
skip_end(d, "</map>");
}
let stride = 0;
{
if (!valid_inc(d, "<variable_types>"))
throw err;
const res = slice_end(d, "</variable_types>");
if (res.raw.length != 2 * nvar)
throw `Expected variable_types length ${nvar * 2}, found ${res.raw.length}`;
while (res.ptr < res.raw.length) {
const type = read_u16(res, LE);
var_types.push(type);
if (type >= 1 && type <= 2045)
stride += type;
else
switch (type) {
case 32768:
stride += 8;
break;
case 65526:
stride += 8;
break;
case 65527:
stride += 4;
break;
case 65528:
stride += 4;
break;
case 65529:
stride += 2;
break;
case 65530:
stride += 1;
break;
default:
throw `Unsupported field type ${type}`;
}
}
}
{
if (!valid_inc(d, "<varnames>"))
throw err;
const res = slice_end(d, "</varnames>");
const w = vers >= 118 ? 129 : 33;
if (res.raw.length != w * nvar)
throw `Expected variable_types length ${nvar * w}, found ${res.raw.length}`;
while (res.ptr < res.raw.length) {
const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w));
res.ptr += w;
var_names.push(name.replace(/\x00[\s\S]*/, ""));
}
}
{
if (!valid_inc(d, "<sortlist>"))
throw err;
const res = slice_end(d, "</sortlist>");
if (res.raw.length != 2 * nvar + 2)
throw `Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`;
}
{
if (!valid_inc(d, "<formats>"))
throw err;
const res = slice_end(d, "</formats>");
const w = vers >= 118 ? 57 : 49;
if (res.raw.length != w * nvar)
throw `Expected formats length ${nvar * w}, found ${res.raw.length}`;
while (res.ptr < res.raw.length) {
const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w));
res.ptr += w;
formats.push(name.replace(/\x00[\s\S]*/, ""));
}
}
{
if (!valid_inc(d, "<value_label_names>"))
throw err;
const w = vers >= 118 ? 129 : 33;
const res = slice_end(d, "</value_label_names>");
}
{
if (!valid_inc(d, "<variable_labels>"))
throw err;
const w = vers >= 118 ? 321 : 81;
const res = slice_end(d, "</variable_labels>");
}
{
if (!valid_inc(d, "<characteristics>"))
throw err;
while (d.str.slice(d.ptr, d.ptr + 4) == "<ch>") {
d.ptr += 4;
const len = read_u32(d, LE);
d.ptr += len;
if (!valid_inc(d, "</ch>"))
throw err;
}
if (!valid_inc(d, "</characteristics>"))
throw err;
}
const ws = _utils.aoa_to_sheet([var_names], { dense: true });
var ptrs = [];
{
if (!valid_inc(d, "<data>"))
throw err;
for (let R = 0; R < nobs; ++R) {
const row = [];
for (let C = 0; C < nvar; ++C) {
let t = var_types[C];
if (t >= 1 && t <= 2045) {
let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t));
s = s.replace(/\x00[\s\S]*/, "");
row[C] = s;
d.ptr += t;
} else
switch (t) {
case 65526:
row[C] = read_f64(d, LE);
break;
case 65527:
row[C] = read_f32(d, LE);
break;
case 65528:
row[C] = read_i32(d, LE);
break;
case 65529:
row[C] = read_i16(d, LE);
break;
case 65530:
row[C] = read_i8(d);
break;
case 32768:
{
row[C] = "##SheetJStrL##";
ptrs.push([R + 1, C, d.raw.slice(d.ptr, d.ptr + 8)]);
d.ptr += 8;
}
break;
default:
throw `Unsupported field type ${t} for ${var_names[C]}`;
}
}
_utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true });
}
if (!valid_inc(d, "</data>"))
throw err;
}
{
if (!valid_inc(d, "<strls>"))
throw err;
const strl_tbl = [];
while (d.raw[d.ptr] == 71) {
if (!valid_inc(d, "GSO"))
throw err;
const v = read_u32(d, LE);
let o = 0;
if (vers == 117)
o = read_u32(d, LE);
else {
const lo = read_u32(d, LE), hi = read_u32(d, LE);
o = LE ? lo + hi * Math.pow(2, 32) : hi + lo * Math.pow(2, 32);
if (o > 1e6)
console.error(`More than 1 million observations -- data will be dropped`);
}
const t = read_u8(d);
const len = read_u32(d, LE);
if (!strl_tbl[o])
strl_tbl[o] = [];
let str2 = "";
if (t == 129) {
str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len));
d.ptr += len;
} else {
str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)).replace(/\x00$/, "");
d.ptr += len;
}
strl_tbl[o][v] = str2;
}
if (!valid_inc(d, "</strls>"))
throw err;
ptrs.forEach(([R, C, buf]) => {
const dv = u8_to_dataview(buf);
let v = 0, o = 0;
switch (vers) {
case 117:
{
v = dv.getUint32(0, LE);
o = dv.getUint32(4, LE);
}
break;
case 118:
case 120:
{
v = dv.getUint16(0, LE);
const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE);
o = LE ? o1 + o2 * 65536 : o2 + o1 * 2 ** 32;
}
break;
case 119:
case 121: {
const v1 = dv.getUint16(0, LE), v2 = buf[2];
v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8);
const o1 = buf[3], o2 = dv.getUint32(4, LE);
o = LE ? o1 + o2 * 256 : o2 + o1 * 2 ** 32;
}
}
ws["!data"][R][C].v = strl_tbl[o][v];
});
}
{
if (!valid_inc(d, "<value_labels>"))
throw err;
const res = slice_end(d, "</value_labels>");
}
if (!valid_inc(d, "</stata_dta>"))
throw err;
const wb = _utils.book_new();
_utils.book_append_sheet(wb, ws, "Sheet1");
return wb;
}
function parse_legacy(raw) {
let vers = raw[0];
switch (vers) {
case 102:
case 112:
throw `Unsupported DTA ${vers} file`;
case 103:
case 104:
case 105:
case 108:
case 110:
case 111:
case 113:
case 114:
case 115:
break;
default:
throw new Error("Not a DTA file");
}
const d = {
ptr: 1,
raw,
str: "",
dv: u8_to_dataview(raw)
};
let LE = true;
let nvar = 0, nobs = 0;
let label = "", timestamp = "";
const var_types = [];
const var_names = [];
const formats = [];
{
const byteorder = read_u8(d);
switch (byteorder) {
case 1:
LE = false;
break;
case 2:
LE = true;
break;
default:
throw `DTA ${vers} Unexpected byteorder ${byteorder}`;
}
let byte = read_u8(d);
if (byte != 1)
throw `DTA ${vers} Unexpected filetype ${byte}`;
d.ptr++;
nvar = read_u16(d, LE);
nobs = read_u32(d, LE);
d.ptr += vers >= 108 ? 81 : 32;
if (vers >= 105)
d.ptr += 18;
}
{
let C = 0;
for (C = 0; C < nvar; ++C)
var_types.push(read_u8(d));
const w = vers >= 110 ? 33 : 9;
for (C = 0; C < nvar; ++C) {
var_names.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + w)).replace(/\x00[\s\S]*$/, ""));
d.ptr += w;
}
d.ptr += 2 * (nvar + 1);
const fw = vers >= 114 ? 49 : vers >= 105 ? 12 : 7;
for (C = 0; C < nvar; ++C) {
formats.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + fw)).replace(/\x00[\s\S]*$/, ""));
d.ptr += fw;
}
d.ptr += (vers >= 110 ? 33 : 9) * nvar;
}
d.ptr += (vers >= 106 ? 81 : 32) * nvar;
if (vers >= 105)
while (d.ptr < d.raw.length) {
const dt = read_u8(d), len = (vers >= 111 ? read_u32 : read_u16)(d, LE);
if (dt == 0 && len == 0)
break;
d.ptr += len;
}
const ws = _utils.aoa_to_sheet([var_names], { dense: true });
for (let R = 0; R < nobs; ++R) {
const row = [];
for (let C = 0; C < nvar; ++C) {
let t = var_types[C];
if (vers >= 111 && t >= 1 && t <= 244) {
let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t));
s = s.replace(/\x00[\s\S]*/, "");
row[C] = s;
d.ptr += t;
} else
switch (t) {
case 251:
case 98:
row[C] = read_i8(d);
break;
case 252:
case 105:
row[C] = read_i16(d, LE);
break;
case 253:
case 108:
row[C] = read_i32(d, LE);
break;
case 254:
case 102:
row[C] = read_f32(d, LE);
break;
case 255:
case 100:
row[C] = read_f64(d, LE);
break;
default:
throw `Unsupported field type ${t} for ${var_names[C]}`;
}
}
_utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true });
}
const wb = _utils.book_new();
_utils.book_append_sheet(wb, ws, "Sheet1");
return wb;
}
function parse(data) {
if (data[0] >= 102 && data[0] <= 115)
return parse_legacy(data);
if (data[0] === 60)
return parse_tagged(data);
throw new Error("Not a DTA file");
}
module.exports = __toCommonJS(dta_exports);
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
parse,
set_utils
});

2
packages/dta/dist/dta.min.js vendored Normal file

File diff suppressed because one or more lines are too long

543
packages/dta/dta.ts Normal file

@ -0,0 +1,543 @@
import { DenseWorkSheet, WorkBook, type utils } from 'xlsx';
export { parse, set_utils };
let _utils: typeof utils;
/** Set internal instance of `utils`
*
* The parsers in this module build their output through this object
* (`aoa_to_sheet`, `sheet_add_aoa`, `book_new`, `book_append_sheet`), so it
* must be set before `parse` is called.
*
* Usage:
*
* ```js
* const XLSX = require("xlsx");
* const DTA = require("dta");
* DTA.set_utils(XLSX.utils);
* ```
*
* @param utils utils object (stored as-is; no validation is performed)
*/
function set_utils(utils: any): void {
_utils = utils;
}
/** Read cursor over a byte buffer, shared by every reader helper in this file */
interface Payload {
/** Offset of the next unread byte; used as an index into both `raw` and `str` */
ptr: number;
/** Raw data */
raw: Uint8Array;
/** `raw` decoded with the `latin1` TextDecoder label (tagged parser) so tags can be searched as text; the legacy parser leaves it empty */
str: string;
/** DataView over `raw`, used for multi-byte numeric reads */
dv: DataView;
}
/** Wrap the exact byte window of `array` in a DataView (shares memory, no copy) */
function u8_to_dataview(array: Uint8Array): DataView {
	const { buffer, byteOffset, byteLength } = array;
	return new DataView(buffer, byteOffset, byteLength);
}
/** Consume the literal `n` when it sits exactly at the cursor.
 *
 * @returns true (cursor moved past `n`) on a match, false (cursor untouched) otherwise
 */
function valid_inc(p: Payload, n: string): boolean {
	const stop = p.ptr + n.length;
	const matched = p.str.slice(p.ptr, stop) == n;
	if(matched) p.ptr = stop;
	return matched;
}
/** Move the cursor to just past the next occurrence of `n` at or after it.
 *
 * @throws Error when `n` does not occur at or after the cursor
 */
function skip_end(p: Payload, n: string): void {
	const at = p.str.indexOf(n, p.ptr);
	if(at >= 0) {
		p.ptr = at + n.length;
		return;
	}
	throw new Error(`Expected ${n} after offset ${p.ptr}`);
}
/** Cut out everything between the cursor and the next occurrence of `n`.
 *
 * The result is a fresh Payload (cursor at 0) over a copy of those bytes; the
 * cursor of `p` is left just past `n`.
 *
 * @throws Error when `n` does not occur at or after the cursor
 */
function slice_end(p: Payload, n: string): Payload {
	const start = p.ptr;
	const stop = p.str.indexOf(n, start);
	if(stop < 0) throw new Error(`Expected ${n} after offset ${p.ptr}`);
	const bytes = p.raw.slice(start, stop);
	const inner: Payload = { ptr: 0, raw: bytes, str: p.str.slice(start, stop), dv: u8_to_dataview(bytes) };
	p.ptr = stop + n.length;
	return inner;
}
/** Read an 8-byte float at the cursor and advance the cursor by 8.
 *
 * Stata encodes missing values as doubles at or above 2^1023; those are
 * reported as `null`.  The previous cutoff (8.988e+307) was slightly below
 * 2^1023 and misreported the largest valid doubles as missing.
 *
 * @param LE true for little-endian byte order
 * @returns the value, or `null` for a Stata missing value
 */
function read_f64(p: Payload, LE: boolean): number | null {
	p.ptr += 8;
	const d = p.dv.getFloat64(p.ptr - 8, LE);
	return d >= 2 ** 1023 ? null : d;
}
/** Read a 4-byte float at the cursor and advance the cursor by 4.
 *
 * Stata encodes missing values as floats at or above 2^127; those are
 * reported as `null`.  The previous cutoff (1.701e+38) was slightly below
 * 2^127 and misreported the largest valid floats as missing.
 *
 * @param LE true for little-endian byte order
 * @returns the value, or `null` for a Stata missing value
 */
function read_f32(p: Payload, LE: boolean): number | null {
	p.ptr += 4;
	const d = p.dv.getFloat32(p.ptr - 4, LE);
	return d >= 2 ** 127 ? null : d;
}
/** Read an unsigned 32-bit integer at the cursor and advance the cursor by 4 */
function read_u32(p: Payload, LE: boolean) {
	const at = p.ptr;
	p.ptr = at + 4;
	return p.dv.getUint32(at, LE);
}
/** Read a signed 32-bit integer at the cursor and advance the cursor by 4.
 *
 * @returns the value, or `null` when it exceeds 0x7fffffe4 (treated as missing)
 */
function read_i32(p: Payload, LE: boolean): number | null {
	const at = p.ptr;
	p.ptr = at + 4;
	const val = p.dv.getInt32(at, LE);
	if(val > 0x7fffffe4) return null;
	return val;
}
/** Read an unsigned 16-bit integer at the cursor and advance the cursor by 2 */
function read_u16(p: Payload, LE: boolean) {
	const at = p.ptr;
	p.ptr = at + 2;
	return p.dv.getUint16(at, LE);
}
/** Read a signed 16-bit integer at the cursor and advance the cursor by 2.
 *
 * @returns the value, or `null` when it exceeds 32740 (treated as missing)
 */
function read_i16(p: Payload, LE: boolean): number | null {
	const at = p.ptr;
	p.ptr = at + 2;
	const val = p.dv.getInt16(at, LE);
	if(val > 32740) return null;
	return val;
}
/** Read the byte at the cursor and advance the cursor by 1 */
function read_u8(p: Payload) {
	const byte = p.raw[p.ptr];
	p.ptr += 1;
	return byte;
}
/** Read the byte at the cursor as a signed 8-bit integer and advance the cursor by 1.
 *
 * @returns the value, or `null` when it exceeds 100 (treated as missing)
 */
function read_i8(p: Payload): number | null {
	let val = p.raw[p.ptr];
	p.ptr += 1;
	/* fold 128..255 onto -128..-1 */
	if(!(val < 128)) val -= 256;
	if(val > 100) return null;
	return val;
}
/* Contents of <release> accepted by parse_tagged; any other value is rejected
 * with an "Unsupported DTA" error.  Commented-out entries are not enabled. */
const SUPPORTED_VERSIONS_TAGGED = [
"117", // stata 13
"118", // stata 14-18
// "119", // stata 15/16/17/18 (> 32767 variables)
// "120", // stata 18 (<= 32767, with aliases)
// "121", // stata 18 (> 32767, with aliases)
];
/** Parse a tagged DTA file (one starting with `<stata_dta>`; releases listed
 * in SUPPORTED_VERSIONS_TAGGED).
 *
 * Sections are consumed strictly in file order.  The result is a workbook with
 * a single dense worksheet "Sheet1": row 0 holds the variable names and each
 * following row holds one observation.  Numeric readers yield `null` for
 * missing values.  Labels, timestamps, formats, sort order, characteristics
 * and value labels are validated and/or skipped but not surfaced.
 *
 * Uses the `utils` instance registered through `set_utils`.
 *
 * @param raw File data
 * @throws string for structural problems ("Not a DTA file", "Unsupported ...",
 *         "Expected ...") and Error when a closing tag cannot be found
 */
function parse_tagged(raw: Uint8Array): WorkBook {
	const err = ("Not a DTA file");
	/* decode one character per byte so that string offsets used for tag
	   matching line up with byte offsets in `raw` */
	const str = new TextDecoder('latin1').decode(raw);
	const d: Payload = {
		ptr: 0,
		raw,
		str,
		dv: u8_to_dataview(raw)
	};

	let vers: number = 118;
	let LE: boolean = true;
	let nvar: number = 0, nobs: number = 0;
	/* parsed for validation only; not yet surfaced in the workbook */
	let label: string = "", timestamp: string = "";
	const var_types: number[] = [];
	const var_names: string[] = [];
	const formats: string[] = [];

	/* 5. Dataset format definition */
	if(!valid_inc(d, "<stata_dta>")) throw err;

	/* 5.1 Header <header> */
	{
		if(!valid_inc(d, "<header>")) throw err;

		/* <release> */
		{
			if(!valid_inc(d, "<release>")) throw err;
			const res = slice_end(d, "</release>");
			if(SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1) throw (`Unsupported DTA ${res.str} file`);
			vers = +res.str;
		}

		/* <byteorder> */
		{
			if(!valid_inc(d, "<byteorder>")) throw err;
			const res = slice_end(d, "</byteorder>");
			switch(res.str) {
				case "MSF": LE = false; break;
				case "LSF": LE = true; break;
				default: throw (`Unsupported byteorder ${res.str}`);
			}
		}

		/* <K> number of variables */
		{
			if(!valid_inc(d, "<K>")) throw err;
			const res = slice_end(d, "</K>");
			nvar = read_u16(res, LE);
		}

		/* <N> number of observations: 4 bytes in 117, 8 bytes afterwards */
		{
			if(!valid_inc(d, "<N>")) throw err;
			const res = slice_end(d, "</N>");
			if(vers == 117) nobs = read_u32(res, LE);
			else {
				/* assemble the 64-bit count from two 32-bit halves in file order */
				const lo = read_u32(res, LE), hi = read_u32(res, LE);
				nobs = LE ? (lo + hi * Math.pow(2,32)) : (hi + lo * Math.pow(2,32));
			}
			if(nobs > 1e6) console.error(`More than 1 million observations -- extra rows will be dropped`);
		}

		/* <label> length-prefixed (1 byte in 117, 2 bytes in 118) */
		{
			if(!valid_inc(d, "<label>")) throw err;
			const res = slice_end(d, "</label>");
			const w = vers >= 118 ? 2 : 1;
			const strlen = w == 1 ? read_u8(res) : read_u16(res, LE);
			if(strlen + w != res.str.length) throw (`Expected string length ${strlen} but actual length was ${res.str.length - w}`);
			if(strlen > 0) label = new TextDecoder().decode(res.raw.slice(w));
		}

		/* <timestamp> 1-byte length prefix */
		{
			if(!valid_inc(d, "<timestamp>")) throw err;
			const res = slice_end(d, "</timestamp>");
			const strlen = read_u8(res);
			if(strlen + 1 != res.str.length) throw (`Expected string length ${strlen} but actual length was ${res.str.length - 1}`);
			if(strlen > 0) timestamp = res.str.slice(1);
		}

		if(!valid_inc(d, "</header>")) throw err;
	}

	/* 5.2 Map <map> */
	{
		/* TODO: validate map? */
		if(!valid_inc(d, "<map>")) throw err;
		/* 14 8-byte offsets for:
			<stata_data>
			<map>
			<variable_types>
			<varnames>
			<sortlist>
			<formats>
			<value_label_names>
			<variable_labels>
			<characteristics>
			<data>
			<strls>
			<value_labels>
			</stata_data>
			EOF
		*/
		skip_end(d, "</map>");
	}

	/* byte width of one observation; currently informational only, but the
	   switch below also rejects unknown type codes up front */
	let stride = 0;

	/* 5.3 Variable types <variable_types> */
	{
		if(!valid_inc(d, "<variable_types>")) throw err;
		const res = slice_end(d, "</variable_types>");
		if(res.raw.length != 2 * nvar) throw (`Expected variable_types length ${nvar * 2}, found ${res.raw.length}`);
		while(res.ptr < res.raw.length) {
			const type = read_u16(res, LE);
			var_types.push(type);
			/* 1..2045 is a fixed-width string whose width equals the type code */
			if(type >= 1 && type <= 2045) stride += type;
			else switch(type) {
				case 32768: stride += 8; break;
				case 65526: stride += 8; break;
				case 65527: stride += 4; break;
				case 65528: stride += 4; break;
				case 65529: stride += 2; break;
				case 65530: stride += 1; break;
				default: throw (`Unsupported field type ${type}`);
			}
		}
	}

	/* 5.4 Variable names <varnames> */
	{
		if(!valid_inc(d, "<varnames>")) throw err;
		const res = slice_end(d, "</varnames>");
		const w = vers >= 118 ? 129 : 33;
		/* message previously said "variable_types" (copy-paste from 5.3) */
		if(res.raw.length != w * nvar) throw (`Expected varnames length ${nvar * w}, found ${res.raw.length}`);
		while(res.ptr < res.raw.length) {
			const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w));
			res.ptr += w;
			/* fixed-width field: keep text before the first NUL */
			var_names.push(name.replace(/\x00[\s\S]*/,""));
		}
	}

	/* 5.5 Sort order of observations <sortlist> */
	{
		/* TODO: check sort list? */
		if(!valid_inc(d, "<sortlist>")) throw err;
		const res = slice_end(d, "</sortlist>");
		if(res.raw.length != 2 * nvar + 2) throw (`Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`);
	}

	/* 5.6 Display formats <formats> */
	{
		if(!valid_inc(d, "<formats>")) throw err;
		const res = slice_end(d, "</formats>");
		const w = vers >= 118 ? 57 : 49;
		if(res.raw.length != w * nvar) throw (`Expected formats length ${nvar * w}, found ${res.raw.length}`);
		while(res.ptr < res.raw.length) {
			const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w));
			res.ptr += w;
			formats.push(name.replace(/\x00[\s\S]*/,""));
		}
	}

	/* TODO: <value_label_names> (skipped) */
	{
		if(!valid_inc(d, "<value_label_names>")) throw err;
		slice_end(d, "</value_label_names>");
	}

	/* TODO: <variable_labels> (skipped) */
	{
		if(!valid_inc(d, "<variable_labels>")) throw err;
		slice_end(d, "</variable_labels>");
	}

	/* 5.9 Characteristics <characteristics> */
	{
		if(!valid_inc(d, "<characteristics>")) throw err;
		/* each <ch> is length-prefixed, so step over it by length instead of
		   searching for the closing tag */
		while(d.str.slice(d.ptr, d.ptr + 4) == "<ch>") {
			d.ptr += 4;
			const len = read_u32(d, LE);
			d.ptr += len;
			if(!valid_inc(d, "</ch>")) throw err;
		}
		if(!valid_inc(d, "</characteristics>")) throw err;
	}

	const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet);
	/* strL cells to patch once <strls> is read: [worksheet row, column, 8 raw pointer bytes] */
	const ptrs: Array<[number, number, Uint8Array]> = [];

	/* 5.10 Data <data> */
	{
		if(!valid_inc(d, "<data>")) throw err;
		for(let R = 0; R < nobs; ++R) {
			const row: any[] = [];
			for(let C = 0; C < nvar; ++C) {
				const t = var_types[C];
				// TODO: formats, dta_12{0,1} aliases?
				if(t >= 1 && t <= 2045) {
					/* NOTE: dta_117 restricts strf to ASCII */
					let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t));
					s = s.replace(/\x00[\s\S]*/,"");
					row[C] = s;
					d.ptr += t;
				} else switch(t) {
					case 65526: row[C] = read_f64(d, LE); break;
					case 65527: row[C] = read_f32(d, LE); break;
					case 65528: row[C] = read_i32(d, LE); break;
					case 65529: row[C] = read_i16(d, LE); break;
					case 65530: row[C] = read_i8(d); break;
					case 32768: {
						/* placeholder string cell; real text is filled in after <strls> */
						row[C] = "##SheetJStrL##";
						ptrs.push([R+1,C, d.raw.slice(d.ptr, d.ptr + 8)]);
						d.ptr += 8;
					} break;
					default: throw (`Unsupported field type ${t} for ${var_names[C]}`);
				}
			}
			_utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true});
		}
		if(!valid_inc(d, "</data>")) throw err;
	}

	/* 5.11 StrLs <strls> */
	{
		if(!valid_inc(d, "<strls>")) throw err;
		/* strl_tbl[o][v] = text of the strL for observation o, variable v */
		const strl_tbl: string[][] = [];
		while(d.raw[d.ptr] == 71 /* G */) {
			if(!valid_inc(d, "GSO")) throw err;
			const v = read_u32(d, LE);
			let o = 0;
			if(vers == 117) o = read_u32(d, LE);
			else {
				const lo = read_u32(d, LE), hi = read_u32(d, LE);
				o = LE ? (lo + hi * Math.pow(2,32)) : (hi + lo * Math.pow(2,32));
				if(o > 1e6) console.error(`More than 1 million observations -- data will be dropped`);
			}
			const t = read_u8(d);
			const len = read_u32(d, LE);
			if(!strl_tbl[o]) strl_tbl[o] = [];
			// TODO: codepage
			let text = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len));
			/* only type 129 keeps its bytes verbatim; other types drop one trailing NUL */
			if(t != 129) text = text.replace(/\x00$/,"");
			d.ptr += len;
			strl_tbl[o][v] = text;
		}
		if(!valid_inc(d, "</strls>")) throw err;

		ptrs.forEach(([R,C,buf]) => {
			const dv = u8_to_dataview(buf);
			let v = 0, o = 0;
			switch(vers) {
				case 117: { // v(4) o(4)
					v = dv.getUint32(0, LE);
					o = dv.getUint32(4, LE);
				} break;
				case 118: case 120: { // v(2) o(6)
					v = dv.getUint16(0, LE);
					const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE);
					o = LE ? o1 + o2 * 65536 : o2 + o1 * (2**32);
				} break;
				case 119: case 121: { // v(3) o(5)
					const v1 = dv.getUint16(0, LE), v2 = buf[2];
					v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8);
					const o1 = buf[3], o2 = dv.getUint32(4, LE);
					o = LE ? o1 + o2 * 256 : o2 + o1 * (2**32);
				}
			}
			/* A pointer with no matching GSO entry -- notably (v,o) = (0,0),
			   the empty strL -- used to throw a TypeError on strl_tbl[o][v];
			   resolve it to the empty string instead. */
			const by_obs = strl_tbl[o];
			const text = by_obs ? by_obs[v] : undefined;
			ws["!data"][R][C].v = text != null ? text : "";
		});
	}

	/* 5.12 Value labels <value_labels> (skipped) */
	{
		if(!valid_inc(d, "<value_labels>")) throw err;
		slice_end(d, "</value_labels>");
	}

	if(!valid_inc(d, "</stata_dta>")) throw err;

	const wb = _utils.book_new();
	_utils.book_append_sheet(wb, ws, "Sheet1");
	return wb;
}
/** Parse a legacy DTA file, i.e. one whose first byte is the format version.
 *
 * Versions 102 and 112 are rejected as unsupported; any version byte not
 * listed in the switch below is reported as "Not a DTA file".  The result is a
 * workbook with a single dense worksheet "Sheet1": row 0 holds the variable
 * names and each following row holds one observation.
 *
 * The data label, time stamp, sort list, value-label names, variable labels
 * and expansion fields are stepped over; display formats are read into
 * `formats` but not used afterwards.
 *
 * Uses the `utils` instance registered through `set_utils`.
 *
 * @param raw File data
 * @throws string for unsupported versions / byte order / file type / field
 *         types, and Error("Not a DTA file") for an unknown version byte
 */
function parse_legacy(raw: Uint8Array): WorkBook {
let vers: number = raw[0];
switch(vers) {
case 102: // stata 1
case 112: // stata 8/9
throw (`Unsupported DTA ${vers} file`);
case 103: // stata 2/3
case 104: // stata 4
case 105: // stata 5
case 108: // stata 6
case 110: // stata 7
case 111: // stata 7
case 113: // stata 8/9
case 114: // stata 10/11
case 115: // stata 12
break;
default: throw new Error("Not a DTA file");
}
// cursor starts at 1: byte 0 (the version) has already been consumed.
// `str` is left empty because this parser never searches for text tags.
const d: Payload = {
ptr: 1,
raw,
str:"",
dv: u8_to_dataview(raw)
}
let LE: boolean = true;
let nvar: number = 0, nobs: number = 0;
// NOTE(review): label and timestamp are declared but never assigned in this function
let label: string = "", timestamp: string = "";
const var_types: number[] = [];
const var_names: string[] = [];
const formats: string[] = [];
/* 5.1 Header */
{
// byteorder: 1 selects big-endian reads, 2 selects little-endian reads
const byteorder = read_u8(d);
switch(byteorder) {
case 1: LE = false; break;
case 2: LE = true; break;
default: throw (`DTA ${vers} Unexpected byteorder ${byteorder}`);
}
let byte = read_u8(d);
if(byte != 1) throw (`DTA ${vers} Unexpected filetype ${byte}`);
// NOTE: dta_105 technically supports filetype 2
d.ptr++; // "unused"
nvar = read_u16(d, LE);
nobs = read_u32(d, LE);
d.ptr += (vers >= 108 ? 81 : 32); // TODO: data_label
if(vers >= 105) d.ptr += 18; // TODO: time_stamp
}
/* 5.2 Descriptors */
{
let C = 0;
// typlist
// one type byte per variable
for(C = 0; C < nvar; ++C) var_types.push(read_u8(d));
// varlist
// fixed-width names (33 bytes from version 110, 9 before); text after the first NUL is dropped
const w = vers >= 110 ? 33 : 9;
for(C = 0; C < nvar; ++C) {
var_names.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + w)).replace(/\x00[\s\S]*$/,""));
d.ptr += w;
}
// srtlist
// skipped: 2 bytes for each of nvar + 1 entries
d.ptr += 2*(nvar + 1);
// fmtlist
// fixed-width format strings; the width depends on the version
const fw = (vers >= 114 ? 49 : vers >= 105 ? 12 : 7);
for(C = 0; C < nvar; ++C) {
formats.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + fw)).replace(/\x00[\s\S]*$/,""));
d.ptr += fw;
}
// lbllist
// skipped: same per-entry width as varlist
d.ptr += (vers >= 110 ? 33 : 9) * nvar;
}
/* 5.3 Variable labels */
// TODO: should these names be used in the worksheet?
d.ptr += (vers >= 106 ? 81 : 32) * nvar;
/* 5.4 Expansion fields */
// each field is a 1-byte type plus a length (4 bytes from version 111, 2 before)
// followed by `len` bytes; a field with type 0 and length 0 ends the list
if(vers >= 105) while(d.ptr < d.raw.length) {
const dt = read_u8(d), len = (vers >= 111 ? read_u32 : read_u16)(d, LE);
if(dt == 0 && len == 0) break;
d.ptr += len;
}
// row 0 of the worksheet is the list of variable names
const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet);
/* 5.5 Data */
for(let R = 0; R < nobs; ++R) {
const row: any[] = [];
for(let C = 0; C < nvar; ++C) {
let t = var_types[C];
// TODO: data type processing
// from version 111, a type byte of 1..244 is a fixed-width string of that many bytes;
// in earlier versions such type bytes fall through to the switch below
if(vers >= 111 && t >= 1 && t <= 244) {
/* NOTE: dta_117 restricts strf to ASCII */
let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t));
s = s.replace(/\x00[\s\S]*/,"");
row[C] = s;
d.ptr += t;
} else switch(t) {
// numeric types: each case accepts two type codes (251-255 or an ASCII letter)
case 251: case 0x62: row[C] = read_i8(d); break; // byte
case 252: case 0x69: row[C] = read_i16(d, LE); break; // int
case 253: case 0x6c: row[C] = read_i32(d, LE); break; // long
case 254: case 0x66: row[C] = read_f32(d, LE); break; // float
case 255: case 0x64: row[C] = read_f64(d, LE); break; // double
default: throw (`Unsupported field type ${t} for ${var_names[C]}`);
}
}
// origin -1 appends the row after the existing rows
// NOTE(review): the readers return null for missing values; with sheetStubs these presumably become stub cells -- confirm against xlsx
_utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true});
}
/* 5.6 Value labels */
// EOF or labels
// (not read: parsing stops after the data section)
const wb: WorkBook = _utils.book_new();
_utils.book_append_sheet(wb, ws, "Sheet1");
return wb;
}
/** Parse DTA file
 *
 * Dispatches on the first byte: `<` (60) selects the tagged parser, a value
 * from 102 through 115 selects the legacy parser.
 *
 * NOTE: In NodeJS, `Buffer` extends `Uint8Array`
 *
 * @param data File data
 * @throws Error("Not a DTA file") when the first byte matches neither layout
 */
function parse(data: Uint8Array): WorkBook {
	const lead = data[0];
	if(lead === 60) return parse_tagged(data);
	if(lead >= 102 && lead <= 115) return parse_legacy(data);
	throw new Error("Not a DTA file");
}

36
packages/dta/package.json Normal file

@ -0,0 +1,36 @@
{
"name": "dta",
"version": "0.0.1",
"author": "sheetjs",
"description": "Stata .dta codecs for SheetJS Common Spreadsheet Format",
"bin": {
"dta2csv": "./bin/dta2csv.njs"
},
"main": "dist/dta.js",
"types": "types",
"files": [
"dist/"
],
"repository": {
"type": "git",
"url": "https://git.sheetjs.com/SheetJS/sheetjs",
"directory": "packages/dta"
},
"scripts": {
"test": "make test",
"build": "make",
"lint": "make fullint",
"dtslint": "dtslint types"
},
"homepage": "https://sheetjs.com/",
"bugs": {
"url": "https://git.sheetjs.com/SheetJS/sheetjs/issues"
},
"license": "Apache-2.0",
"engines": {
"node": ">=12.0"
},
"devDependencies": {
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.0/xlsx-0.20.0.tgz"
}
}

32
packages/dta/test.js Normal file

@ -0,0 +1,32 @@
/* eslint-env mocha, node, es6 */
const fs = require("fs"), assert = require("assert");
const DTA = require("./");
const XLSX = require("xlsx");
DTA.set_utils(XLSX.utils);
const test_folders = [
"test_files"
];
for(let tF of test_folders) describe(tF, () => {
const test_files = fs.readdirSync(tF);
for(let tf of test_files) {
if(tf.endsWith("csv")) it(`${tf.replace(".csv", "")} [CSV]`, () => {
const buf = fs.readFileSync(`${tF}/${tf.replace(".csv", "")}`);
const wb = DTA.parse(buf);
assert(wb.SheetNames.length > 0);
/* stata will represent unspecified values as single spaces */
wb.Sheets[wb.SheetNames[0]]["!data"].forEach(row => row.forEach(cell => {if(cell.t == "z") {cell.t = "s"; cell.v = " ";}}));
const csvstr = XLSX.utils.sheet_to_csv(wb.Sheets[wb.SheetNames[0]]);
const baseline = fs.readFileSync(`${tF}/${tf}`, "utf8").replace(/[\r\n]+/g,"\n");
assert.equal(csvstr.trim(), baseline.trim());
});
if(!tf.endsWith("dta")) continue;
it(tf, () => {
const buf = fs.readFileSync(`${tF}/${tf}`);
const wb = DTA.parse(buf);
assert(wb.SheetNames.length > 0);
});
}
});

23
packages/dta/types/index.d.ts vendored Normal file

@ -0,0 +1,23 @@
import type { WorkBook } from "xlsx";
/** Set internal instance of `utils`
*
* Call this once before `parse`.
*
* Usage:
*
* ```js
* const XLSX = require("xlsx");
* const DTA = require("dta");
* DTA.set_utils(XLSX.utils);
* ```
*
* @param utils utils object
*/
export function set_utils(utils: any): void;
/** Parse DTA file
*
* NOTE: In NodeJS, `Buffer` extends `Uint8Array`
*
* @param data File data
* @returns workbook generated from the file data
*/
export function parse(data: Uint8Array): WorkBook