Merge pull request 'demo-postgresql-improved-type-detection' (#1) from demo-postgresql-improved-type-detection into master

Reviewed-on: asadbek064/docs.sheetjs.com#1
This commit is contained in:
Asad Karimov 2024-12-03 20:44:34 +00:00
commit 62572b9648
2 changed files with 268 additions and 120 deletions

@ -36,6 +36,7 @@ This demo was tested in the following environments:
| Postgres | Connector Library | Date |
|:---------|:------------------|:-----------|
| `16.6.1` | `pg` (`8.13.1`) | 2024-12-03 |
| `16.2.1` | `pg` (`8.11.4`) | 2024-03-31 |
| `15.6` | `pg` (`8.11.4`) | 2024-03-31 |
| `14.11` | `pg` (`8.11.4`) | 2024-03-31 |
@ -129,80 +130,139 @@ for(let row of aoo) {
### Creating a Table
The array of objects can be scanned to determine column names and types. With
the names and types, a `CREATE TABLE` query can be written.
The worksheet can be scanned to determine column names and types. With the names and types, a `CREATE TABLE` query can be written.
<details>
<summary><b>Implementation Details</b> (click to show)</summary>
The `aoo_to_pg_table` function:
The `sheet_to_pg_table` function:
- scans each row object to determine column names and types
- scans worksheet cells to determine column names and types
- drops and creates a new table with the determined column names and types
- loads the entire dataset into the new table
```js
/* create table and load data given an array of objects and a PostgreSQL client */
async function aoo_to_pg_table(client, aoo, table_name) {
/* define types that can be converted (e.g. boolean can be stored in float) */
const T_FLOAT = ["float8", "boolean"];
const T_BOOL = ["boolean"];
/* create table and load data given a worksheet and a PostgreSQL client */
/**
 * Drop and recreate `tableName`, then insert every data row of `worksheet`.
 *
 * The first row of the worksheet range supplies the column names. Column
 * types are chosen by `deduceType` and cell values are converted by
 * `parseValue`.
 *
 * @param {object} client - connected client exposing `query(sql)`
 * @param {object} worksheet - SheetJS worksheet object
 * @param {string} tableName - name of the table to (re)create
 * @returns {Promise<void>} resolves after the last row is inserted; resolves
 *   immediately, without touching the database, if the sheet has no `!ref`
 */
async function sheet_to_pg_table(client, worksheet, tableName) {
  if (!worksheet['!ref']) return;

  const range = XLSX.utils.decode_range(worksheet['!ref']);
  const headerRow = range.s.r;
  const firstCol = range.s.c;
  const columnCount = range.e.c - firstCol + 1;

  /* `idx` is the 0-based offset from the first column of the range, so the
     lookup stays correct when the range does not start at column A */
  const cellAt = (row, idx) =>
    worksheet[XLSX.utils.encode_cell({ r: row, c: firstCol + idx })];

  /* Extract headers from first row, clean names for PostgreSQL */
  const headers = [];
  const usedNames = new Set();
  for (let idx = 0; idx < columnCount; idx++) {
    const cell = cellAt(headerRow, idx);
    const raw = cell && cell.v != null ? String(cell.v) : '';
    const cleaned = raw.replace(/[^a-zA-Z0-9_]/g, '_').toLowerCase();
    const base = cleaned === '' ? `column_${firstCol + idx + 1}` : cleaned;

    /* two headers can clean to the same name ("A B" and "A-B"); add a numeric
       suffix so CREATE TABLE does not fail on a duplicate column */
    let name = base;
    for (let n = 2; usedNames.has(name); n++) name = `${base}_${n}`;
    usedNames.add(name);
    headers.push(name);
  }

  /* Collect the data rows once; each row is an array of cells by column offset */
  const dataRows = [];
  for (let row = headerRow + 1; row <= range.e.r; row++) {
    dataRows.push(headers.map((_, idx) => cellAt(row, idx)));
  }

  /* Deduce PostgreSQL type for each column (parallel to `headers`) */
  const types = headers.map((_, idx) =>
    deduceType(dataRows.map(cells => cells[idx]))
  );

  /* Delete table if it exists in the DB */
  await client.query(format('DROP TABLE IF EXISTS %I', tableName));

  /* Create table */
  const columnDefs = headers
    .map((header, idx) => format('%I %s', header, types[idx]))
    .join(', ');
  await client.query(format('CREATE TABLE %I (%s)', tableName, columnDefs));

  /* Insert data row by row. The template holds only placeholders and is
     formatted once, so a `%` in an identifier is never re-interpreted */
  const identifierSlots = headers.map(() => '%I').join(', ');
  const literalSlots = headers.map(() => '%L').join(', ');
  const insertTemplate =
    `INSERT INTO %I (${identifierSlots}) VALUES (${literalSlots})`;
  for (const cells of dataRows) {
    const values = cells.map((cell, idx) => parseValue(cell, types[idx]));
    await client.query(
      format.withArray(insertTemplate, [tableName, ...headers, ...values])
    );
  }
}
/**
 * Deduce a PostgreSQL column type from the worksheet cells of one column.
 *
 * Rules, applied in order:
 * - no usable cells                      -> 'text'
 * - at least one date cell               -> 'date'
 * - only boolean cells                   -> 'boolean'
 * - only numbers, or strings that parse as numbers after removing
 *   `,` `$` `%` `(` `)` and whitespace   -> 'numeric' or 'double precision'
 * - anything else                        -> 'text'
 *
 * Missing cells, cells without a value and blank strings are ignored.
 *
 * @param {Array<object|undefined>} cells - cell objects; `t`, `v` and `w` are read
 * @returns {string} PostgreSQL type name
 */
function deduceType(cells) {
  if (!cells || cells.length === 0) return 'text';

  /* a blank string carries no type information; skip it like an empty cell */
  const isBlankText = cell => typeof cell.v === 'string' && cell.v.trim() === '';
  const nonEmptyCells = cells.filter(
    cell => cell && cell.v != null && !isBlankText(cell)
  );
  if (nonEmptyCells.length === 0) return 'text';

  // Check for dates by looking at both cell type and formatted value
  const dateText = /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/;
  const isDateCell = cell =>
    cell.t === 'd' || (cell.t === 'n' && cell.w && dateText.test(cell.w));
  if (nonEmptyCells.some(isDateCell)) { return 'date'; }

  // Check for booleans
  if (nonEmptyCells.every(cell => cell.t === 'b')) { return 'boolean'; }

  // Check for numbers
  /* A string that is empty after stripping (e.g. "$") is NOT a number:
     Number('') is 0 and isNaN('') is false, so it must be rejected explicitly.
     Number() is used for both the check and the conversion so they agree. */
  const toNumber = cell => {
    if (cell.t === 'n') return cell.v;
    if (cell.t !== 's') return NaN;
    const cleaned = String(cell.v).replace(/[,$\s%()]/g, '');
    return cleaned === '' ? NaN : Number(cleaned);
  };
  const numbers = nonEmptyCells.map(toNumber);
  const allNumbers = nonEmptyCells.every(
    (cell, idx) => cell.t === 'n' || !Number.isNaN(numbers[idx])
  );

  if (allNumbers) {
    /* exponent notation, more than 6 fractional digits, or magnitude above
       1e15 selects 'numeric' instead of 'double precision' */
    const needsPrecision = numbers.some(num => {
      const str = num.toString();
      return str.includes('e') ||
        (str.includes('.') && str.split('.')[1].length > 6) ||
        Math.abs(num) > 1e15;
    });
    return needsPrecision ? 'numeric' : 'double precision';
  }

  return 'text'; // default to string type
}
/**
 * Convert a worksheet cell to a value suitable for a column of `type`.
 *
 * @param {object|undefined} cell - cell object; `t` and `v` are read
 * @param {string} type - 'date', 'numeric', 'double precision', 'boolean',
 *   or anything else (treated as text)
 * @returns {string|number|boolean|null} the converted value, or null when the
 *   cell is missing, has no value, or cannot be represented in `type`
 */
function parseValue(cell, type) {
  if (!cell || cell.v == null) return null;

  /* toISOString throws RangeError on an invalid Date; report null instead */
  const toISODate = date =>
    Number.isNaN(date.getTime()) ? null : date.toISOString().split('T')[0];

  switch (type) {
    case 'date':
      if (cell.t === 'd') { return toISODate(cell.v); }
      if (cell.t === 'n') {
        /* numeric date code: 25569 is subtracted to reach the Unix epoch,
           then days are converted to milliseconds */
        return toISODate(new Date((cell.v - 25569) * 86400 * 1000));
      }
      return null;
    case 'numeric':
    case 'double precision':
      if (cell.t === 'n') return cell.v;
      if (cell.t === 's') {
        const cleaned = cell.v.replace(/[,$\s%()]/g, '');
        /* "" is not a number even though isNaN('') is false */
        if (cleaned === '') return null;
        /* Number() for both check and conversion, so the two cannot disagree */
        const num = Number(cleaned);
        return Number.isNaN(num) ? null : num;
      }
      return null;
    case 'boolean':
      return cell.t === 'b' ? cell.v : null;
    default:
      return String(cell.v);
  }
}
```
@ -230,6 +290,22 @@ Or, if you don't want/need a background service you can just run:
LC_ALL="C" /usr/local/opt/postgresql@16/bin/postgres -D /usr/local/var/postgresql@16
```
On Debian-based Linux distributions such as Ubuntu, install `postgresql` by running the following script:
```bash
echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt update
sudo apt install postgresql-16
sudo systemctl start postgresql
# Optional: Create user with password
sudo -u postgres createuser -P $USER
sudo -u postgres psql -c "ALTER USER $USER WITH SUPERUSER;"
```
If you run the optional user creation steps above, a PostgreSQL password is required.[^69]
Run the command to start a local database instance.
</details>
@ -238,8 +314,13 @@ Run the command to start a local database instance.
```bash
dropdb SheetJSPG
# Ubuntu/Debian
sudo -i -u postgres dropdb SheetJSPG
```
[^69]: PostgreSQL on Linux uses [SCRAM authentication by default, which requires a password](https://www.postgresql.org/docs/current/auth-password.html)
:::info pass
If the server is running elsewhere, or if the username is different from the
@ -257,6 +338,9 @@ current user, command-line flags can override the defaults.
```bash
createdb SheetJSPG
# Ubuntu/Debian
sudo -i -u postgres createdb SheetJSPG
```
:::note pass
@ -278,7 +362,7 @@ npm init -y
4) Install the `pg` connector module:
```bash
npm i --save pg@8.11.4
npm i --save pg@8.13.1
```
5) Save the following example codeblock to `PGTest.js`:
@ -316,6 +400,8 @@ correct host name and port number.
- If the server expects a different username and password, uncomment the `user`
and `password` lines and replace the values with the username and password.
- For Ubuntu/Debian PostgreSQL installations, the default user is `postgres`. The password must be set during installation or using `sudo -u postgres psql` followed by `\password postgres` in the psql prompt.
7) Run the script:
```bash
@ -379,6 +465,9 @@ correct host name and port number.
- If the server expects a different username and password, uncomment the `user`
and `password` lines and replace the values with the username and password.
- For Ubuntu/Debian PostgreSQL installations, the default user is `postgres`. The password must be set during installation or using `sudo -u postgres psql` followed by `\password postgres` in the psql prompt.
11) Fetch the example file [`pres.numbers`](https://docs.sheetjs.com/pres.numbers):
```bash

@ -1,74 +1,136 @@
const pg = require("pg"), format = require("pg-format");
const XLSX = require("xlsx");
const opts = {
database:"SheetJSPG",
host: "127.0.0.1", // localhost
port: 5432,
//user: "",
//password: ""
};
database:"SheetJSPG",
host: "127.0.0.1", // localhost
port: 5432,
//user: "",
//password: ""
};
/* create table and load data given an array of objects and a PostgreSQL client */
async function aoo_to_pg_table(client, aoo, table_name) {
/* define types that can be converted (e.g. boolean can be stored in float) */
const T_FLOAT = ["float8", "boolean"];
const T_BOOL = ["boolean"];
/**
 * Deduce a PostgreSQL column type from the worksheet cells of one column.
 *
 * Rules, applied in order:
 * - no usable cells                      -> 'text'
 * - at least one date cell               -> 'date'
 * - only boolean cells                   -> 'boolean'
 * - only numbers, or strings that parse as numbers after removing
 *   `,` `$` `%` `(` `)` and whitespace   -> 'numeric' or 'double precision'
 * - anything else                        -> 'text'
 *
 * Missing cells, cells without a value and blank strings are ignored.
 *
 * @param {Array<object|undefined>} cells - cell objects; `t`, `v` and `w` are read
 * @returns {string} PostgreSQL type name
 */
function deduceType(cells) {
  if (!cells || cells.length === 0) return 'text';

  /* a blank string carries no type information; skip it like an empty cell */
  const isBlankText = cell => typeof cell.v === 'string' && cell.v.trim() === '';
  const nonEmptyCells = cells.filter(
    cell => cell && cell.v != null && !isBlankText(cell)
  );
  if (nonEmptyCells.length === 0) return 'text';

  // Check for dates by looking at both cell type and formatted value
  const dateText = /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/;
  const isDateCell = cell =>
    cell.t === 'd' || (cell.t === 'n' && cell.w && dateText.test(cell.w));
  if (nonEmptyCells.some(isDateCell)) { return 'date'; }

  // Check for booleans
  if (nonEmptyCells.every(cell => cell.t === 'b')) { return 'boolean'; }

  // Check for numbers
  /* A string that is empty after stripping (e.g. "$") is NOT a number:
     Number('') is 0 and isNaN('') is false, so it must be rejected explicitly.
     Number() is used for both the check and the conversion so they agree. */
  const toNumber = cell => {
    if (cell.t === 'n') return cell.v;
    if (cell.t !== 's') return NaN;
    const cleaned = String(cell.v).replace(/[,$\s%()]/g, '');
    return cleaned === '' ? NaN : Number(cleaned);
  };
  const numbers = nonEmptyCells.map(toNumber);
  const allNumbers = nonEmptyCells.every(
    (cell, idx) => cell.t === 'n' || !Number.isNaN(numbers[idx])
  );

  if (allNumbers) {
    /* exponent notation, more than 6 fractional digits, or magnitude above
       1e15 selects 'numeric' instead of 'double precision' */
    const needsPrecision = numbers.some(num => {
      const str = num.toString();
      return str.includes('e') ||
        (str.includes('.') && str.split('.')[1].length > 6) ||
        Math.abs(num) > 1e15;
    });
    return needsPrecision ? 'numeric' : 'double precision';
  }

  return 'text'; // default to string type
}
/* Insert each row */
for(let row of aoo) {
const ent = Object.entries(row);
const Istr = Array.from({length: ent.length}, ()=>"%I").join(", ");
const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", ");
let query = format.withArray(
`INSERT INTO %I (${Istr}) VALUES (${Lstr});`,
[ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ]
/**
 * Convert a worksheet cell to a value suitable for a column of `type`.
 *
 * @param {object|undefined} cell - cell object; `t` and `v` are read
 * @param {string} type - 'date', 'numeric', 'double precision', 'boolean',
 *   or anything else (treated as text)
 * @returns {string|number|boolean|null} the converted value, or null when the
 *   cell is missing, has no value, or cannot be represented in `type`
 */
function parseValue(cell, type) {
  if (!cell || cell.v == null) return null;

  /* toISOString throws RangeError on an invalid Date; report null instead */
  const toISODate = date =>
    Number.isNaN(date.getTime()) ? null : date.toISOString().split('T')[0];

  switch (type) {
    case 'date':
      if (cell.t === 'd') { return toISODate(cell.v); }
      if (cell.t === 'n') {
        /* numeric date code: 25569 is subtracted to reach the Unix epoch,
           then days are converted to milliseconds */
        return toISODate(new Date((cell.v - 25569) * 86400 * 1000));
      }
      return null;
    case 'numeric':
    case 'double precision':
      if (cell.t === 'n') return cell.v;
      if (cell.t === 's') {
        const cleaned = cell.v.replace(/[,$\s%()]/g, '');
        /* "" is not a number even though isNaN('') is false */
        if (cleaned === '') return null;
        /* Number() for both check and conversion, so the two cannot disagree */
        const num = Number(cleaned);
        return Number.isNaN(num) ? null : num;
      }
      return null;
    case 'boolean':
      return cell.t === 'b' ? cell.v : null;
    default:
      return String(cell.v);
  }
}
/* create table and load data given a worksheet and a PostgreSQL client */
/**
 * Drop and recreate `tableName`, then insert every data row of `worksheet`.
 *
 * The first row of the worksheet range supplies the column names. Column
 * types are chosen by `deduceType` and cell values are converted by
 * `parseValue`.
 *
 * @param {object} client - connected client exposing `query(sql)`
 * @param {object} worksheet - SheetJS worksheet object
 * @param {string} tableName - name of the table to (re)create
 * @returns {Promise<void>} resolves after the last row is inserted; resolves
 *   immediately, without touching the database, if the sheet has no `!ref`
 */
async function sheet_to_pg_table(client, worksheet, tableName) {
  if (!worksheet['!ref']) return;

  const range = XLSX.utils.decode_range(worksheet['!ref']);
  const headerRow = range.s.r;
  const firstCol = range.s.c;
  const columnCount = range.e.c - firstCol + 1;

  /* `idx` is the 0-based offset from the first column of the range, so the
     lookup stays correct when the range does not start at column A */
  const cellAt = (row, idx) =>
    worksheet[XLSX.utils.encode_cell({ r: row, c: firstCol + idx })];

  /* Extract headers from first row, clean names for PostgreSQL */
  const headers = [];
  const usedNames = new Set();
  for (let idx = 0; idx < columnCount; idx++) {
    const cell = cellAt(headerRow, idx);
    const raw = cell && cell.v != null ? String(cell.v) : '';
    const cleaned = raw.replace(/[^a-zA-Z0-9_]/g, '_').toLowerCase();
    const base = cleaned === '' ? `column_${firstCol + idx + 1}` : cleaned;

    /* two headers can clean to the same name ("A B" and "A-B"); add a numeric
       suffix so CREATE TABLE does not fail on a duplicate column */
    let name = base;
    for (let n = 2; usedNames.has(name); n++) name = `${base}_${n}`;
    usedNames.add(name);
    headers.push(name);
  }

  /* Collect the data rows once; each row is an array of cells by column offset */
  const dataRows = [];
  for (let row = headerRow + 1; row <= range.e.r; row++) {
    dataRows.push(headers.map((_, idx) => cellAt(row, idx)));
  }

  /* Deduce PostgreSQL type for each column (parallel to `headers`) */
  const types = headers.map((_, idx) =>
    deduceType(dataRows.map(cells => cells[idx]))
  );

  /* Delete table if it exists in the DB */
  await client.query(format('DROP TABLE IF EXISTS %I', tableName));

  /* Create table */
  const columnDefs = headers
    .map((header, idx) => format('%I %s', header, types[idx]))
    .join(', ');
  await client.query(format('CREATE TABLE %I (%s)', tableName, columnDefs));

  /* Insert data row by row. The template holds only placeholders and is
     formatted once, so a `%` in an identifier is never re-interpreted */
  const identifierSlots = headers.map(() => '%I').join(', ');
  const literalSlots = headers.map(() => '%L').join(', ');
  const insertTemplate =
    `INSERT INTO %I (${identifierSlots}) VALUES (${literalSlots})`;
  for (const cells of dataRows) {
    const values = cells.map((cell, idx) => parseValue(cell, types[idx]));
    await client.query(
      format.withArray(insertTemplate, [tableName, ...headers, ...values])
    );
  }
}
(async() => {
@ -83,11 +145,8 @@ try {
/* open connection to PostgreSQL database */
await client.connect();
/* generate array of objects from worksheet */
const aoo = XLSX.utils.sheet_to_json(oldws);
/* create table and load data */
await aoo_to_pg_table(client, aoo, "Presidents");
/* create table and load data given a worksheet */
await sheet_to_pg_table(client, oldws, "Presidents");
} finally {
/* disconnect */
await client.end();
@ -111,4 +170,4 @@ try {
/* disconnect */
await client.end();
}
})();
})();