Merge pull request 'demo-postgresql-improved-type-detection' (#1) from demo-postgresql-improved-type-detection into master
Reviewed-on: asadbek064/docs.sheetjs.com#1
This commit is contained in:
commit
62572b9648
@ -36,6 +36,7 @@ This demo was tested in the following environments:
|
||||
|
||||
| Postgres | Connector Library | Date |
|
||||
|:---------|:------------------|:-----------|
|
||||
| `16.6.1` | `pg` (`8.13.1`) | 2024-12-03 |
|
||||
| `16.2.1` | `pg` (`8.11.4`) | 2024-03-31 |
|
||||
| `15.6` | `pg` (`8.11.4`) | 2024-03-31 |
|
||||
| `14.11` | `pg` (`8.11.4`) | 2024-03-31 |
|
||||
@ -129,80 +130,139 @@ for(let row of aoo) {
|
||||
|
||||
### Creating a Table
|
||||
|
||||
The array of objects can be scanned to determine column names and types. With
|
||||
the names and types, a `CREATE TABLE` query can be written.
|
||||
The worksheet can be scanned to determine column names and types. With the names and types, a `CREATE TABLE` query can be written.
|
||||
|
||||
<details>
|
||||
<summary><b>Implementation Details</b> (click to show)</summary>
|
||||
|
||||
The `aoo_to_pg_table` function:
|
||||
The `sheet_to_pg_table` function:
|
||||
|
||||
- scans each row object to determine column names and types
|
||||
- scans worksheet cells to determine column names and types
|
||||
- drops and creates a new table with the determined column names and types
|
||||
- loads the entire dataset into the new table
|
||||
|
||||
```js
|
||||
/* create table and load data given an array of objects and a PostgreSQL client */
|
||||
async function aoo_to_pg_table(client, aoo, table_name) {
|
||||
/* define types that can be converted (e.g. boolean can be stored in float) */
|
||||
const T_FLOAT = ["float8", "boolean"];
|
||||
const T_BOOL = ["boolean"];
|
||||
/* create table and load data given a worksheet and a PostgreSQL client */
|
||||
async function sheet_to_pg_table(client, worksheet, tableName) {
|
||||
if (!worksheet['!ref']) return;
|
||||
|
||||
/* types is a map from column headers to Knex schema column type */
|
||||
const types = {};
|
||||
const range = XLSX.utils.decode_range(worksheet['!ref']);
|
||||
|
||||
/* names is an ordered list of the column header names */
|
||||
const names = [];
|
||||
/* Extract headers from first row, clean names for PostgreSQL */
|
||||
const headers = [];
|
||||
for (let col = range.s.c; col <= range.e.c; col++) {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`;
|
||||
headers.push(headerValue.toLowerCase());
|
||||
}
|
||||
|
||||
/* loop across each row object */
|
||||
aoo.forEach(row =>
|
||||
/* Object.entries returns a row of [key, value] pairs */
|
||||
Object.entries(row).forEach(([k,v]) => {
|
||||
/* Group cell values by column for type deduction */
|
||||
const columnValues = headers.map(() => []);
|
||||
for (let row = range.s.r + 1; row <= range.e.r; row++) {
|
||||
for (let col = range.s.c; col <= range.e.c; col++) {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
columnValues[col].push(cell);
|
||||
}
|
||||
}
|
||||
|
||||
/* If this is first occurrence, mark unknown and append header to names */
|
||||
if(!types[k]) { types[k] = ""; names.push(k); }
|
||||
/* Deduce PostgreSQL type for each column */
|
||||
const types = {};
|
||||
headers.forEach((header, idx) => {
|
||||
types[header] = deduceType(columnValues[idx]);
|
||||
});
|
||||
|
||||
/* skip null and undefined values */
|
||||
if(v == null) return;
|
||||
/* Delete table if it exists in the DB */
|
||||
await client.query(format('DROP TABLE IF EXISTS %I', tableName));
|
||||
|
||||
/* check and resolve type */
|
||||
switch(typeof v) {
|
||||
/* change type if it is empty or can be stored in a float */
|
||||
case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break;
|
||||
/* change type if it is empty or can be stored in a boolean */
|
||||
case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break;
|
||||
/* no other type can hold strings */
|
||||
case "string": types[k] = "text"; break;
|
||||
default: types[k] = "text"; break;
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
/* Delete table if it exists in the DB */
|
||||
const query = format("DROP TABLE IF EXISTS %I;", table_name);
|
||||
await client.query(query);
|
||||
|
||||
/* Create table */
|
||||
{
|
||||
const entries = Object.entries(types);
|
||||
const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", ");
|
||||
let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]);
|
||||
await client.query(query);
|
||||
}
|
||||
|
||||
/* Insert each row */
|
||||
for(let row of aoo) {
|
||||
const ent = Object.entries(row);
|
||||
const Istr = Array.from({length: ent.length}, ()=>"%I").join(", ");
|
||||
const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", ");
|
||||
let query = format.withArray(
|
||||
`INSERT INTO %I (${Istr}) VALUES (${Lstr});`,
|
||||
[ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ]
|
||||
/* Create table */
|
||||
const createTableSQL = format(
|
||||
'CREATE TABLE %I (%s)',
|
||||
tableName,
|
||||
headers.map(header => format('%I %s', header, types[header])).join(', ')
|
||||
);
|
||||
await client.query(query);
|
||||
}
|
||||
await client.query(createTableSQL);
|
||||
|
||||
return client;
|
||||
/* Insert data row by row */
|
||||
for (let row = range.s.r + 1; row <= range.e.r; row++) {
|
||||
const values = headers.map((header, col) => {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
return parseValue(cell, types[header]);
|
||||
});
|
||||
|
||||
const insertSQL = format(
|
||||
'INSERT INTO %I (%s) VALUES (%s)',
|
||||
tableName,
|
||||
headers.map(h => format('%I', h)).join(', '),
|
||||
values.map(() => '%L').join(', ')
|
||||
);
|
||||
await client.query(format(insertSQL, ...values));
|
||||
}
|
||||
}
|
||||
|
||||
function deduceType(cells) {
|
||||
if (!cells || cells.length === 0) return 'text';
|
||||
|
||||
const nonEmptyCells = cells.filter(cell => cell && cell.v != null);
|
||||
if (nonEmptyCells.length === 0) return 'text';
|
||||
|
||||
// Check for dates by looking at both cell type and formatted value
|
||||
const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w));
|
||||
|
||||
if (nonEmptyCells.some(isDateCell)) { return 'date'; }
|
||||
|
||||
const allBooleans = nonEmptyCells.every(cell => cell.t === 'b');
|
||||
if (allBooleans) { return 'boolean'; }
|
||||
|
||||
const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, ''))));
|
||||
|
||||
if (allNumbers) {
|
||||
const numbers = nonEmptyCells.map(cell => {
|
||||
if (cell.t === 'n') return cell.v;
|
||||
return parseFloat(cell.v.replace(/[,$\s%()]/g, ''));
|
||||
});
|
||||
|
||||
const needsPrecision = numbers.some(num => {
|
||||
const str = num.toString();
|
||||
return str.includes('e') ||
|
||||
(str.includes('.') && str.split('.')[1].length > 6) ||
|
||||
Math.abs(num) > 1e15;
|
||||
});
|
||||
|
||||
return needsPrecision ? 'numeric' : 'double precision';
|
||||
}
|
||||
return 'text';
|
||||
}
|
||||
|
||||
function parseValue(cell, type) {
|
||||
if (!cell || cell.v == null) return null;
|
||||
|
||||
switch (type) {
|
||||
case 'date':
|
||||
if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; }
|
||||
if (cell.t === 'n') {
|
||||
const date = new Date((cell.v - 25569) * 86400 * 1000);
|
||||
return date.toISOString().split('T')[0];
|
||||
}
|
||||
return null;
|
||||
|
||||
case 'numeric':
|
||||
case 'double precision':
|
||||
if (cell.t === 'n') return cell.v;
|
||||
if (cell.t === 's') {
|
||||
const cleaned = cell.v.replace(/[,$\s%()]/g, '');
|
||||
if (!isNaN(cleaned)) return parseFloat(cleaned);
|
||||
}
|
||||
return null;
|
||||
|
||||
case 'boolean':
|
||||
return cell.t === 'b' ? cell.v : null;
|
||||
|
||||
default:
|
||||
return String(cell.v);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@ -230,6 +290,22 @@ Or, if you don't want/need a background service you can just run:
|
||||
LC_ALL="C" /usr/local/opt/postgresql@16/bin/postgres -D /usr/local/var/postgresql@16
|
||||
```
|
||||
|
||||
On Linux, install `postgresql` by running the following script:
|
||||
```bash
|
||||
echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
|
||||
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
|
||||
|
||||
sudo apt update
|
||||
sudo apt install postgresql-16
|
||||
sudo systemctl start postgresql
|
||||
|
||||
# Optional: Create user with password
|
||||
sudo -u postgres createuser -P $USER
|
||||
sudo -u postgres psql -c "ALTER USER $USER WITH SUPERUSER;"
|
||||
```
|
||||
|
||||
If running the optional user creation steps above, a PostgreSQL password will be required. [^69]
|
||||
|
||||
Run the command to start a local database instance.
|
||||
|
||||
</details>
|
||||
@ -238,8 +314,13 @@ Run the command to start a local database instance.
|
||||
|
||||
```bash
|
||||
dropdb SheetJSPG
|
||||
|
||||
# Ubuntu/Debian
|
||||
sudo -i -u postgres dropdb SheetJSPG
|
||||
```
|
||||
|
||||
[^69]: PostgreSQL on Linux uses [SCRAM authentication by default, which requires a password](https://www.postgresql.org/docs/current/auth-password.html)
|
||||
|
||||
:::info pass
|
||||
|
||||
If the server is running elsewhere, or if the username is different from the
|
||||
@ -257,6 +338,9 @@ current user, command-line flags can override the defaults.
|
||||
|
||||
```bash
|
||||
createdb SheetJSPG
|
||||
|
||||
# Ubuntu/Debian
|
||||
sudo -i -u postgres createdb SheetJSPG
|
||||
```
|
||||
|
||||
:::note pass
|
||||
@ -278,7 +362,7 @@ npm init -y
|
||||
4) Install the `pg` connector module:
|
||||
|
||||
```bash
|
||||
npm i --save pg@8.11.4
|
||||
npm i --save pg@8.13.1
|
||||
```
|
||||
|
||||
5) Save the following example codeblock to `PGTest.js`:
|
||||
@ -316,6 +400,8 @@ correct host name and port number.
|
||||
- If the server expects a different username and password, uncomment the `user`
|
||||
and `password` lines and replace the values with the username and password.
|
||||
|
||||
- For Ubuntu/Debian PostgreSQL installations, the default user is `postgres`. The password must be set during installation or using `sudo -u postgres psql` followed by `\password postgres` in the psql prompt.
|
||||
|
||||
7) Run the script:
|
||||
|
||||
```bash
|
||||
@ -379,6 +465,9 @@ correct host name and port number.
|
||||
- If the server expects a different username and password, uncomment the `user`
|
||||
and `password` lines and replace the values with the username and password.
|
||||
|
||||
- For Ubuntu/Debian PostgreSQL installations, the default user is postgres. The password must be set during installation or using sudo -u postgres psql followed by \password postgres in the psql prompt.
|
||||
|
||||
|
||||
11) Fetch the example file [`pres.numbers`](https://docs.sheetjs.com/pres.numbers):
|
||||
|
||||
```bash
|
||||
|
@ -1,74 +1,136 @@
|
||||
const pg = require("pg"), format = require("pg-format");
|
||||
const XLSX = require("xlsx");
|
||||
const opts = {
|
||||
database:"SheetJSPG",
|
||||
host: "127.0.0.1", // localhost
|
||||
port: 5432,
|
||||
//user: "",
|
||||
//password: ""
|
||||
};
|
||||
database:"SheetJSPG",
|
||||
host: "127.0.0.1", // localhost
|
||||
port: 5432,
|
||||
//user: "",
|
||||
//password: ""
|
||||
};
|
||||
|
||||
/* create table and load data given an array of objects and a PostgreSQL client */
|
||||
async function aoo_to_pg_table(client, aoo, table_name) {
|
||||
/* define types that can be converted (e.g. boolean can be stored in float) */
|
||||
const T_FLOAT = ["float8", "boolean"];
|
||||
const T_BOOL = ["boolean"];
|
||||
function deduceType(cells) {
|
||||
if (!cells || cells.length === 0) return 'text';
|
||||
|
||||
/* types is a map from column headers to Knex schema column type */
|
||||
const types = {};
|
||||
const nonEmptyCells = cells.filter(cell => cell && cell.v != null);
|
||||
if (nonEmptyCells.length === 0) return 'text';
|
||||
|
||||
/* names is an ordered list of the column header names */
|
||||
const names = [];
|
||||
// Check for dates by looking at both cell type and formatted value
|
||||
const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w));
|
||||
|
||||
/* loop across each row object */
|
||||
aoo.forEach(row =>
|
||||
/* Object.entries returns a row of [key, value] pairs */
|
||||
Object.entries(row).forEach(([k,v]) => {
|
||||
if (nonEmptyCells.some(isDateCell)) { return 'date'; }
|
||||
|
||||
/* If this is first occurrence, mark unknown and append header to names */
|
||||
if(!types[k]) { types[k] = ""; names.push(k); }
|
||||
// Check for booleans
|
||||
const allBooleans = nonEmptyCells.every(cell => cell.t === 'b');
|
||||
if (allBooleans) { return 'boolean'; }
|
||||
|
||||
/* skip null and undefined values */
|
||||
if(v == null) return;
|
||||
// Check for numbers
|
||||
const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, ''))));
|
||||
|
||||
/* check and resolve type */
|
||||
switch(typeof v) {
|
||||
/* change type if it is empty or can be stored in a float */
|
||||
case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break;
|
||||
/* change type if it is empty or can be stored in a boolean */
|
||||
case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break;
|
||||
/* no other type can hold strings */
|
||||
case "string": types[k] = "text"; break;
|
||||
default: types[k] = "text"; break;
|
||||
}
|
||||
})
|
||||
);
|
||||
if (allNumbers) {
|
||||
const numbers = nonEmptyCells.map(cell => {
|
||||
if (cell.t === 'n') return cell.v;
|
||||
return parseFloat(cell.v.replace(/[,$\s%()]/g, ''));
|
||||
});
|
||||
|
||||
/* Delete table if it exists in the DB */
|
||||
const query = format("DROP TABLE IF EXISTS %I;", table_name);
|
||||
await client.query(query);
|
||||
const needsPrecision = numbers.some(num => {
|
||||
const str = num.toString();
|
||||
return str.includes('e') ||
|
||||
(str.includes('.') && str.split('.')[1].length > 6) ||
|
||||
Math.abs(num) > 1e15;
|
||||
});
|
||||
|
||||
/* Create table */
|
||||
{
|
||||
const entries = Object.entries(types);
|
||||
const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", ");
|
||||
let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]);
|
||||
await client.query(query);
|
||||
}
|
||||
return needsPrecision ? 'numeric' : 'double precision';
|
||||
}
|
||||
return 'text'; // default to string type
|
||||
}
|
||||
|
||||
/* Insert each row */
|
||||
for(let row of aoo) {
|
||||
const ent = Object.entries(row);
|
||||
const Istr = Array.from({length: ent.length}, ()=>"%I").join(", ");
|
||||
const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", ");
|
||||
let query = format.withArray(
|
||||
`INSERT INTO %I (${Istr}) VALUES (${Lstr});`,
|
||||
[ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ]
|
||||
function parseValue(cell, type) {
|
||||
if (!cell || cell.v == null) return null;
|
||||
|
||||
switch (type) {
|
||||
case 'date':
|
||||
if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; }
|
||||
if (cell.t === 'n') {
|
||||
const date = new Date((cell.v - 25569) * 86400 * 1000);
|
||||
return date.toISOString().split('T')[0];
|
||||
}
|
||||
return null;
|
||||
|
||||
case 'numeric':
|
||||
case 'double precision':
|
||||
if (cell.t === 'n') return cell.v;
|
||||
if (cell.t === 's') {
|
||||
const cleaned = cell.v.replace(/[,$\s%()]/g, '');
|
||||
if (!isNaN(cleaned)) return parseFloat(cleaned);
|
||||
}
|
||||
return null;
|
||||
|
||||
case 'boolean':
|
||||
return cell.t === 'b' ? cell.v : null;
|
||||
|
||||
default:
|
||||
return String(cell.v);
|
||||
}
|
||||
}
|
||||
|
||||
/* create table and load data given a worksheet and a PostgreSQL client */
|
||||
async function sheet_to_pg_table(client, worksheet, tableName) {
|
||||
if (!worksheet['!ref']) return;
|
||||
|
||||
const range = XLSX.utils.decode_range(worksheet['!ref']);
|
||||
|
||||
/* Extract headers from first row, clean names for PostgreSQL */
|
||||
const headers = [];
|
||||
for (let col = range.s.c; col <= range.e.c; col++) {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`;
|
||||
headers.push(headerValue.toLowerCase());
|
||||
}
|
||||
|
||||
/* Group cell values by column for type deduction */
|
||||
const columnValues = headers.map(() => []);
|
||||
for (let row = range.s.r + 1; row <= range.e.r; row++) {
|
||||
for (let col = range.s.c; col <= range.e.c; col++) {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
columnValues[col].push(cell);
|
||||
}
|
||||
}
|
||||
|
||||
/* Deduce PostgreSQL type for each column */
|
||||
const types = {};
|
||||
headers.forEach((header, idx) => {
|
||||
types[header] = deduceType(columnValues[idx]);
|
||||
});
|
||||
|
||||
/* Delete table if it exists in the DB */
|
||||
await client.query(format('DROP TABLE IF EXISTS %I', tableName));
|
||||
|
||||
/* Create table */
|
||||
const createTableSQL = format(
|
||||
'CREATE TABLE %I (%s)',
|
||||
tableName,
|
||||
headers.map(header => format('%I %s', header, types[header])).join(', ')
|
||||
);
|
||||
await client.query(query);
|
||||
}
|
||||
await client.query(createTableSQL);
|
||||
|
||||
return client;
|
||||
/* Insert data row by row */
|
||||
for (let row = range.s.r + 1; row <= range.e.r; row++) {
|
||||
const values = headers.map((header, col) => {
|
||||
const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
|
||||
const cell = worksheet[cellAddress];
|
||||
return parseValue(cell, types[header]);
|
||||
});
|
||||
|
||||
const insertSQL = format(
|
||||
'INSERT INTO %I (%s) VALUES (%s)',
|
||||
tableName,
|
||||
headers.map(h => format('%I', h)).join(', '),
|
||||
values.map(() => '%L').join(', ')
|
||||
);
|
||||
await client.query(format(insertSQL, ...values));
|
||||
}
|
||||
}
|
||||
|
||||
(async() => {
|
||||
@ -83,11 +145,8 @@ try {
|
||||
/* open connection to PostgreSQL database */
|
||||
await client.connect();
|
||||
|
||||
/* generate array of objects from worksheet */
|
||||
const aoo = XLSX.utils.sheet_to_json(oldws);
|
||||
|
||||
/* create table and load data */
|
||||
await aoo_to_pg_table(client, aoo, "Presidents");
|
||||
/* create table and load data given a worksheet */
|
||||
await sheet_to_pg_table(client, oldws, "Presidents");
|
||||
} finally {
|
||||
/* disconnect */
|
||||
await client.end();
|
||||
@ -111,4 +170,4 @@ try {
|
||||
/* disconnect */
|
||||
await client.end();
|
||||
}
|
||||
})();
|
||||
})();
|
Loading…
Reference in New Issue
Block a user