From 7315b4211c65d37513344550cc6d9bbb08d64ff1 Mon Sep 17 00:00:00 2001 From: Asad Date: Wed, 20 Nov 2024 19:45:34 -0500 Subject: [PATCH] feat(pg): add Ubuntu/Debian PostgreSQL installation guide & enhanced SheetJSPG.js type deduc - Include Ubuntu/Debian installation instructions - Improve PostgreSQL table creation with better type detection - Enhance date format detection and parsing Breaking Changes: - Replace `aoo_to_pg_table` to `sheet_to_pg_table` & now it takes worksheet object instead of array of arrays - Changed type detection algorithm may produce column types --- docz/docs/03-demos/23-data/16-postgresql.md | 200 ++++++++++++++------ docz/static/postgresql/SheetJSPG.js | 183 ++++++++++++------ 2 files changed, 261 insertions(+), 122 deletions(-) diff --git a/docz/docs/03-demos/23-data/16-postgresql.md b/docz/docs/03-demos/23-data/16-postgresql.md index 099d136..778daec 100644 --- a/docz/docs/03-demos/23-data/16-postgresql.md +++ b/docz/docs/03-demos/23-data/16-postgresql.md @@ -129,80 +129,139 @@ for(let row of aoo) { ### Creating a Table -The array of objects can be scanned to determine column names and types. With -the names and types, a `CREATE TABLE` query can be written. +The worksheet can be scanned to determine column names and types. With the names and types, a `CREATE TABLE` query can be written.
Implementation Details (click to show) -The `aoo_to_pg_table` function: +The `sheet_to_pg_table` function: -- scans each row object to determine column names and types +- scans worksheet cells to determine column names and types - drops and creates a new table with the determined column names and types - loads the entire dataset into the new table ```js -/* create table and load data given an array of objects and a PostgreSQL client */ -async function aoo_to_pg_table(client, aoo, table_name) { - /* define types that can be converted (e.g. boolean can be stored in float) */ - const T_FLOAT = ["float8", "boolean"]; - const T_BOOL = ["boolean"]; +/* create table and load data given a worksheet and a PostgreSQL client */ +async function sheet_to_pg_table(client, worksheet, tableName) { + if (!worksheet['!ref']) return; + + const range = XLSX.utils.decode_range(worksheet['!ref']); + + /* Extract headers from first row, clean names for PostgreSQL */ + const headers = []; + for (let col = range.s.c; col <= range.e.c; col++) { + const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col }); + const cell = worksheet[cellAddress]; + const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`; + headers.push(headerValue.toLowerCase()); + } - /* types is a map from column headers to Knex schema column type */ - const types = {}; + /* Group cell values by column for type deduction */ + const columnValues = headers.map(() => []); + for (let row = range.s.r + 1; row <= range.e.r; row++) { + for (let col = range.s.c; col <= range.e.c; col++) { + const cellAddress = XLSX.utils.encode_cell({ r: row, c: col }); + const cell = worksheet[cellAddress]; + columnValues[col].push(cell); + } + } - /* names is an ordered list of the column header names */ - const names = []; + /* Deduce PostgreSQL type for each column */ + const types = {}; + headers.forEach((header, idx) => { + types[header] = deduceType(columnValues[idx]); + }); - /* loop across each row object */ - aoo.forEach(row => - /* Object.entries returns a row of [key, value] pairs */ - Object.entries(row).forEach(([k,v]) => { - - /* If this is first occurrence, mark unknown and append header to names */ - if(!types[k]) { types[k] = ""; names.push(k); } - - /* skip null and undefined values */ - if(v == null) return; - - /* check and resolve type */ - switch(typeof v) { - /* change type if it is empty or can be stored in a float */ - case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break; - /* change type if it is empty or can be stored in a boolean */ - case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break; - /* no other type can hold strings */ - case "string": types[k] = "text"; break; - default: types[k] = "text"; break; - } - }) - ); - - /* Delete table if it exists in the DB */ - const query = format("DROP TABLE IF EXISTS %I;", table_name); - await client.query(query); - - /* Create table */ - { - const entries = Object.entries(types); - const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", "); - let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]); - await client.query(query); - } - - /* Insert each row */ - for(let row of aoo) { - const ent = Object.entries(row); - const Istr = Array.from({length: ent.length}, ()=>"%I").join(", "); - const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", "); - let query = format.withArray( - `INSERT INTO %I (${Istr}) VALUES (${Lstr});`, - [ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ] + /* Delete table if it exists in the DB */ + await client.query(format('DROP TABLE IF EXISTS %I', tableName)); + + /* Create table */ + const createTableSQL = format( + 'CREATE TABLE %I (%s)', + tableName, + headers.map(header => format('%I %s', header, types[header])).join(', ') ); - await client.query(query); - } + await client.query(createTableSQL); - return client; + /* Insert data row by row */ + for (let row = range.s.r + 1; row <= range.e.r; row++) { + const values = headers.map((header, col) => { + const cellAddress = XLSX.utils.encode_cell({ r: row, c: col }); + const cell = worksheet[cellAddress]; + return parseValue(cell, types[header]); + }); + + const insertSQL = format( + 'INSERT INTO %I (%s) VALUES (%s)', + tableName, + headers.map(h => format('%I', h)).join(', '), + values.map(() => '%L').join(', ') + ); + await client.query(format(insertSQL, ...values)); + } +} + +function deduceType(cells) { + if (!cells || cells.length === 0) return 'text'; + + const nonEmptyCells = cells.filter(cell => cell && cell.v != null); + if (nonEmptyCells.length === 0) return 'text'; + + // Check for dates by looking at both cell type and formatted value + const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w)); + + if (nonEmptyCells.some(isDateCell)) { return 'date'; } + + const allBooleans = nonEmptyCells.every(cell => cell.t === 'b'); + if (allBooleans) { return 'boolean'; } + + const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, '')))); + + if (allNumbers) { + const numbers = nonEmptyCells.map(cell => { + if (cell.t === 'n') return cell.v; + return parseFloat(cell.v.replace(/[,$\s%()]/g, '')); + }); + + const needsPrecision = numbers.some(num => { + const str = num.toString(); + return str.includes('e') || + (str.includes('.') && str.split('.')[1].length > 6) || + Math.abs(num) > 1e15; + }); + + return needsPrecision ? 'numeric' : 'double precision'; + } + return 'text'; +} + +function parseValue(cell, type) { + if (!cell || cell.v == null) return null; + + switch (type) { + case 'date': + if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; } + if (cell.t === 'n') { + const date = new Date((cell.v - 25569) * 86400 * 1000); + return date.toISOString().split('T')[0]; + } + return null; + + case 'numeric': + case 'double precision': + if (cell.t === 'n') return cell.v; + if (cell.t === 's') { + const cleaned = cell.v.replace(/[,$\s%()]/g, ''); + if (!isNaN(cleaned)) return parseFloat(cleaned); + } + return null; + + case 'boolean': + return cell.t === 'b' ? cell.v : null; + + default: + return String(cell.v); + } } ``` @@ -230,6 +289,16 @@ Or, if you don't want/need a background service you can just run: LC_ALL="C" /usr/local/opt/postgresql@16/bin/postgres -D /usr/local/var/postgresql@16 ``` +On Linux, install `postgresql` by running the following script: +```bash +echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list +wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + +sudo apt update +sudo apt install postgresql-16 +sudo systemctl start postgresql +``` + Run the command to start a local database instance.
@@ -238,6 +307,9 @@ Run the command to start a local database instance. ```bash dropdb SheetJSPG + +# Ubuntu/Debian +sudo -i -u postgres dropdb SheetJSPG ``` :::info pass @@ -257,6 +329,9 @@ current user, command-line flags can override the defaults. ```bash createdb SheetJSPG + +# Ubuntu/Debian +sudo -i -u postgres createdb SheetJSPG ``` :::note pass @@ -316,6 +391,8 @@ correct host name and port number. - If the server expects a different username and password, uncomment the `user` and `password` lines and replace the values with the username and password. +- For Ubuntu/Debian PostgreSQL installations, the default user is `postgres`. The password must be set during installation or using `sudo -u postgres psql` followed by `\password postgres` in the psql prompt. + 7) Run the script: ```bash @@ -379,6 +456,9 @@ correct host name and port number. - If the server expects a different username and password, uncomment the `user` and `password` lines and replace the values with the username and password. +- For Ubuntu/Debian PostgreSQL installations, the default user is postgres. The password must be set during installation or using sudo -u postgres psql followed by \password postgres in the psql prompt. + + 11) Fetch the example file [`pres.numbers`](https://docs.sheetjs.com/pres.numbers): ```bash diff --git a/docz/static/postgresql/SheetJSPG.js b/docz/static/postgresql/SheetJSPG.js index 916115e..c38b533 100644 --- a/docz/static/postgresql/SheetJSPG.js +++ b/docz/static/postgresql/SheetJSPG.js @@ -1,74 +1,136 @@ const pg = require("pg"), format = require("pg-format"); const XLSX = require("xlsx"); const opts = { - database:"SheetJSPG", - host: "127.0.0.1", // localhost - port: 5432, - //user: "", - //password: "" -}; + database:"SheetJSPG", + host: "127.0.0.1", // localhost + port: 5432, + //user: "", + //password: "" + }; -/* create table and load data given an array of objects and a PostgreSQL client */ -async function aoo_to_pg_table(client, aoo, table_name) { - /* define types that can be converted (e.g. boolean can be stored in float) */ - const T_FLOAT = ["float8", "boolean"]; - const T_BOOL = ["boolean"]; +function deduceType(cells) { + if (!cells || cells.length === 0) return 'text'; - /* types is a map from column headers to Knex schema column type */ - const types = {}; + const nonEmptyCells = cells.filter(cell => cell && cell.v != null); + if (nonEmptyCells.length === 0) return 'text'; - /* names is an ordered list of the column header names */ - const names = []; + // Check for dates by looking at both cell type and formatted value + const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w)); - /* loop across each row object */ - aoo.forEach(row => - /* Object.entries returns a row of [key, value] pairs */ - Object.entries(row).forEach(([k,v]) => { + if (nonEmptyCells.some(isDateCell)) { return 'date'; } - /* If this is first occurrence, mark unknown and append header to names */ - if(!types[k]) { types[k] = ""; names.push(k); } + // Check for booleans + const allBooleans = nonEmptyCells.every(cell => cell.t === 'b'); + if (allBooleans) { return 'boolean'; } - /* skip null and undefined values */ - if(v == null) return; + // Check for numbers + const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, '')))); - /* check and resolve type */ - switch(typeof v) { - /* change type if it is empty or can be stored in a float */ - case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break; - /* change type if it is empty or can be stored in a boolean */ - case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break; - /* no other type can hold strings */ - case "string": types[k] = "text"; break; - default: types[k] = "text"; break; - } - }) - ); + if (allNumbers) { + const numbers = nonEmptyCells.map(cell => { + if (cell.t === 'n') return cell.v; + return parseFloat(cell.v.replace(/[,$\s%()]/g, '')); + }); - /* Delete table if it exists in the DB */ - const query = format("DROP TABLE IF EXISTS %I;", table_name); - await client.query(query); + const needsPrecision = numbers.some(num => { + const str = num.toString(); + return str.includes('e') || + (str.includes('.') && str.split('.')[1].length > 6) || + Math.abs(num) > 1e15; + }); - /* Create table */ - { - const entries = Object.entries(types); - const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", "); - let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]); - await client.query(query); - } + return needsPrecision ? 'numeric' : 'double precision'; + } + return 'text'; // default to string type +} - /* Insert each row */ - for(let row of aoo) { - const ent = Object.entries(row); - const Istr = Array.from({length: ent.length}, ()=>"%I").join(", "); - const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", "); - let query = format.withArray( - `INSERT INTO %I (${Istr}) VALUES (${Lstr});`, - [ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ] +function parseValue(cell, type) { + if (!cell || cell.v == null) return null; + + switch (type) { + case 'date': + if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; } + if (cell.t === 'n') { + const date = new Date((cell.v - 25569) * 86400 * 1000); + return date.toISOString().split('T')[0]; + } + return null; + + case 'numeric': + case 'double precision': + if (cell.t === 'n') return cell.v; + if (cell.t === 's') { + const cleaned = cell.v.replace(/[,$\s%()]/g, ''); + if (!isNaN(cleaned)) return parseFloat(cleaned); + } + return null; + + case 'boolean': + return cell.t === 'b' ? cell.v : null; + + default: + return String(cell.v); + } +} + +/* create table and load data given a worksheet and a PostgreSQL client */ +async function sheet_to_pg_table(client, worksheet, tableName) { + if (!worksheet['!ref']) return; + + const range = XLSX.utils.decode_range(worksheet['!ref']); + + /* Extract headers from first row, clean names for PostgreSQL */ + const headers = []; + for (let col = range.s.c; col <= range.e.c; col++) { + const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col }); + const cell = worksheet[cellAddress]; + const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`; + headers.push(headerValue.toLowerCase()); + } + + /* Group cell values by column for type deduction */ + const columnValues = headers.map(() => []); + for (let row = range.s.r + 1; row <= range.e.r; row++) { + for (let col = range.s.c; col <= range.e.c; col++) { + const cellAddress = XLSX.utils.encode_cell({ r: row, c: col }); + const cell = worksheet[cellAddress]; + columnValues[col].push(cell); + } + } + + /* Deduce PostgreSQL type for each column */ + const types = {}; + headers.forEach((header, idx) => { + types[header] = deduceType(columnValues[idx]); + }); + + /* Delete table if it exists in the DB */ + await client.query(format('DROP TABLE IF EXISTS %I', tableName)); + + /* Create table */ + const createTableSQL = format( + 'CREATE TABLE %I (%s)', + tableName, + headers.map(header => format('%I %s', header, types[header])).join(', ') ); - await client.query(query); - } + await client.query(createTableSQL); - return client; + /* Insert data row by row */ + for (let row = range.s.r + 1; row <= range.e.r; row++) { + const values = headers.map((header, col) => { + const cellAddress = XLSX.utils.encode_cell({ r: row, c: col }); + const cell = worksheet[cellAddress]; + return parseValue(cell, types[header]); + }); + + const insertSQL = format( + 'INSERT INTO %I (%s) VALUES (%s)', + tableName, + headers.map(h => format('%I', h)).join(', '), + values.map(() => '%L').join(', ') + ); + await client.query(format(insertSQL, ...values)); + } } (async() => { @@ -83,11 +145,8 @@ try { /* open connection to PostgreSQL database */ await client.connect(); - /* generate array of objects from worksheet */ - const aoo = XLSX.utils.sheet_to_json(oldws); - - /* create table and load data */ - await aoo_to_pg_table(client, aoo, "Presidents"); + /* create table and load data given a worksheet */ + await sheet_to_pg_table(client, oldws, "Presidents"); } finally { /* disconnect */ await client.end(); @@ -111,4 +170,4 @@ try { /* disconnect */ await client.end(); } -})(); +})(); \ No newline at end of file