From 7315b4211c65d37513344550cc6d9bbb08d64ff1 Mon Sep 17 00:00:00 2001
From: Asad <contact@asadk.dev>
Date: Wed, 20 Nov 2024 19:45:34 -0500
Subject: [PATCH] feat(pg): add Ubuntu/Debian PostgreSQL installation guide &
 enhanced SheetJSPG.js type deduc

- Include Ubuntu/Debian installation instructions
- Improve PostgreSQL table creation with better type detection
- Enhance date format detection and parsing

Breaking Changes:
- Replace `aoo_to_pg_table` to `sheet_to_pg_table` & now it takes worksheet object instead of array of arrays
- Changed type detection algorithm may produce column types
---
 docz/docs/03-demos/23-data/16-postgresql.md | 200 ++++++++++++++------
 docz/static/postgresql/SheetJSPG.js         | 183 ++++++++++++------
 2 files changed, 261 insertions(+), 122 deletions(-)

diff --git a/docz/docs/03-demos/23-data/16-postgresql.md b/docz/docs/03-demos/23-data/16-postgresql.md
index 099d136..778daec 100644
--- a/docz/docs/03-demos/23-data/16-postgresql.md
+++ b/docz/docs/03-demos/23-data/16-postgresql.md
@@ -129,80 +129,139 @@ for(let row of aoo) {
 
 ### Creating a Table
 
-The array of objects can be scanned to determine column names and types. With
-the names and types, a `CREATE TABLE` query can be written.
+The worksheet can be scanned to determine column names and types. With the names and types, a `CREATE TABLE` query can be written.
 
 <details>
   <summary><b>Implementation Details</b> (click to show)</summary>
 
-The `aoo_to_pg_table` function:
+The `sheet_to_pg_table` function:
 
-- scans each row object to determine column names and types
+- scans worksheet cells to determine column names and types
 - drops and creates a new table with the determined column names and types
 - loads the entire dataset into the new table
 
 ```js
-/* create table and load data given an array of objects and a PostgreSQL client */
-async function aoo_to_pg_table(client, aoo, table_name) {
-  /* define types that can be converted (e.g. boolean can be stored in float) */
-  const T_FLOAT = ["float8", "boolean"];
-  const T_BOOL = ["boolean"];
+/* create table and load data given a worksheet and a PostgreSQL client */
+async function sheet_to_pg_table(client, worksheet, tableName) {
+    if (!worksheet['!ref']) return;
+    
+    const range = XLSX.utils.decode_range(worksheet['!ref']);
+    
+    /* Extract headers from first row, clean names for PostgreSQL */
+    const headers = [];
+    for (let col = range.s.c; col <= range.e.c; col++) {
+        const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col });
+        const cell = worksheet[cellAddress];
+        const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`;
+        headers.push(headerValue.toLowerCase());
+    }
 
-  /* types is a map from column headers to Knex schema column type */
-  const types = {};
+    /* Group cell values by column for type deduction */
+    const columnValues = headers.map(() => []);
+    for (let row = range.s.r + 1; row <= range.e.r; row++) {
+        for (let col = range.s.c; col <= range.e.c; col++) {
+            const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
+            const cell = worksheet[cellAddress];
+            columnValues[col].push(cell);
+        }
+    }
 
-  /* names is an ordered list of the column header names */
-  const names = [];
+    /* Deduce PostgreSQL type for each column */
+    const types = {};
+    headers.forEach((header, idx) => {
+        types[header] = deduceType(columnValues[idx]);
+    });
 
-  /* loop across each row object */
-  aoo.forEach(row =>
-    /* Object.entries returns a row of [key, value] pairs */
-    Object.entries(row).forEach(([k,v]) => {
-
-      /* If this is first occurrence, mark unknown and append header to names */
-      if(!types[k]) { types[k] = ""; names.push(k); }
-
-      /* skip null and undefined values */
-      if(v == null) return;
-
-      /* check and resolve type */
-      switch(typeof v) {
-        /* change type if it is empty or can be stored in a float */
-        case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break;
-        /* change type if it is empty or can be stored in a boolean */
-        case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break;
-        /* no other type can hold strings */
-        case "string": types[k] = "text"; break;
-        default: types[k] = "text"; break;
-      }
-    })
-  );
-
-  /* Delete table if it exists in the DB */
-  const query = format("DROP TABLE IF EXISTS %I;", table_name);
-  await client.query(query);
-
-  /* Create table */
-  {
-    const entries = Object.entries(types);
-    const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", ");
-    let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]);
-    await client.query(query);
-  }
-
-  /* Insert each row */
-  for(let row of aoo) {
-    const ent = Object.entries(row);
-    const Istr = Array.from({length: ent.length}, ()=>"%I").join(", ");
-    const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", ");
-    let query = format.withArray(
-      `INSERT INTO %I (${Istr}) VALUES (${Lstr});`,
-      [ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ]
+    /* Delete table if it exists in the DB */
+    await client.query(format('DROP TABLE IF EXISTS %I', tableName));
+    
+    /* Create table */
+    const createTableSQL = format(
+        'CREATE TABLE %I (%s)',
+        tableName,
+        headers.map(header => format('%I %s', header, types[header])).join(', ')
     );
-    await client.query(query);
-  }
+    await client.query(createTableSQL);
 
-  return client;
+    /* Insert data row by row */
+    for (let row = range.s.r + 1; row <= range.e.r; row++) {
+        const values = headers.map((header, col) => {
+            const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
+            const cell = worksheet[cellAddress];
+            return parseValue(cell, types[header]);
+        });
+
+        const insertSQL = format(
+            'INSERT INTO %I (%s) VALUES (%s)',
+            tableName,
+            headers.map(h => format('%I', h)).join(', '),
+            values.map(() => '%L').join(', ')
+        );
+        await client.query(format(insertSQL, ...values));
+    }
+}
+
+function deduceType(cells) {
+    if (!cells || cells.length === 0) return 'text';
+
+    const nonEmptyCells = cells.filter(cell => cell && cell.v != null);
+    if (nonEmptyCells.length === 0) return 'text';
+
+    // Check for dates by looking at both cell type and formatted value
+    const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w));
+
+    if (nonEmptyCells.some(isDateCell)) { return 'date'; }
+
+    const allBooleans = nonEmptyCells.every(cell => cell.t === 'b');
+    if (allBooleans) { return 'boolean'; }
+
+    const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, ''))));
+
+    if (allNumbers) {
+        const numbers = nonEmptyCells.map(cell => {
+            if (cell.t === 'n') return cell.v;
+            return parseFloat(cell.v.replace(/[,$\s%()]/g, ''));
+        });
+
+        const needsPrecision = numbers.some(num => {
+            const str = num.toString();
+            return str.includes('e') ||
+                   (str.includes('.') && str.split('.')[1].length > 6) ||
+                   Math.abs(num) > 1e15;
+        });
+
+        return needsPrecision ? 'numeric' : 'double precision';
+    }
+    return 'text';
+}
+
+function parseValue(cell, type) {
+    if (!cell || cell.v == null) return null;
+    
+    switch (type) {
+        case 'date':
+            if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; }
+            if (cell.t === 'n') {
+                const date = new Date((cell.v - 25569) * 86400 * 1000);
+                return date.toISOString().split('T')[0];
+            }
+            return null;
+
+        case 'numeric':
+        case 'double precision':
+            if (cell.t === 'n') return cell.v;
+            if (cell.t === 's') {
+                const cleaned = cell.v.replace(/[,$\s%()]/g, '');
+                if (!isNaN(cleaned)) return parseFloat(cleaned);
+            }
+            return null;
+
+        case 'boolean':
+            return cell.t === 'b' ? cell.v : null;
+
+        default:
+            return String(cell.v);
+    }
 }
 ```
 
@@ -230,6 +289,16 @@ Or, if you don't want/need a background service you can just run:
   LC_ALL="C" /usr/local/opt/postgresql@16/bin/postgres -D /usr/local/var/postgresql@16
 ```
 
+On Linux, install `postgresql` by running the following script:
+```bash
+echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
+wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
+
+sudo apt update
+sudo apt install postgresql-16
+sudo systemctl start postgresql
+```
+
 Run the command to start a local database instance.
 
 </details>
@@ -238,6 +307,9 @@ Run the command to start a local database instance.
 
 ```bash
 dropdb SheetJSPG
+
+# Ubuntu/Debian
+sudo -i -u postgres dropdb SheetJSPG
 ```
 
 :::info pass
@@ -257,6 +329,9 @@ current user, command-line flags can override the defaults.
 
 ```bash
 createdb SheetJSPG
+
+# Ubuntu/Debian
+sudo -i -u postgres createdb SheetJSPG
 ```
 
 :::note pass
@@ -316,6 +391,8 @@ correct host name and port number.
 - If the server expects a different username and password, uncomment the `user`
 and `password` lines and replace the values with the username and password.
 
+- For Ubuntu/Debian PostgreSQL installations, the default user is `postgres`. The password must be set during installation or using `sudo -u postgres psql` followed by  `\password postgres` in the psql prompt.
+
 7) Run the script:
 
 ```bash
@@ -379,6 +456,9 @@ correct host name and port number.
 - If the server expects a different username and password, uncomment the `user`
 and `password` lines and replace the values with the username and password.
 
+- For Ubuntu/Debian PostgreSQL installations, the default user is postgres. The password must be set during installation or using sudo -u postgres psql followed by \password postgres in the psql prompt.
+
+
 11) Fetch the example file [`pres.numbers`](https://docs.sheetjs.com/pres.numbers):
 
 ```bash
diff --git a/docz/static/postgresql/SheetJSPG.js b/docz/static/postgresql/SheetJSPG.js
index 916115e..c38b533 100644
--- a/docz/static/postgresql/SheetJSPG.js
+++ b/docz/static/postgresql/SheetJSPG.js
@@ -1,74 +1,136 @@
 const pg = require("pg"), format = require("pg-format");
 const XLSX = require("xlsx");
 const opts = {
-  database:"SheetJSPG",
-  host: "127.0.0.1", // localhost
-  port: 5432,
-  //user: "",
-  //password: ""
-};
+    database:"SheetJSPG",
+    host: "127.0.0.1", // localhost
+    port: 5432,
+    //user: "",
+    //password: ""
+  };
 
-/* create table and load data given an array of objects and a PostgreSQL client */
-async function aoo_to_pg_table(client, aoo, table_name) {
-  /* define types that can be converted (e.g. boolean can be stored in float) */
-  const T_FLOAT = ["float8", "boolean"];
-  const T_BOOL = ["boolean"];
+function deduceType(cells) {
+    if (!cells || cells.length === 0) return 'text';
 
-  /* types is a map from column headers to Knex schema column type */
-  const types = {};
+    const nonEmptyCells = cells.filter(cell => cell && cell.v != null);
+    if (nonEmptyCells.length === 0) return 'text';
 
-  /* names is an ordered list of the column header names */
-  const names = [];
+    // Check for dates by looking at both cell type and formatted value
+    const isDateCell = cell => cell?.t === 'd' || (cell?.t === 'n' && cell.w && /\d{4}-\d{2}-\d{2}|\d{1,2}\/\d{1,2}\/\d{4}|\d{2}-[A-Za-z]{3}-\d{4}|[A-Za-z]{3}-\d{2}|\d{1,2}-[A-Za-z]{3}/.test(cell.w));
 
-  /* loop across each row object */
-  aoo.forEach(row =>
-    /* Object.entries returns a row of [key, value] pairs */
-    Object.entries(row).forEach(([k,v]) => {
+    if (nonEmptyCells.some(isDateCell)) { return 'date'; }
 
-      /* If this is first occurrence, mark unknown and append header to names */
-      if(!types[k]) { types[k] = ""; names.push(k); }
+    // Check for booleans
+    const allBooleans = nonEmptyCells.every(cell => cell.t === 'b');
+    if (allBooleans) { return 'boolean'; }
 
-      /* skip null and undefined values */
-      if(v == null) return;
+    // Check for numbers
+    const allNumbers = nonEmptyCells.every(cell => cell.t === 'n' || (cell.t === 's' && !isNaN(cell.v.replace(/[,$\s%()]/g, ''))));
 
-      /* check and resolve type */
-      switch(typeof v) {
-        /* change type if it is empty or can be stored in a float */
-        case "number": if(!types[k] || T_FLOAT.includes(types[k])) types[k] = "float8"; break;
-        /* change type if it is empty or can be stored in a boolean */
-        case "boolean": if(!types[k] || T_BOOL.includes(types[k])) types[k] = "boolean"; break;
-        /* no other type can hold strings */
-        case "string": types[k] = "text"; break;
-        default: types[k] = "text"; break;
-      }
-    })
-  );
+    if (allNumbers) {
+        const numbers = nonEmptyCells.map(cell => {
+            if (cell.t === 'n') return cell.v;
+            return parseFloat(cell.v.replace(/[,$\s%()]/g, ''));
+        });
 
-  /* Delete table if it exists in the DB */
-  const query = format("DROP TABLE IF EXISTS %I;", table_name);
-  await client.query(query);
+        const needsPrecision = numbers.some(num => {
+            const str = num.toString();
+            return str.includes('e') ||
+                   (str.includes('.') && str.split('.')[1].length > 6) ||
+                   Math.abs(num) > 1e15;
+        });
 
-  /* Create table */
-  {
-    const entries = Object.entries(types);
-    const Istr = entries.map(e => format(`%I ${e[1]}`, e[0])).join(", ");
-    let query = format.withArray(`CREATE TABLE %I (${Istr});`, [ table_name ]);
-    await client.query(query);
-  }
+        return needsPrecision ? 'numeric' : 'double precision';
+    }
+    return 'text'; // default to string type
+}
 
-  /* Insert each row */
-  for(let row of aoo) {
-    const ent = Object.entries(row);
-    const Istr = Array.from({length: ent.length}, ()=>"%I").join(", ");
-    const Lstr = Array.from({length: ent.length}, ()=>"%L").join(", ");
-    let query = format.withArray(
-      `INSERT INTO %I (${Istr}) VALUES (${Lstr});`,
-      [ table_name, ...ent.map(x => x[0]), ...ent.map(x => x[1]) ]
+function parseValue(cell, type) {
+    if (!cell || cell.v == null) return null;
+    
+    switch (type) {
+        case 'date':
+            if (cell.t === 'd') { return cell.v.toISOString().split('T')[0]; }
+            if (cell.t === 'n') {
+                const date = new Date((cell.v - 25569) * 86400 * 1000);
+                return date.toISOString().split('T')[0];
+            }
+            return null;
+
+        case 'numeric':
+        case 'double precision':
+            if (cell.t === 'n') return cell.v;
+            if (cell.t === 's') {
+                const cleaned = cell.v.replace(/[,$\s%()]/g, '');
+                if (!isNaN(cleaned)) return parseFloat(cleaned);
+            }
+            return null;
+
+        case 'boolean':
+            return cell.t === 'b' ? cell.v : null;
+
+        default:
+            return String(cell.v);
+    }
+}
+
+/* create table and load data given a worksheet and a PostgreSQL client */
+async function sheet_to_pg_table(client, worksheet, tableName) {
+    if (!worksheet['!ref']) return;
+    
+    const range = XLSX.utils.decode_range(worksheet['!ref']);
+    
+    /* Extract headers from first row, clean names for PostgreSQL */
+    const headers = [];
+    for (let col = range.s.c; col <= range.e.c; col++) {
+        const cellAddress = XLSX.utils.encode_cell({ r: range.s.r, c: col });
+        const cell = worksheet[cellAddress];
+        const headerValue = cell ? String(cell.v).replace(/[^a-zA-Z0-9_]/g, '_') : `column_${col + 1}`;
+        headers.push(headerValue.toLowerCase());
+    }
+
+    /* Group cell values by column for type deduction */
+    const columnValues = headers.map(() => []);
+    for (let row = range.s.r + 1; row <= range.e.r; row++) {
+        for (let col = range.s.c; col <= range.e.c; col++) {
+            const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
+            const cell = worksheet[cellAddress];
+            columnValues[col].push(cell);
+        }
+    }
+
+    /* Deduce PostgreSQL type for each column */
+    const types = {};
+    headers.forEach((header, idx) => {
+        types[header] = deduceType(columnValues[idx]);
+    });
+
+    /* Delete table if it exists in the DB */
+    await client.query(format('DROP TABLE IF EXISTS %I', tableName));
+    
+    /* Create table */
+    const createTableSQL = format(
+        'CREATE TABLE %I (%s)',
+        tableName,
+        headers.map(header => format('%I %s', header, types[header])).join(', ')
     );
-    await client.query(query);
-  }
+    await client.query(createTableSQL);
 
-  return client;
+    /* Insert data row by row */
+    for (let row = range.s.r + 1; row <= range.e.r; row++) {
+        const values = headers.map((header, col) => {
+            const cellAddress = XLSX.utils.encode_cell({ r: row, c: col });
+            const cell = worksheet[cellAddress];
+            return parseValue(cell, types[header]);
+        });
+
+        const insertSQL = format(
+            'INSERT INTO %I (%s) VALUES (%s)',
+            tableName,
+            headers.map(h => format('%I', h)).join(', '),
+            values.map(() => '%L').join(', ')
+        );
+        await client.query(format(insertSQL, ...values));
+    }
 }
 
 (async() => {
@@ -83,11 +145,8 @@ try {
   /* open connection to PostgreSQL database */
   await client.connect();
 
-  /* generate array of objects from worksheet */
-  const aoo = XLSX.utils.sheet_to_json(oldws);
-
-  /* create table and load data */
-  await aoo_to_pg_table(client, aoo, "Presidents");
+  /* create table and load data given a worksheet */
+  await sheet_to_pg_table(client, oldws, "Presidents");
 } finally {
   /* disconnect */
   await client.end();
@@ -111,4 +170,4 @@ try {
   /* disconnect */
   await client.end();
 }
-})();
+})();
\ No newline at end of file