forked from sheetjs/docs.sheetjs.com
71 lines
2.2 KiB
JavaScript
71 lines
2.2 KiB
JavaScript
|
import { Document } from "@langchain/core/documents";
|
||
|
import { BufferLoader } from "langchain/document_loaders/fs/buffer";
|
||
|
import { read, utils } from "xlsx";
|
||
|
|
||
|
/**
 * Document loader that uses SheetJS to load documents.
 *
 * Each worksheet is parsed into an array of row objects using the SheetJS
 * `sheet_to_json` method, and every row is projected to a `Document`-shaped
 * object. Metadata includes the original sheet name, the row index, the
 * caller-supplied metadata, and the row data.
 */
export default class LoadOfSheet extends BufferLoader {
  /**
   * Attribute descriptions rebuilt on every call to `parse`: the two fixed
   * entries (`worksheet`, `rowNum`) followed by one entry per column seen in
   * each worksheet.
   *
   * @type {import("langchain/chains/query_constructor").AttributeInfo[]}
   */
  attributes = [];

  /**
   * Document loader that uses SheetJS to load documents.
   *
   * @param {string|Blob} filePathOrBlob Source Data
   */
  constructor(filePathOrBlob) {
    super(filePathOrBlob);
  }

  /**
   * Parse document
   *
   * NOTE: column labels in multiple sheets are not disambiguated! A column
   * that appears in several worksheets yields one attribute entry per sheet.
   *
   * @param {Buffer} raw Raw data Buffer
   * @param {Document["metadata"]} metadata Document metadata
   * @returns {Promise<Document[]>} Array of Documents
   */
  async parse(raw, metadata) {
    /** @type {Document[]} */
    const result = [];

    this.attributes = [
      { name: "worksheet", description: "Sheet or Worksheet Name", type: "string" },
      { name: "rowNum", description: "Row index", type: "number" },
    ];

    // NOTE(review): `WTF: 1` is presumably the SheetJS "throw on unexpected
    // file features" switch -- confirm against the SheetJS parsing options.
    const wb = read(raw, { type: "buffer", WTF: 1 });

    for (const name of wb.SheetNames) {
      const ws = wb.Sheets[name];
      // Skip a listed-but-missing sheet instead of returning: a bare `return`
      // here would resolve to `undefined` and drop the rows gathered so far.
      if (!ws) continue;

      // Column label -> set of value type names seen in this worksheet.
      // Null prototype so labels such as "constructor" or "toString" are
      // treated as ordinary keys rather than inherited Object members.
      const fields = Object.create(null);

      const aoo = utils.sheet_to_json(ws);
      aoo.forEach((row, idx) => {
        const entries = Object.entries(row);
        const listing = entries.map(([label, value]) => `- ${label}: ${value}`).join("\n");

        result.push({
          pageContent: `Row ${idx + 1} has the following content: \n${listing}\n`,
          metadata: {
            worksheet: name,
            // NOTE(review): `__rowNum__` is expected to be attached to each
            // row by `sheet_to_json` -- confirm against the SheetJS docs.
            rowNum: row["__rowNum__"],
            // Later spreads win: caller metadata, then row columns, may
            // override `worksheet` / `rowNum` when the keys collide.
            ...metadata,
            ...row,
          },
        });

        for (const [label, value] of entries) {
          if (value == null) continue;
          (fields[label] ??= new Set()).add(value instanceof Date ? "date" : typeof value);
        }
      });

      for (const [label, types] of Object.entries(fields)) {
        this.attributes.push({
          name: label,
          description: label,
          type: [...types].join(" or "),
        });
      }
    }

    return result;
  }
}
|