import { Document } from "@langchain/core/documents";
import { BufferLoader } from "langchain/document_loaders/fs/buffer";
import { read, utils } from "xlsx";

/**
 * Document loader that uses SheetJS to load documents.
 *
 * Each worksheet is parsed into an array of row objects using the SheetJS
 * `sheet_to_json` method, and every row is projected to a `Document`.
 * Metadata includes the original sheet name, the row index and the row data.
 *
 * After `parse` has run, `attributes` describes the metadata fields that were
 * seen (name, description and observed value types).
 */
export default class LoadOfSheet extends BufferLoader {
  /** @type {import("langchain/chains/query_constructor").AttributeInfo[]} */
  attributes = [];

  /**
   * Document loader that uses SheetJS to load documents.
   *
   * @param {string|Blob} filePathOrBlob Source Data
   */
  constructor(filePathOrBlob) {
    super(filePathOrBlob);
    this.attributes = [];
  }

  /**
   * Parse a workbook into one `Document` per worksheet row.
   *
   * Side effect: `this.attributes` is reset and then repopulated with the two
   * fixed entries (`worksheet`, `rowNum`) followed by one entry per column
   * label seen in each worksheet.
   *
   * NOTE: column labels in multiple sheets are not disambiguated! A label that
   * appears in several sheets yields several `attributes` entries, and a column
   * named `worksheet` or `rowNum` overrides the built-in metadata value because
   * row data is spread last.
   *
   * @param {Buffer} raw Raw data Buffer
   * @param {Document["metadata"]} metadata Document metadata
   * @returns {Promise<Document[]>} Array of Documents
   */
  async parse(raw, metadata) {
    /** @type {Document[]} */
    const result = [];

    this.attributes = [
      { name: "worksheet", description: "Sheet or Worksheet Name", type: "string" },
      { name: "rowNum", description: "Row index", type: "number" },
    ];

    // NOTE(review): `WTF: 1` is the SheetJS option that surfaces parse
    // problems as thrown errors instead of suppressing them; confirm against
    // the SheetJS parsing options documentation.
    const wb = read(raw, { type: "buffer", WTF: 1 });

    for (const name of wb.SheetNames) {
      const ws = wb.Sheets[name];
      // Skip a sheet name without worksheet data. Returning here would resolve
      // the whole parse to `undefined` and discard rows already collected.
      if (!ws) continue;

      // Column label -> set of observed value type names. A null-prototype
      // object keeps labels such as "constructor" from hitting Object.prototype.
      const fields = Object.create(null);

      const rows = utils.sheet_to_json(ws);
      rows.forEach((row, idx) => {
        const cells = Object.entries(row);
        const listing = cells.map(([label, value]) => `- ${label}: ${value}`).join("\n");

        result.push(
          new Document({
            pageContent: `Row ${idx + 1} has the following content: \n${listing}\n`,
            metadata: {
              worksheet: name,
              // `__rowNum__` is read from the row object produced by
              // `sheet_to_json`; presumably the sheet row index (verify in
              // the SheetJS docs). It is not part of Object.entries(row).
              rowNum: row["__rowNum__"],
              ...metadata,
              ...row,
            },
          })
        );

        for (const [label, value] of cells) {
          // null/undefined cells carry no type information.
          if (value == null) continue;
          const typeName = value instanceof Date ? "date" : typeof value;
          (fields[label] ??= {})[typeName] = true;
        }
      });

      for (const [label, types] of Object.entries(fields)) {
        this.attributes.push({
          name: label,
          description: label,
          type: Object.keys(types).join(" or "),
        });
      }
    }

    return result;
  }
}