// Converts an uploaded .xlsx workbook into one document per worksheet.
// These functions expect the following module-scope bindings from the file's
// require block: v4 (uuid), xlsx (node-xlsx default export), path, fs,
// createdDate / trashFile / writeToServerDocuments (../../utils/files),
// tokenizeString (../../utils/tokenizer) and slugify.

/**
 * Render a single cell as a CSV field.
 *
 * null/undefined become an empty field. Any value whose text contains a comma,
 * a double quote or a line break is wrapped in double quotes, with embedded
 * double quotes doubled, so the field survives a round trip through a CSV
 * parser.
 *
 * @param {*} cell - Raw cell value from the parsed sheet.
 * @returns {string} The CSV-safe text for the cell.
 */
function formatCsvCell(cell) {
  if (cell === null || cell === undefined) return "";
  const text = String(cell);
  if (!/[",\r\n]/.test(text)) return text;
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Serialize sheet data (an array of rows, each an array of cells) as CSV text.
 * Rows are joined with "\n" and cells with ",".
 *
 * @param {Array<Array<*>>} data - Rows of cells.
 * @returns {string} CSV text; "" when `data` is not an array or has no rows.
 */
function convertToCSV(data) {
  if (!Array.isArray(data)) return "";
  return data
    .map((row) =>
      (Array.isArray(row) ? row : [])
        .map((cell) => formatCsvCell(cell))
        .join(",")
    )
    .join("\n");
}

/**
 * Return `baseSlug`, or `baseSlug-2`, `baseSlug-3`, ... if it was already
 * handed out, and record the returned value in `usedSlugs`.
 *
 * @param {string} baseSlug - Preferred slug.
 * @param {Set<string>} usedSlugs - Slugs already reserved; mutated.
 * @returns {string} A slug not previously present in `usedSlugs`.
 */
function reserveUniqueSlug(baseSlug, usedSlugs) {
  let candidate = baseSlug;
  for (let suffix = 2; usedSlugs.has(candidate); suffix += 1) {
    candidate = `${baseSlug}-${suffix}`;
  }
  usedSlugs.add(candidate);
  return candidate;
}

/**
 * Parse an .xlsx file and emit one document per non-empty worksheet.
 *
 * Each sheet is converted to CSV and handed to `writeToServerDocuments` with a
 * `sheet-<slug>` name inside a per-upload folder. A sheet that fails is logged
 * and skipped without aborting the remaining sheets. The uploaded file is
 * passed to `trashFile` in all cases once parsing has been attempted.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the uploaded workbook.
 * @param {string} [params.filename=""] - Original filename, used for titles.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success` is false when the workbook cannot be parsed or no sheet yields a
 *   document.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
    trim: true,
  });

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../server/storage/documents/${folderName}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

  try {
    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    // Distinct sheet names can slugify to the same string (or to ""), so keep
    // track of what was used to give every sheet its own output name.
    const usedSlugs = new Set();

    for (const sheet of workSheetsFromFile) {
      // Read the name outside the try so the catch below can still report it.
      const name = sheet?.name;
      try {
        const content = convertToCSV(sheet.data);

        // Also treats sheets made only of blank rows as empty.
        if (!content.trim().length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetSlug = reserveUniqueSlug(slugify(name), usedSlugs);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${sheetSlug}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        // NOTE(review): writeToServerDocuments lives in ../../utils/files;
        // presumably it persists sheetData under outFolderPath using the given
        // name and returns the stored document - confirm against that helper.
        const document = writeToServerDocuments(
          sheetData,
          `sheet-${sheetSlug}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        console.error(`Error processing sheet "${name}":`, err);
        continue;
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}
"https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d" + xml2js@^0.6.2: version "0.6.2" resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"