diff --git a/.changeset/spotty-months-own.md b/.changeset/spotty-months-own.md new file mode 100644 index 000000000..adfd9dcfe --- /dev/null +++ b/.changeset/spotty-months-own.md @@ -0,0 +1,6 @@ +--- +'@flatfile/plugin-xlsx-extractor': minor +'@flatfile/util-extractor': patch +--- + +Improved header detection options. diff --git a/plugins/xlsx-extractor/src/header.detection.ts b/plugins/xlsx-extractor/src/header.detection.ts new file mode 100644 index 000000000..cdcb9c53e --- /dev/null +++ b/plugins/xlsx-extractor/src/header.detection.ts @@ -0,0 +1,188 @@ +import stream from 'stream' + +export const ROWS_TO_SEARCH_FOR_HEADER = 10 + +interface DefaultOptions { + algorithm: 'default' + rowsToSearch?: number +} + +interface ExplicitHeadersOptions { + algorithm: 'explicitHeaders' + headers: string[] + skip?: number +} + +interface SpecificRowsOptions { + algorithm: 'specificRows' + rowNumbers: number[] + skip?: number +} + +interface NewfangledOptions { + algorithm: 'newfangled' +} + +export type GetHeadersOptions = + | DefaultOptions + | ExplicitHeadersOptions + | SpecificRowsOptions + | NewfangledOptions + +interface GetHeadersResult { + header: string[] + skip: number +} + +// Takes a datastream (representing a CSV) and returns the header row and the number of rows to skip +export abstract class Headerizer { + constructor() {} + abstract getHeaders(dataStream: stream.Readable): Promise<GetHeadersResult> + + static create(options: GetHeadersOptions): Headerizer { + switch (options.algorithm) { + case 'explicitHeaders': + return new ExplicitHeaders(options) + case 'specificRows': + return new SpecificRows(options) + case 'newfangled': + throw new Error('Not implemented') + default: + return new OriginalDetector(options) + } + } +} + +export const countNonEmptyCells = (row: string[]): number => { + return row.filter((cell) => cell.trim() !== '').length +} + +// This is the original / default implementation of detectHeader. 
+// It looks at the first `rowsToSearch` rows and takes the row +// with the most non-empty cells as the header, preferring the earliest +// such row in the case of a tie. +class OriginalDetector extends Headerizer { + private rowsToSearch: number + + constructor(private options: DefaultOptions) { + super() + this.rowsToSearch = options.rowsToSearch || ROWS_TO_SEARCH_FOR_HEADER + } + + async getHeaders(dataStream: stream.Readable): Promise<GetHeadersResult> { + let currentRow = 0 + let skip = 0 + let header: string[] = [] + + // This is the original implementation of detectHeader + const detector = new stream.Writable({ + objectMode: true, + write: (row, encoding, callback) => { + currentRow++ + if (currentRow >= this.rowsToSearch) { + dataStream.destroy() + } + if (countNonEmptyCells(row) > countNonEmptyCells(header)) { + header = row + skip = currentRow + } + callback() + }, + }) + + dataStream.pipe(detector, { end: true }) + + return new Promise((resolve, reject) => { + detector.on('finish', () => { + resolve({ header, skip }) + }) + dataStream.on('close', () => { + resolve({ header, skip }) + }) + dataStream.on('error', (error) => { + reject(error) + }) + }) + } +} + +// This implementation simply returns an explicit list of headers +// it was provided with. +class ExplicitHeaders extends Headerizer { + headers: string[] + constructor(private readonly options: ExplicitHeadersOptions) { + super() + + if (!options.headers || options.headers.length === 0) { + throw new Error('ExplicitHeaders requires at least one header') + } + } + + async getHeaders(dataStream: stream.Readable): Promise<GetHeadersResult> { + return { + header: this.options.headers, + skip: this.options.skip || 0, + } + } +} + +// This implementation looks at specific rows and combines them into a single header. 
+// For example, if you knew that the header was in the third row, you could pass it +// { rowNumbers: [2] } +class SpecificRows extends Headerizer { + constructor(private readonly options: SpecificRowsOptions) { + super() + + if (!options.rowNumbers || options.rowNumbers.length === 0) { + throw new Error('SpecificRows requires at least one row number') + } + } + + async getHeaders(dataStream: stream.Readable): Promise<GetHeadersResult> { + let currentRow = 0 + let maxRow = Math.max(...this.options.rowNumbers) + let header: string[] = [] + + const detector = new stream.Writable({ + objectMode: true, + write: (row, encoding, callback) => { + if (currentRow > maxRow) { + dataStream.destroy() + } else if (this.options.rowNumbers.includes(currentRow)) { + if (header.length === 0) { + // This is the first header row we've seen, so just remember it + header = row + } else { + for (let i = 0; i < header.length; i++) { + if (header[i] === '') { + header[i] = row[i].trim() + } else { + header[i] = `${header[i].trim()} ${row[i].trim()}` + } + } + } + } + currentRow++ + callback() + }, + }) + + dataStream.pipe(detector, { end: true }) + + // If we have an explicit skip, use it, otherwise skip past the last header row + const skip = this.options.skip ?? maxRow + 1 + + // TODO: this logic is duplicated, factor it out? 
+ return new Promise((resolve, reject) => { + detector.on('finish', () => { + resolve({ header, skip }) + }) + dataStream.on('close', () => { + resolve({ header, skip }) + }) + dataStream.on('error', (error) => { + reject(error) + }) + }) + } +} diff --git a/plugins/xlsx-extractor/src/index.ts b/plugins/xlsx-extractor/src/index.ts index b1610e41c..66b23d92a 100644 --- a/plugins/xlsx-extractor/src/index.ts +++ b/plugins/xlsx-extractor/src/index.ts @@ -1,4 +1,5 @@ import { Extractor } from '@flatfile/util-extractor' +import { GetHeadersOptions } from './header.detection' import { parseBuffer } from './parser' /** @@ -15,6 +16,7 @@ export interface ExcelExtractorOptions { readonly rawNumbers?: boolean readonly chunkSize?: number readonly parallel?: number + readonly headerDetectionOptions?: GetHeadersOptions readonly debug?: boolean } diff --git a/plugins/xlsx-extractor/src/parser.spec.ts b/plugins/xlsx-extractor/src/parser.spec.ts index a627a6405..23306c33b 100644 --- a/plugins/xlsx-extractor/src/parser.spec.ts +++ b/plugins/xlsx-extractor/src/parser.spec.ts @@ -1,13 +1,18 @@ -import { parseBuffer } from './parser' +import { WorkbookCapture } from '@flatfile/util-extractor' import * as fs from 'fs' import * as path from 'path' +import { parseBuffer } from './parser' describe('parser', () => { const buffer: Buffer = fs.readFileSync( path.join(__dirname, '../ref/test-basic.xlsx') ) - test('Excel to WorkbookCapture', () => { - expect(parseBuffer(buffer).Departments).toEqual({ + let capture: WorkbookCapture + beforeAll(async () => { + capture = await parseBuffer(buffer) + }) + test('Excel to WorkbookCapture', async () => { + expect(capture.Departments).toEqual({ headers: ['Code', 'Details', 'BranchName', 'Tenant'], required: { Code: true, Details: false, BranchName: true, Tenant: true }, data: [ @@ -27,9 +32,8 @@ describe('parser', () => { }) }) - describe('test-basic.xlsx', function () { - const capture = parseBuffer(buffer) - test('finds all the sheet names', () => { + 
describe('test-basic.xlsx', () => { + test('finds all the sheet names', async () => { expect(Object.keys(capture)).toEqual([ 'Departments', 'Clients', diff --git a/plugins/xlsx-extractor/src/parser.ts b/plugins/xlsx-extractor/src/parser.ts index 4b2efd3a8..158aa9434 100644 --- a/plugins/xlsx-extractor/src/parser.ts +++ b/plugins/xlsx-extractor/src/parser.ts @@ -1,27 +1,40 @@ -import * as XLSX from 'xlsx' -import { mapKeys, mapValues } from 'remeda' -import { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor' import { Flatfile } from '@flatfile/api' +import { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor' +import { mapKeys, mapValues } from 'remeda' +import { Readable } from 'stream' +import * as XLSX from 'xlsx' +import { GetHeadersOptions, Headerizer } from './header.detection' -export function parseBuffer( +export async function parseBuffer( buffer: Buffer, options?: { raw?: boolean rawNumbers?: boolean + headerDetectionOptions?: GetHeadersOptions } -): WorkbookCapture { +): Promise<WorkbookCapture> { const workbook = XLSX.read(buffer, { type: 'buffer', cellDates: true, }) + const sheetNames = Object.keys(workbook.Sheets) - return mapValues(workbook.Sheets, (value, key) => { - return convertSheet( - value, - options?.rawNumbers || false, - options?.raw || false - ) - }) + const processedSheets = await Promise.all( + sheetNames.map(async (sheetName) => { + const value = workbook.Sheets[sheetName] + const processedValue = await convertSheet( + value, + options?.rawNumbers || false, + options?.raw || false, + options?.headerDetectionOptions || { + algorithm: 'default', + } + ) + return [sheetName, processedValue] + }) + ) + + return Object.fromEntries(processedSheets) } /** @@ -29,41 +42,54 @@ export function parseBuffer( * * @param sheet */ -function convertSheet( +async function convertSheet( sheet: XLSX.WorkSheet, - rawNumbers: boolean, - raw: boolean -): SheetCapture { + rawNumbers: boolean = false, + raw: boolean = false, + headerDetectionOptions?: 
GetHeadersOptions +): Promise<SheetCapture> { let rows = XLSX.utils.sheet_to_json<Record<string, any>>(sheet, { header: 'A', defval: null, - rawNumbers: rawNumbers || false, - raw: raw || false, + rawNumbers, + raw, }) - const { headerRow, skip } = detectHeader(rows) + const extractValues = (data: Record<string, any>[]) => + data.map((row) => Object.values(row).filter((value) => value !== null)) + + const headerizer = Headerizer.create(headerDetectionOptions) + const headerStream = Readable.from(extractValues(rows)) + const { header, skip } = await headerizer.getHeaders(headerStream) rows.splice(0, skip) - const headers = prependNonUniqueHeaderColumns(headerRow) - const required: Record<string, boolean> = {} - Object.keys(headerRow).forEach((key) => { - const newKey = headers[key] - if (newKey) { - required[newKey] = headerRow[key]?.toString().includes('*') ?? false - } - }) + const toExcelHeader = (data: string[], keys: string[]) => + data.reduce((result, value, index) => { + result[keys[index]] = value + return result + }, {}) - const data: Flatfile.RecordData[] = rows + const columnKeys = Object.keys(rows[0]) + const excelHeader = toExcelHeader(header, columnKeys) + const headers = prependNonUniqueHeaderColumns(excelHeader) + const required = Object.fromEntries( + Object.entries(excelHeader).map(([key, value]) => [ + headers[key], + value?.toString().includes('*') ?? 
false, + ]) + ) + + const data = rows .filter((row) => !Object.values(row).every(isNullOrWhitespace)) - .map((row) => { - const mappedRow = mapKeys(row, (key) => headers[key]) - return mapValues(mappedRow, (value) => ({ - value: value, - })) as Flatfile.RecordData - }) + .map((row) => + mapValues( + mapKeys(row, (key) => headers[key]), + (value) => ({ value }) + ) + ) return { - headers: Object.values(headers).filter((v) => v) as string[], + headers: Object.values(headers).filter(Boolean), required, data, } @@ -90,31 +116,3 @@ function prependNonUniqueHeaderColumns( const isNullOrWhitespace = (value: any) => value === null || (typeof value === 'string' && value.trim() === '') - -const detectHeader = ( - rows: Record<string, any>[] -): { headerRow: Record<string, any>; skip: number } => { - const ROWS_TO_CHECK = 10 - - let skip = 0 - let widestRow: Record<string, any> = {} - let widestRowCount = 0 - - for (let i = 0; i < Math.min(rows.length, ROWS_TO_CHECK); i++) { - const row = rows[i] - const rowCount = countNonEmptyCells(row) - if (rowCount > widestRowCount) { - widestRow = row - widestRowCount = rowCount - skip = i + 1 - } - } - - return { headerRow: widestRow, skip } -} - -const countNonEmptyCells = (row: Record<string, any>): number => { - return Object.values(row).filter( - (cell) => cell && cell.toString().trim() !== '' - ).length -} diff --git a/utils/extractor/src/index.ts b/utils/extractor/src/index.ts index 9e681b5e2..79f593ecd 100644 --- a/utils/extractor/src/index.ts +++ b/utils/extractor/src/index.ts @@ -7,7 +7,10 @@ import { mapValues } from 'remeda' export const Extractor = ( fileExt: string | RegExp, extractorType: string, - parseBuffer: (buffer: Buffer, options: any) => WorkbookCapture, + parseBuffer: ( + buffer: Buffer, + options: any + ) => WorkbookCapture | Promise<WorkbookCapture>, options?: Record<string, any> ) => { return (listener: FlatfileListener) => { @@ -56,7 +59,7 @@ export const Extractor = ( } await tick(3, 'Parsing Sheets') - const capture = parseBuffer(buffer, options) + const capture = await 
parseBuffer(buffer, options) const workbook = await createWorkbook( event.context.environmentId, file,