From 7ef4cd341b1e98d820660872d5cb676d5d6aaf48 Mon Sep 17 00:00:00 2001 From: Gabor Cseh Date: Fri, 22 Nov 2024 18:33:40 +0100 Subject: [PATCH] Adding JSONL support to json extractor (#705) * Add jsonl support into json extractor plugin * Add test case * Fix lint errors * Add changeset * Apply CodeRabbit suggestion --------- Co-authored-by: Gabor Cseh <17csibe17@gmail.com> Co-authored-by: Carl Brugger --- .changeset/tender-lizards-report.md | 6 + package-lock.json | 14 +- plugins/json-extractor/ref/test-basic.jsonl | 3 + plugins/json-extractor/src/index.ts | 2 +- plugins/json-extractor/src/parser.spec.ts | 332 +++++++++++--------- plugins/json-extractor/src/parser.ts | 24 +- utils/extractor/src/index.ts | 1 + 7 files changed, 218 insertions(+), 164 deletions(-) create mode 100644 .changeset/tender-lizards-report.md create mode 100644 plugins/json-extractor/ref/test-basic.jsonl diff --git a/.changeset/tender-lizards-report.md b/.changeset/tender-lizards-report.md new file mode 100644 index 000000000..02f502dc4 --- /dev/null +++ b/.changeset/tender-lizards-report.md @@ -0,0 +1,6 @@ +--- +'@flatfile/plugin-json-extractor': minor +'@flatfile/util-extractor': minor +--- + +This release adds support for JSONL files in the JSON Extractor plugin. diff --git a/package-lock.json b/package-lock.json index b2c0e754e..376d11792 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17431,7 +17431,7 @@ }, "plugins/autocast": { "name": "@flatfile/plugin-autocast", - "version": "5.0.0", + "version": "6.0.0", "license": "ISC", "dependencies": { "@flatfile/hooks": "^1.4.1", @@ -17440,7 +17440,7 @@ "devDependencies": { "@flatfile/bundler-config-tsup": "^0.2.0", "@flatfile/config-vitest": "^0.0.0", - "@flatfile/plugin-record-hook": "^1.10.0" + "@flatfile/plugin-record-hook": "^1.11.0" }, "engines": { "node": ">= 18" @@ -17448,7 +17448,7 @@ "peerDependencies": { "@flatfile/api": "^1.9.19", "@flatfile/listener": "^1.1.0", - "@flatfile/plugin-record-hook": "^1.10.0" + "@flatfile/plugin-record-hook": "^1.11.0" } }, "plugins/automap": { @@ -17474,12 +17474,12 @@ }, "plugins/constraints": { "name": "@flatfile/plugin-constraints", - "version": "6.0.0", + "version": "7.0.0", "license": "ISC", "devDependencies": { "@flatfile/bundler-config-tsup": "^0.2.0", "@flatfile/config-vitest": "^0.0.0", - "@flatfile/plugin-record-hook": "^1.10.0" + "@flatfile/plugin-record-hook": "^1.11.0" }, "engines": { "node": ">= 18" @@ -17487,7 +17487,7 @@ "peerDependencies": { "@flatfile/api": "^1.9.19", "@flatfile/listener": "^1.1.0", - "@flatfile/plugin-record-hook": "^1.10.0" + "@flatfile/plugin-record-hook": "^1.11.0" } }, "plugins/dedupe": { @@ -17782,7 +17782,7 @@ }, "plugins/record-hook": { "name": "@flatfile/plugin-record-hook", - "version": "1.10.0", + "version": "1.11.1", "license": "ISC", "dependencies": { "@flatfile/util-common": "^1.6.0" diff --git a/plugins/json-extractor/ref/test-basic.jsonl b/plugins/json-extractor/ref/test-basic.jsonl new file mode 100644 index 000000000..5a503c1e3 --- /dev/null +++ b/plugins/json-extractor/ref/test-basic.jsonl @@ -0,0 +1,3 @@ +{"First Name": "Tony","Last Name": "Lamb","Email": "me@opbaj.tp","Address": {"Street": "123 Main Street","City": "Springfield","State": "ST","Zip": "12345","Coordinates": {"Latitude": "40.7128° N","Longitude": "74.0060° W"}},"Father": {"First Name": "Father_First_1","Last Name": "Father_Last_1","Father": {"First Name": "Father_First_2","Last Name": "Father_Last_2","Father": {"First Name": "Father_First_3","Last Name": "Father_Last_3","Father": {"First Name": "Father_First_4","Last Name": "Father_Last_4","Father": {"First Name": "Father_First_5","Last Name": "Father_Last_5","Father": null}}}}}} +{"First Name": "Christian","Last Name": "Ramos","Email": "uw@ag.tg","Address": {"Street": "456 Elm Street","City": "Greenville","State": "GT","Zip": "67890","Coordinates": {"Latitude": "40.7128° N","Longitude": "74.0060° W"}},"Father": {"First Name": "Father_First_1","Last Name": "Father_Last_1","Father": {"First Name": "Father_First_2","Last Name": "Father_Last_2","Father": {"First Name": "Father_First_3","Last Name": "Father_Last_3","Father": {"First Name": "Father_First_4","Last Name": "Father_Last_4","Father": {"First Name": "Father_First_5","Last Name": "Father_Last_5","Father": null}}}}}} +{"First Name": "Frederick","Last Name": "Boyd","Email": "kempur@ascebec.gs","Address": {"Street": "789 Oak Street","City": "Rivertown","State": "RT","Zip": "10112","Coordinates": {"Latitude": "40.7128° N","Longitude": "74.0060° W"}},"Father": {"First Name": "Father_First_1","Last Name": "Father_Last_1","Father": {"First Name": "Father_First_2","Last Name": "Father_Last_2","Father": {"First Name": "Father_First_3","Last Name": "Father_Last_3","Father": {"First Name": "Father_First_4","Last Name": "Father_Last_4","Father": {"First Name": "Father_First_5","Last Name": "Father_Last_5","Father": null}}}}}} diff --git a/plugins/json-extractor/src/index.ts b/plugins/json-extractor/src/index.ts index 5760db1dc..6a1d1db6c 100644 --- a/plugins/json-extractor/src/index.ts +++ b/plugins/json-extractor/src/index.ts @@ -8,7 +8,7 @@ export interface PluginOptions { } export const JSONExtractor = (options?: PluginOptions) => { - return Extractor('.json', 'json', parseBuffer, options) + return Extractor(/\.(jsonl?|jsonlines)$/i, 'json', parseBuffer, options) } export const jsonParser = parseBuffer diff --git a/plugins/json-extractor/src/parser.spec.ts b/plugins/json-extractor/src/parser.spec.ts index 6c7959b4a..f64b6dbac 100644 --- a/plugins/json-extractor/src/parser.spec.ts +++ b/plugins/json-extractor/src/parser.spec.ts @@ -5,165 +5,189 @@ import { parseBuffer, parseSheet } from './parser' describe('parser', function () { describe('parser single sheet', function () { - const buffer: Buffer = fs.readFileSync( - path.join(__dirname, '../ref/test-basic.json') - ) - const singleSheetCapture = parseBuffer(buffer) - - it('has a single sheet', () => { - expect(singleSheetCapture).toEqual({ - Sheet1: { - headers: [ - 'First Name', - 'Last Name', - 'Email', - 'Address.Street', - 'Address.City', - 'Address.State', - 'Address.Zip', - 'Address.Coordinates.Latitude', - 'Address.Coordinates.Longitude', - 'Father.First Name', - 'Father.Last Name', - 'Father.Father.First Name', - 'Father.Father.Last Name', - 'Father.Father.Father.First Name', - 'Father.Father.Father.Last Name', - 'Father.Father.Father.Father.First Name', - 'Father.Father.Father.Father.Last Name', - 'Father.Father.Father.Father.Father.First Name', - 'Father.Father.Father.Father.Father.Last Name', - ], - data: [ - { - 'First Name': { value: 'Tony' }, - 'Last Name': { value: 'Lamb' }, - Email: { value: 'me@opbaj.tp' }, - 'Address.Street': { value: '123 Main Street' }, - 'Address.City': { value: 'Springfield' }, - 'Address.State': { value: 'ST' }, - 'Address.Zip': { value: '12345' }, - 'Address.Coordinates.Latitude': { value: '40.7128° N' }, - 'Address.Coordinates.Longitude': { value: '74.0060° W' }, - 'Father.First Name': { - value: 'Father_First_1', - }, - 'Father.Last Name': { - value: 'Father_Last_1', - }, - 'Father.Father.First Name': { - value: 'Father_First_2', - }, - 'Father.Father.Last Name': { - value: 'Father_Last_2', - }, - 'Father.Father.Father.First Name': { - value: 'Father_First_3', - }, - 'Father.Father.Father.Last Name': { - value: 'Father_Last_3', - }, - 'Father.Father.Father.Father.First Name': { - value: 'Father_First_4', - }, - 'Father.Father.Father.Father.Last Name': { - value: 'Father_Last_4', - }, - 'Father.Father.Father.Father.Father.First Name': { - value: 'Father_First_5', - }, - 'Father.Father.Father.Father.Father.Last Name': { - value: 'Father_Last_5', - }, + const expectedSingleSheetCapture = { + Sheet1: { + headers: [ + 'First Name', + 'Last Name', + 'Email', + 'Address.Street', + 'Address.City', + 'Address.State', + 'Address.Zip', + 'Address.Coordinates.Latitude', + 'Address.Coordinates.Longitude', + 'Father.First Name', + 'Father.Last Name', + 'Father.Father.First Name', + 'Father.Father.Last Name', + 'Father.Father.Father.First Name', + 'Father.Father.Father.Last Name', + 'Father.Father.Father.Father.First Name', + 'Father.Father.Father.Father.Last Name', + 'Father.Father.Father.Father.Father.First Name', + 'Father.Father.Father.Father.Father.Last Name', + ], + data: [ + { + 'First Name': { value: 'Tony' }, + 'Last Name': { value: 'Lamb' }, + Email: { value: 'me@opbaj.tp' }, + 'Address.Street': { value: '123 Main Street' }, + 'Address.City': { value: 'Springfield' }, + 'Address.State': { value: 'ST' }, + 'Address.Zip': { value: '12345' }, + 'Address.Coordinates.Latitude': { value: '40.7128° N' }, + 'Address.Coordinates.Longitude': { value: '74.0060° W' }, + 'Father.First Name': { + value: 'Father_First_1', }, - { - 'First Name': { value: 'Christian' }, - 'Last Name': { value: 'Ramos' }, - Email: { value: 'uw@ag.tg' }, - 'Address.Street': { value: '456 Elm Street' }, - 'Address.City': { value: 'Greenville' }, - 'Address.State': { value: 'GT' }, - 'Address.Zip': { value: '67890' }, - 'Address.Coordinates.Latitude': { value: '40.7128° N' }, - 'Address.Coordinates.Longitude': { value: '74.0060° W' }, - 'Father.First Name': { - value: 'Father_First_1', - }, - 'Father.Last Name': { - value: 'Father_Last_1', - }, - 'Father.Father.First Name': { - value: 'Father_First_2', - }, - 'Father.Father.Last Name': { - value: 'Father_Last_2', - }, - 'Father.Father.Father.First Name': { - value: 'Father_First_3', - }, - 'Father.Father.Father.Last Name': { - value: 'Father_Last_3', - }, - 'Father.Father.Father.Father.First Name': { - value: 'Father_First_4', - }, - 'Father.Father.Father.Father.Last Name': { - value: 'Father_Last_4', - }, - 'Father.Father.Father.Father.Father.First Name': { - value: 'Father_First_5', - }, - 'Father.Father.Father.Father.Father.Last Name': { - value: 'Father_Last_5', - }, + 'Father.Last Name': { + value: 'Father_Last_1', }, - { - 'First Name': { value: 'Frederick' }, - 'Last Name': { value: 'Boyd' }, - Email: { value: 'kempur@ascebec.gs' }, - 'Address.Street': { value: '789 Oak Street' }, - 'Address.City': { value: 'Rivertown' }, - 'Address.State': { value: 'RT' }, - 'Address.Zip': { value: '10112' }, - 'Address.Coordinates.Latitude': { value: '40.7128° N' }, - 'Address.Coordinates.Longitude': { value: '74.0060° W' }, - 'Father.First Name': { - value: 'Father_First_1', - }, - 'Father.Last Name': { - value: 'Father_Last_1', - }, - 'Father.Father.First Name': { - value: 'Father_First_2', - }, - 'Father.Father.Last Name': { - value: 'Father_Last_2', - }, - 'Father.Father.Father.First Name': { - value: 'Father_First_3', - }, - 'Father.Father.Father.Last Name': { - value: 'Father_Last_3', - }, - 'Father.Father.Father.Father.First Name': { - value: 'Father_First_4', - }, - 'Father.Father.Father.Father.Last Name': { - value: 'Father_Last_4', - }, - 'Father.Father.Father.Father.Father.First Name': { - value: 'Father_First_5', - }, - 'Father.Father.Father.Father.Father.Last Name': { - value: 'Father_Last_5', - }, + 'Father.Father.First Name': { + value: 'Father_First_2', }, - ], - metadata: undefined, - }, - }) + 'Father.Father.Last Name': { + value: 'Father_Last_2', + }, + 'Father.Father.Father.First Name': { + value: 'Father_First_3', + }, + 'Father.Father.Father.Last Name': { + value: 'Father_Last_3', + }, + 'Father.Father.Father.Father.First Name': { + value: 'Father_First_4', + }, + 'Father.Father.Father.Father.Last Name': { + value: 'Father_Last_4', + }, + 'Father.Father.Father.Father.Father.First Name': { + value: 'Father_First_5', + }, + 'Father.Father.Father.Father.Father.Last Name': { + value: 'Father_Last_5', + }, + }, + { + 'First Name': { value: 'Christian' }, + 'Last Name': { value: 'Ramos' }, + Email: { value: 'uw@ag.tg' }, + 'Address.Street': { value: '456 Elm Street' }, + 'Address.City': { value: 'Greenville' }, + 'Address.State': { value: 'GT' }, + 'Address.Zip': { value: '67890' }, + 'Address.Coordinates.Latitude': { value: '40.7128° N' }, + 'Address.Coordinates.Longitude': { value: '74.0060° W' }, + 'Father.First Name': { + value: 'Father_First_1', + }, + 'Father.Last Name': { + value: 'Father_Last_1', + }, + 'Father.Father.First Name': { + value: 'Father_First_2', + }, + 'Father.Father.Last Name': { + value: 'Father_Last_2', + }, + 'Father.Father.Father.First Name': { + value: 'Father_First_3', + }, + 'Father.Father.Father.Last Name': { + value: 'Father_Last_3', + }, + 'Father.Father.Father.Father.First Name': { + value: 'Father_First_4', + }, + 'Father.Father.Father.Father.Last Name': { + value: 'Father_Last_4', + }, + 'Father.Father.Father.Father.Father.First Name': { + value: 'Father_First_5', + }, + 'Father.Father.Father.Father.Father.Last Name': { + value: 'Father_Last_5', + }, + }, + { + 'First Name': { value: 'Frederick' }, + 'Last Name': { value: 'Boyd' }, + Email: { value: 'kempur@ascebec.gs' }, + 'Address.Street': { value: '789 Oak Street' }, + 'Address.City': { value: 'Rivertown' }, + 'Address.State': { value: 'RT' }, + 'Address.Zip': { value: '10112' }, + 'Address.Coordinates.Latitude': { value: '40.7128° N' }, + 'Address.Coordinates.Longitude': { value: '74.0060° W' }, + 'Father.First Name': { + value: 'Father_First_1', + }, + 'Father.Last Name': { + value: 'Father_Last_1', + }, + 'Father.Father.First Name': { + value: 'Father_First_2', + }, + 'Father.Father.Last Name': { + value: 'Father_Last_2', + }, + 'Father.Father.Father.First Name': { + value: 'Father_First_3', + }, + 'Father.Father.Father.Last Name': { + value: 'Father_Last_3', + }, + 'Father.Father.Father.Father.First Name': { + value: 'Father_First_4', + }, + 'Father.Father.Father.Father.Last Name': { + value: 'Father_Last_4', + }, + 'Father.Father.Father.Father.Father.First Name': { + value: 'Father_First_5', + }, + 'Father.Father.Father.Father.Father.Last Name': { + value: 'Father_Last_5', + }, + }, + ], + metadata: undefined, + }, + } + + it('has a single sheet from json input', () => { + const buffer: Buffer = fs.readFileSync( + path.join(__dirname, '../ref/test-basic.json') + ) + const singleSheetCapture = parseBuffer(buffer) + + expect(singleSheetCapture).toEqual(expectedSingleSheetCapture) + }) + + it('has a single sheet from jsonl input', () => { + const buffer: Buffer = fs.readFileSync( + path.join(__dirname, '../ref/test-basic.jsonl') + ) + const singleSheetCapture = parseBuffer(buffer, { fileExt: 'jsonl' }) + + expect(singleSheetCapture).toEqual(expectedSingleSheetCapture) + }) + + it('handles empty lines in JSONL', () => { + const buffer = Buffer.from('{"a": 1}\n\n{"b": 2}') + const result = parseBuffer(buffer, { fileExt: 'jsonl' }) + expect(result.Sheet1.data).toHaveLength(2) + }) + + it('skips invalid lines in JSONL', () => { + const buffer = Buffer.from('{"a": 1}\n{invalid}\n{"b": 2}') + const result = parseBuffer(buffer, { fileExt: 'jsonl' }) + expect(result.Sheet1.data).toHaveLength(2) }) }) + describe('parser multisheet', function () { const buffer: Buffer = fs.readFileSync( path.join(__dirname, '../ref/test-multisheet.json') diff --git a/plugins/json-extractor/src/parser.ts b/plugins/json-extractor/src/parser.ts index 133f9e4bc..28417dff3 100644 --- a/plugins/json-extractor/src/parser.ts +++ b/plugins/json-extractor/src/parser.ts @@ -1,14 +1,34 @@ import { SheetCapture, WorkbookCapture } from '@flatfile/util-extractor' -export function parseBuffer(buffer: Buffer): WorkbookCapture { +export function parseBuffer( + buffer: Buffer, + options?: { readonly fileExt?: string } +): WorkbookCapture { try { - const fileContents = buffer.toString('utf8') + let fileContents = buffer.toString('utf8') if (!fileContents) { console.log('Invalid file contents') return {} as WorkbookCapture } + if (options?.fileExt === 'jsonl' || options?.fileExt === 'jsonlines') { + const lines = fileContents + .split('\n') + .filter((line) => line.trim() !== '') + .map((line) => { + try { + JSON.parse(line) + return line + } catch (e) { + console.error('Invalid JSON line:', line) + return null + } + }) + .filter((line) => line !== null) + fileContents = `[${lines.join(',')}]` + } + const parsedData = JSON.parse(fileContents) if (typeof parsedData !== 'object' || parsedData === null) { console.error('Invalid input: data must be an object.') diff --git a/utils/extractor/src/index.ts b/utils/extractor/src/index.ts index 02296aed0..d2d70768a 100644 --- a/utils/extractor/src/index.ts +++ b/utils/extractor/src/index.ts @@ -79,6 +79,7 @@ export const Extractor = ( const capture = await parseBuffer(buffer, { ...options, fileId, + fileExt: file.ext, headerSelectionEnabled, })