From 694bc16e9f5af2198936cfd20aa450eaf642f988 Mon Sep 17 00:00:00 2001 From: Johnie Hjelm Date: Sat, 26 Oct 2024 17:09:46 +0200 Subject: [PATCH 1/4] feat: enhance README with nested field support for image extraction and improved feature list formatting --- README.md | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2e05b7e..0a7648b 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,13 @@ Effect Schema, allowing you to use your preferred validation tool. ## Features -- HTML Parsing: Extract data from HTML using CSS selectors with the help of +- **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio). -- Schema Validation: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod). -- Custom Transformations: Provide custom transformations for extractedattributes. -- Default Values: Define default values for missing data fields. +- **Schema Validation**: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod). +- **Custom Transformations**: Provide custom transformations for extractedattributes. +- **Default Values**: Define default values for missing data fields. +- **Nested Field Support**: Define and extract nested data structures from + HTML elements. ### Schema Support @@ -48,6 +50,14 @@ const schema = z.object({ description: z.string(), keywords: z.array(z.string()), views: z.number(), + image: z + .object({ + url: z.string(), + width: z.number(), + height: z.number(), + }) + .default({ url: '', width: 0, height: 0 }) + .optional(), }); ``` @@ -78,6 +88,25 @@ const fields: FieldDefinitions = { transform: (value) => parseInt(value, 10), defaultValue: 0, }, + // Example of a nested field + image: { + fields: { + url: { + selector: 'meta[property="og:image"]', + attribute: 'content', + }, + width: { + selector: 'meta[property="og:image:width"]', + attribute: 'content', + transform: (value) => parseInt(value, 10), + }, + height: { + selector: 'meta[property="og:image:height"]', + attribute: 'content', + transform: (value) => parseInt(value, 10), + }, + }, + }, }; ``` @@ -96,6 +125,9 @@ const html = ` + + + Example Title @@ -111,6 +143,11 @@ console.log(data); // description: 'An example description.', // keywords: ['typescript', 'html', 'parsing'], // views: 1234 +// image: { +// url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372', +// width: 1372, +// height: 708 +// } // } ``` From 2268a8cfa58af35c58fa18f7c28b40d2c3824b92 Mon Sep 17 00:00:00 2001 From: Johnie Hjelm Date: Sat, 26 Oct 2024 17:09:54 +0200 Subject: [PATCH 2/4] feat: expand FieldDefinition type to support nested field definitions in scrape configuration --- src/types.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/types.ts b/src/types.ts index a463955..fd57456 100644 --- a/src/types.ts +++ b/src/types.ts @@ -3,12 +3,18 @@ export type ScrapeConfig = { validator: SchemaValidator; }; -export type FieldDefinition = { - selector: string; - attribute?: string; - transform?: (value: string) => T; - defaultValue?: T; - multiple?: boolean; +export type FieldDefinition = + | { + selector: string; + attribute?: string; + transform?: (value: string) => T; + defaultValue?: T; + multiple?: boolean; + } + | NestedFieldDefinition; + +type NestedFieldDefinition = { + fields: SchemaFieldDefinitions; }; export type SchemaFieldDefinitions = { From 55dabadfe8b0b7b16912cbd62fdf00a75972ecf3 Mon Sep 17 00:00:00 2001 From: Johnie Hjelm Date: Sat, 26 Oct 2024 17:10:10 +0200 Subject: [PATCH 3/4] feat: add extractData helper to support nested field definitions --- src/createScraper.ts | 54 +++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/createScraper.ts b/src/createScraper.ts index bf8af90..b465642 100644 --- a/src/createScraper.ts +++ b/src/createScraper.ts @@ -1,24 +1,30 @@ import * as cheerio from 'cheerio'; -import { type ScrapeConfig } from '@/types.js'; +import { type ScrapeConfig, type SchemaFieldDefinitions } from '@/types.js'; -export const createScraper = ({ - fields, - validator, -}: ScrapeConfig): ((html: string) => T) => { - return (html: string): T => { - const $ = cheerio.load(html); - const data: Partial> = {}; +const extractData = ( + fields: SchemaFieldDefinitions, + $context: cheerio.CheerioAPI, +): Partial => { + const data: Partial = {}; - for (const key in fields) { - const fieldDef = fields[key]; - const elements = $(fieldDef.selector); + for (const key in fields) { + const fieldDef = fields[key]; + if ('fields' in fieldDef) { + const nestedData = extractData( + fieldDef.fields as SchemaFieldDefinitions, + $context, + ); + + data[key as keyof U] = nestedData as U[typeof key]; + } else { + const elements = $context(fieldDef.selector); let values: string[] = []; elements.each((_, element) => { const value = fieldDef.attribute - ? $(element).attr(fieldDef.attribute) - : $(element).text(); + ? $context(element).attr(fieldDef.attribute) + : $context(element).text().trim(); if (value !== undefined) { values.push(value); @@ -26,18 +32,30 @@ export const createScraper = ({ }); if (values.length === 0 && fieldDef.defaultValue !== undefined) { - data[key] = fieldDef.defaultValue; + data[key as keyof U] = fieldDef.defaultValue as U[typeof key]; } else if (fieldDef.multiple) { - data[key] = values.map((value) => + data[key as keyof U] = values.map((value) => fieldDef.transform ? fieldDef.transform(value) : value, - ); + ) as U[typeof key]; } else { const value = values[0]; - data[key] = - fieldDef.transform && value ? fieldDef.transform(value) : value; + data[key as keyof U] = ( + fieldDef.transform && value ? fieldDef.transform(value) : value + ) as U[typeof key]; } } + } + return data; +}; + +export const createScraper = ({ + fields, + validator, +}: ScrapeConfig): ((html: cheerio.CheerioAPI | string) => T) => { + return (html: cheerio.CheerioAPI | string): T => { + const $ = typeof html === 'string' ? cheerio.load(html) : html; + const data = extractData(fields, $); return validator.validate(data); }; }; From e64439e4b48288a06daa0bd63caafd109d6e257c Mon Sep 17 00:00:00 2001 From: Johnie Hjelm Date: Sat, 26 Oct 2024 17:15:44 +0200 Subject: [PATCH 4/4] feat: implement support for nested schemas in scraper tests and add validations for nested image data extraction --- src/createScraper.test.ts | 70 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/createScraper.test.ts b/src/createScraper.test.ts index dc2d0d6..21c1b41 100644 --- a/src/createScraper.test.ts +++ b/src/createScraper.test.ts @@ -11,7 +11,22 @@ const schema = z.object({ views: z.number(), }); +const schemaWithNested = z.object({ + title: z.string().default('No title nested'), + image: z + .object({ + url: z.string(), + width: z.number(), + height: z.number(), + }) + .default({ url: '', width: 0, height: 0 }) + .optional(), +}); + type FieldDefinitions = SchemaFieldDefinitions>; +type NestedFieldDefinitions = SchemaFieldDefinitions< + z.infer +>; const fields: FieldDefinitions = { title: { @@ -36,6 +51,30 @@ const fields: FieldDefinitions = { }, }; +const nestedFields: NestedFieldDefinitions = { + title: { + selector: 'title', + }, + image: { + fields: { + url: { + selector: 'meta[property="og:image"]', + attribute: 'content', + }, + width: { + selector: 'meta[property="og:image:width"]', + attribute: 'content', + transform: (value) => parseInt(value, 10), + }, + height: { + selector: 'meta[property="og:image:height"]', + attribute: 'content', + transform: (value) => parseInt(value, 10), + }, + }, + }, +}; + const html = ` @@ -49,6 +88,19 @@ const html = ` `; +const htmlWithNested = ` + + + + Example Title + + + + + + +`; + describe('xscrape', () => { test('extracts data from HTML', () => { const validator = new ZodValidator(schema); @@ -114,4 +166,22 @@ describe('xscrape', () => { expect(error).toBeInstanceOf(Error); } }); + + test('extracts nested data from HTML', () => { + const validator = new ZodValidator(schemaWithNested); + const scraper = createScraper({ + fields: nestedFields, + validator, + }); + const data = scraper(htmlWithNested); + + expect(data).toEqual({ + title: 'Example Title', + image: { + url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372', + width: 1372, + height: 708, + }, + }); + }); });