Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for nested fields #1

Merged
merged 4 commits into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ Effect Schema, allowing you to use your preferred validation tool.

## Features

- HTML Parsing: Extract data from HTML using CSS selectors with the help of
- **HTML Parsing**: Extract data from HTML using CSS selectors with the help of
[cheerio](https://github.com/cheeriojs/cheerio).
- Schema Validation: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod).
- Custom Transformations: Provide custom transformations for extracted attributes.
- Default Values: Define default values for missing data fields.
- **Schema Validation**: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod).
- **Custom Transformations**: Provide custom transformations for extracted attributes.
- **Default Values**: Define default values for missing data fields.
- **Nested Field Support**: Define and extract nested data structures from
HTML elements.

### Schema Support

Expand Down Expand Up @@ -48,6 +50,14 @@ const schema = z.object({
description: z.string(),
keywords: z.array(z.string()),
views: z.number(),
image: z
.object({
url: z.string(),
width: z.number(),
height: z.number(),
})
.default({ url: '', width: 0, height: 0 })
.optional(),
});
```

Expand Down Expand Up @@ -78,6 +88,25 @@ const fields: FieldDefinitions = {
transform: (value) => parseInt(value, 10),
defaultValue: 0,
},
// Example of a nested field
image: {
fields: {
url: {
selector: 'meta[property="og:image"]',
attribute: 'content',
},
width: {
selector: 'meta[property="og:image:width"]',
attribute: 'content',
transform: (value) => parseInt(value, 10),
},
height: {
selector: 'meta[property="og:image:height"]',
attribute: 'content',
transform: (value) => parseInt(value, 10),
},
},
},
};
```

Expand All @@ -96,6 +125,9 @@ const html = `
<meta name="description" content="An example description.">
<meta name="keywords" content="typescript,html,parsing">
<meta name="views" content="1234">
<meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372">
<meta property="og:image:width" content="1372">
<meta property="og:image:height" content="708">
<title>Example Title</title>
</head>
<body></body>
Expand All @@ -111,6 +143,11 @@ console.log(data);
// description: 'An example description.',
// keywords: ['typescript', 'html', 'parsing'],
// views: 1234,
// image: {
// url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372',
// width: 1372,
// height: 708
// }
// }
```

Expand Down
70 changes: 70 additions & 0 deletions src/createScraper.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,22 @@ const schema = z.object({
views: z.number(),
});

// Schema for the nested-field test: a `title` string with a fallback default,
// plus an `image` object whose `url`, `width` and `height` are validated together.
const schemaWithNested = z.object({
title: z.string().default('No title nested'),
image: z
.object({
url: z.string(),
width: z.number(),
height: z.number(),
})
// NOTE(review): `.optional()` is chained after `.default(...)`; confirm the
// default is still applied for an undefined `image` with the zod version in use.
.default({ url: '', width: 0, height: 0 })
.optional(),
});

type FieldDefinitions = SchemaFieldDefinitions<z.infer<typeof schema>>;
// Field-definition map typed against the type inferred from `schemaWithNested`.
type NestedFieldDefinitions = SchemaFieldDefinitions<
z.infer<typeof schemaWithNested>
>;

const fields: FieldDefinitions = {
title: {
Expand All @@ -36,6 +51,30 @@ const fields: FieldDefinitions = {
},
};

// Field definitions for the nested-field test, one entry per key of
// `schemaWithNested`.
const nestedFields: NestedFieldDefinitions = {
// No `attribute` is given, so the value is taken from the matched element itself.
title: {
selector: 'title',
},
// A nested definition: `fields` groups the sub-fields that make up `image`.
image: {
fields: {
url: {
selector: 'meta[property="og:image"]',
attribute: 'content',
},
width: {
selector: 'meta[property="og:image:width"]',
attribute: 'content',
// The attribute is read as a string; parse it as a base-10 integer.
transform: (value) => parseInt(value, 10),
},
height: {
selector: 'meta[property="og:image:height"]',
attribute: 'content',
// The attribute is read as a string; parse it as a base-10 integer.
transform: (value) => parseInt(value, 10),
},
},
},
};

const html = `
<!DOCTYPE html>
<html>
Expand All @@ -49,6 +88,19 @@ const html = `
</html>
`;

// HTML fixture for the nested-field test. The og:image URL is written with
// `&amp;` entities in the markup; the nested extraction test expects the
// extracted `url` to contain plain `&` characters.
const htmlWithNested = `
<!DOCTYPE html>
<html>
<head>
<title>Example Title</title>
<meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372">
<meta property="og:image:width" content="1372">
<meta property="og:image:height" content="708">
</head>
<body></body>
</html>
`;

describe('xscrape', () => {
test('extracts data from HTML', () => {
const validator = new ZodValidator(schema);
Expand Down Expand Up @@ -114,4 +166,22 @@ describe('xscrape', () => {
expect(error).toBeInstanceOf(Error);
}
});

test('extracts nested data from HTML', () => {
  // Build a scraper for the nested schema and run it over the nested fixture.
  const scrapeNested = createScraper({
    fields: nestedFields,
    validator: new ZodValidator(schemaWithNested),
  });

  // The image sub-fields come from the og:image meta tags; width and height
  // are expected as numbers and the URL with decoded ampersands.
  const expectedImage = {
    url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372',
    width: 1372,
    height: 708,
  };

  expect(scrapeNested(htmlWithNested)).toEqual({
    title: 'Example Title',
    image: expectedImage,
  });
});
});
54 changes: 36 additions & 18 deletions src/createScraper.ts
Original file line number Diff line number Diff line change
@@ -1,43 +1,61 @@
import * as cheerio from 'cheerio';
import { type ScrapeConfig } from '@/types.js';
import { type ScrapeConfig, type SchemaFieldDefinitions } from '@/types.js';

export const createScraper = <T>({
fields,
validator,
}: ScrapeConfig<T>): ((html: string) => T) => {
return (html: string): T => {
const $ = cheerio.load(html);
const data: Partial<Record<keyof T, unknown>> = {};
const extractData = <U>(
fields: SchemaFieldDefinitions<U>,
$context: cheerio.CheerioAPI,
): Partial<U> => {
const data: Partial<U> = {};

for (const key in fields) {
const fieldDef = fields[key];
const elements = $(fieldDef.selector);
for (const key in fields) {
const fieldDef = fields[key];

if ('fields' in fieldDef) {
const nestedData = extractData(
fieldDef.fields as SchemaFieldDefinitions<U[typeof key]>,
$context,
);

data[key as keyof U] = nestedData as U[typeof key];
} else {
const elements = $context(fieldDef.selector);
let values: string[] = [];

elements.each((_, element) => {
const value = fieldDef.attribute
? $(element).attr(fieldDef.attribute)
: $(element).text();
? $context(element).attr(fieldDef.attribute)
: $context(element).text().trim();

if (value !== undefined) {
values.push(value);
}
});

if (values.length === 0 && fieldDef.defaultValue !== undefined) {
data[key] = fieldDef.defaultValue;
data[key as keyof U] = fieldDef.defaultValue as U[typeof key];
} else if (fieldDef.multiple) {
data[key] = values.map((value) =>
data[key as keyof U] = values.map((value) =>
fieldDef.transform ? fieldDef.transform(value) : value,
);
) as U[typeof key];
} else {
const value = values[0];
data[key] =
fieldDef.transform && value ? fieldDef.transform(value) : value;
data[key as keyof U] = (
fieldDef.transform && value ? fieldDef.transform(value) : value
) as U[typeof key];
}
}
}

return data;
};

/**
 * Builds a scraper function from a set of field definitions and a validator.
 *
 * @param config - Object carrying `fields` (the field definitions handed to
 *   `extractData`) and `validator` (whose `validate` method receives the
 *   extracted data).
 * @returns A function that accepts either an HTML string or an already loaded
 *   `cheerio.CheerioAPI`, extracts the configured fields and returns the
 *   result of `validator.validate`.
 */
export const createScraper = <T>({
fields,
validator,
}: ScrapeConfig<T>): ((html: cheerio.CheerioAPI | string) => T) => {
return (html: cheerio.CheerioAPI | string): T => {
// Strings are parsed with cheerio; a pre-loaded CheerioAPI is used as-is.
const $ = typeof html === 'string' ? cheerio.load(html) : html;
const data = extractData(fields, $);
return validator.validate(data);
};
};
18 changes: 12 additions & 6 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,18 @@ export type ScrapeConfig<T> = {
validator: SchemaValidator<T>;
};

export type FieldDefinition<T> = {
selector: string;
attribute?: string;
transform?: (value: string) => T;
defaultValue?: T;
multiple?: boolean;
export type FieldDefinition<T> =
| {
selector: string;
attribute?: string;
transform?: (value: string) => T;
defaultValue?: T;
multiple?: boolean;
}
| NestedFieldDefinition<T>;

type NestedFieldDefinition<T> = {
fields: SchemaFieldDefinitions<T>;
};

export type SchemaFieldDefinitions<T> = {
Expand Down