forked from FlowiseAI/Flowise
Feature/Spider (open-source web scraper & crawler) (FlowiseAI#2738)
* Add Spider Scraper & Crawler
* fix pnpm lint
* chore: Update metadata to be correct format
* fix pnpm lint
1 parent efc6e02 · commit 656f6ca
Showing 4 changed files with 317 additions and 0 deletions.
@@ -0,0 +1,25 @@
import { INodeParams, INodeCredential } from '../src/Interface'

class SpiderApiCredential implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        this.label = 'Spider API'
        this.name = 'spiderApi'
        this.version = 1.0
        this.description = 'Get your API key from the <a target="_blank" href="https://spider.cloud">Spider</a> dashboard.'
        this.inputs = [
            {
                label: 'Spider API Key',
                name: 'spiderApiKey',
                type: 'password'
            }
        ]
    }
}

module.exports = { credClass: SpiderApiCredential }
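
The credential exposes a single password-type input, spiderApiKey, which the Spider node below resolves with getCredentialParam and hands to the HTTP client. As a rough, non-authoritative sketch of what that key authorizes, the request below mirrors the shape SpiderApp sends later in this commit (POST to /crawl with limit: 1); the helper name checkSpiderKey and the hard-coded test URL are illustrative assumptions, not part of the changeset:

// Sketch only: mirrors SpiderApp.scrapeUrl's request shape from this commit.
// `checkSpiderKey` is a hypothetical helper, not part of the changeset.
import axios from 'axios'

async function checkSpiderKey(spiderApiKey: string): Promise<boolean> {
    const response = await axios.post(
        'https://api.spider.cloud/v1/crawl',
        { url: 'https://spider.cloud', limit: 1, return_format: 'markdown' },
        {
            headers: {
                'Content-Type': 'application/json',
                Authorization: `Bearer ${spiderApiKey}`
            }
        }
    )
    // scrapeUrl treats a 200 whose first element has a truthy `status` field as success
    return response.status === 200 && Boolean(response.data?.[0]?.status)
}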
packages/components/nodes/documentloaders/Spider/Spider.ts (175 additions & 0 deletions)
@@ -0,0 +1,175 @@
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import SpiderApp from './SpiderApp'

interface SpiderLoaderParameters {
    url: string
    apiKey?: string
    mode?: 'crawl' | 'scrape'
    params?: Record<string, unknown>
}

class SpiderLoader extends BaseDocumentLoader {
    private apiKey: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    constructor(loaderParams: SpiderLoaderParameters) {
        super()
        const { apiKey, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }

        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
    }

    public async load(): Promise<DocumentInterface[]> {
        const app = new SpiderApp({ apiKey: this.apiKey })
        let spiderDocs: any[]

        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
            }
            spiderDocs = [response.data]
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
            }
            spiderDocs = response.data
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }

        return spiderDocs.map(
            (doc) =>
                new Document({
                    pageContent: doc.content || '',
                    metadata: { source: doc.url }
                })
        )
    }
}

class Spider_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'Spider Document Loaders'
        this.name = 'spiderDocumentLoaders'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'spider.svg'
        this.category = 'Document Loaders'
        this.description = 'Scrape & Crawl the web with Spider'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Mode',
                name: 'mode',
                type: 'options',
                options: [
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a single page'
                    },
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a website and extract pages within the same domain'
                    }
                ],
                default: 'scrape'
            },
            {
                label: 'Web Page URL',
                name: 'url',
                type: 'string',
                placeholder: 'https://spider.cloud'
            },
            {
                label: 'Additional Parameters',
                name: 'params',
                description:
                    'Find all the available parameters in the <a target="_blank" href="https://spider.cloud/docs/api">Spider API documentation</a>',
                additionalParams: true,
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
            }
        ]
        this.credential = {
            label: 'Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['spiderApi']
        }
    }

    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const url = nodeData.inputs?.url as string
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
        let params = nodeData.inputs?.params || {}
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)

        if (typeof params === 'string') {
            try {
                params = JSON.parse(params)
            } catch (e) {
                throw new Error('Invalid JSON string provided for params')
            }
        }

        // Ensure return_format is set to markdown
        params.return_format = 'markdown'

        const input: SpiderLoaderParameters = {
            url,
            mode: mode as 'crawl' | 'scrape',
            apiKey: spiderApiKey,
            params: params as Record<string, unknown>
        }

        const loader = new SpiderLoader(input)

        let docs = []

        if (textSplitter) {
            docs = await loader.loadAndSplit(textSplitter)
        } else {
            docs = await loader.load()
        }

        return docs
    }
}

module.exports = { nodeClass: Spider_DocumentLoaders }
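
Since SpiderLoader is module-private in this file, exercising it outside Flowise takes a small liberty. A minimal sketch, assuming a hypothetical export { SpiderLoader } were added to Spider.ts for testing and that SPIDER_API_KEY is set in the environment:

// Assumes a hypothetical `export { SpiderLoader }` added to Spider.ts for testing.
import { SpiderLoader } from './Spider'

async function demo(): Promise<void> {
    const loader = new SpiderLoader({
        url: 'https://spider.cloud',
        apiKey: process.env.SPIDER_API_KEY, // constructor throws if missing/empty
        mode: 'scrape',
        params: { return_format: 'markdown' } // what the node's init() forces anyway
    })

    const docs = await loader.load()
    // Each Document carries the scraped content plus { source: doc.url } metadata
    console.log(docs[0]?.pageContent.slice(0, 200), docs[0]?.metadata)
}

demo().catch(console.error)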
packages/components/nodes/documentloaders/Spider/SpiderApp.ts (116 additions & 0 deletions)
@@ -0,0 +1,116 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'

interface SpiderAppConfig {
    apiKey?: string | null
    apiUrl?: string | null
}

interface SpiderDocumentMetadata {
    title?: string
    description?: string
    language?: string
    [key: string]: any
}

interface SpiderDocument {
    id?: string
    url?: string
    content: string
    markdown?: string
    html?: string
    createdAt?: Date
    updatedAt?: Date
    type?: string
    metadata: SpiderDocumentMetadata
}

interface ScrapeResponse {
    success: boolean
    data?: SpiderDocument
    error?: string
}

interface CrawlResponse {
    success: boolean
    data?: SpiderDocument[]
    error?: string
}

interface Params {
    [key: string]: any
}

class SpiderApp {
    private apiKey: string
    private apiUrl: string

    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    // Scraping a single page reuses the crawl endpoint with limit: 1
    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        const jsonData: Params = { url, limit: 1, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                const responseData = response.data
                if (responseData[0].status) {
                    return { success: true, data: responseData[0] }
                } else {
                    throw new Error(`Failed to scrape URL. Error: ${responseData.error}`)
                }
            } else {
                this.handleError(response, 'scrape URL')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }

        try {
            const response: AxiosResponse = await this.postRequest('crawl', jsonData, headers)
            if (response.status === 200) {
                return { success: true, data: response.data }
            } else {
                this.handleError(response, 'start crawl job')
            }
        } catch (error: any) {
            throw new Error(error.message)
        }
        return { success: false, error: 'Internal server error.' }
    }

    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
    }

    // Known Spider error statuses carry a JSON error body; anything else is reported as unexpected
    private handleError(response: AxiosResponse, action: string): void {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}

export default SpiderApp
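
The client above can also be driven directly. A minimal usage sketch, where the SPIDER_API_KEY environment variable, the crawl limit of 5, and the idempotency key value are assumptions rather than anything this commit prescribes:

import SpiderApp from './SpiderApp'

async function run(): Promise<void> {
    // Constructor throws 'No API key provided' if the key resolves to an empty string
    const app = new SpiderApp({ apiKey: process.env.SPIDER_API_KEY })

    // Single page: scrapeUrl posts to /crawl with limit: 1 under the hood
    const page = await app.scrapeUrl('https://spider.cloud', { return_format: 'markdown' })
    if (page.success) console.log(page.data?.content.slice(0, 200))

    // Site crawl: the optional third argument becomes an x-idempotency-key header
    const site = await app.crawlUrl('https://spider.cloud', { limit: 5, return_format: 'markdown' }, 'demo-key-1')
    if (site.success) console.log(`crawled ${site.data?.length} pages`)
}

run().catch(console.error)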