Skip to content

Commit

Permalink
Feature/Spider (open-source web scraper & crawler) (FlowiseAI#2738)
Browse files Browse the repository at this point in the history
* Add Spider Scraper & Crawler

* fix pnpm lint

* chore: Update metadata to be correct format

* fix pnpm lint
  • Loading branch information
WilliamEspegren authored Jul 1, 2024
1 parent efc6e02 commit 656f6ca
Show file tree
Hide file tree
Showing 4 changed files with 317 additions and 0 deletions.
25 changes: 25 additions & 0 deletions packages/components/credentials/SpiderApi.credential.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { INodeParams, INodeCredential } from '../src/Interface'

/**
 * Credential definition for the Spider API.
 *
 * Declares a single password-type input (`spiderApiKey`) together with the
 * label, internal name, version and help text of the credential.
 */
class SpiderApiCredential implements INodeCredential {
    // Field initializers are listed in the order the properties must appear on the instance.
    label: string = 'Spider API'
    name: string = 'spiderApi'
    version: number = 1.0
    description: string = 'Get your API key from the <a target="_blank" href="https://spider.cloud">Spider</a> dashboard.'
    inputs: INodeParams[] = [
        {
            label: 'Spider API Key',
            name: 'spiderApiKey',
            type: 'password'
        }
    ]
}

// CommonJS export: exposes the credential class under the `credClass` key.
module.exports = { credClass: SpiderApiCredential }
175 changes: 175 additions & 0 deletions packages/components/nodes/documentloaders/Spider/Spider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import { TextSplitter } from 'langchain/text_splitter'
import { Document, DocumentInterface } from '@langchain/core/documents'
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
import { INode, INodeData, INodeParams, ICommonObject } from '../../../src/Interface'
import { getCredentialData, getCredentialParam } from '../../../src/utils'
import SpiderApp from './SpiderApp'

/**
 * Options accepted by the SpiderLoader constructor.
 */
interface SpiderLoaderParameters {
// Target URL handed to the Spider scrape/crawl request.
url: string
// Spider API key; typed as optional, but the SpiderLoader constructor throws when it is missing.
apiKey?: string
// Loading strategy; the SpiderLoader constructor defaults it to 'crawl' when omitted.
mode?: 'crawl' | 'scrape'
// Extra request parameters forwarded unchanged to SpiderApp.scrapeUrl / SpiderApp.crawlUrl.
params?: Record<string, unknown>
}

/**
 * Document loader that fetches page content through the Spider API and
 * converts every returned page into a LangChain `Document`.
 *
 * - `scrape` mode requests a single page via `SpiderApp.scrapeUrl`.
 * - `crawl` mode requests a list of pages via `SpiderApp.crawlUrl`.
 */
class SpiderLoader extends BaseDocumentLoader {
    private apiKey: string
    private url: string
    private mode: 'crawl' | 'scrape'
    private params?: Record<string, unknown>

    /**
     * @param loaderParams URL, API key, mode (defaults to `'crawl'`) and optional request parameters.
     * @throws Error when no API key is supplied.
     */
    constructor(loaderParams: SpiderLoaderParameters) {
        super()
        const { apiKey, url, mode = 'crawl', params } = loaderParams
        if (!apiKey) {
            // NOTE(review): the message mentions SPIDER_API_KEY, but this class never reads the
            // environment itself; the key has to be passed in through `loaderParams`.
            throw new Error('Spider API key not set. You can set it as SPIDER_API_KEY in your .env file, or pass it to Spider.')
        }

        this.apiKey = apiKey
        this.url = url
        this.mode = mode
        this.params = params
    }

    /**
     * Calls the Spider API according to the configured mode and maps the result to documents.
     *
     * @returns One `Document` per returned page; `pageContent` is the page's `content`
     *          (empty string when absent) and `metadata.source` is the page's `url`.
     * @throws Error when the API reports failure, when a crawl does not return a list,
     *         or when the mode is neither `'crawl'` nor `'scrape'`.
     */
    public async load(): Promise<DocumentInterface[]> {
        const app = new SpiderApp({ apiKey: this.apiKey })
        let spiderDocs: Array<{ content?: string; url?: string } | null | undefined>

        if (this.mode === 'scrape') {
            const response = await app.scrapeUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to scrape URL. Error: ${response.error}`)
            }
            spiderDocs = [response.data]
        } else if (this.mode === 'crawl') {
            const response = await app.crawlUrl(this.url, this.params)
            if (!response.success) {
                throw new Error(`Spider: Failed to crawl URL. Error: ${response.error}`)
            }
            // A successful HTTP call can still carry a non-list body; fail with a clear
            // message instead of a TypeError from calling `.map` on it.
            if (!Array.isArray(response.data)) {
                throw new Error('Spider: Failed to crawl URL. Error: unexpected response format, expected a list of pages')
            }
            spiderDocs = response.data
        } else {
            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
        }

        // Skip missing entries so a single empty item cannot abort the whole load.
        return spiderDocs
            .filter((doc): doc is { content?: string; url?: string } => doc != null)
            .map(
                (doc) =>
                    new Document({
                        pageContent: doc.content || '',
                        metadata: { source: doc.url }
                    })
            )
    }
}

/**
 * Node definition for the Spider document loader.
 *
 * The constructor declares the node's metadata, inputs (text splitter, mode, URL,
 * additional JSON parameters) and the credential it requires; `init` builds a
 * `SpiderLoader` from the node's inputs and returns the loaded documents.
 */
class Spider_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams

    constructor() {
        this.label = 'Spider Document Loaders'
        this.name = 'spiderDocumentLoaders'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'spider.svg'
        this.category = 'Document Loaders'
        this.description = 'Scrape & Crawl the web with Spider'
        this.baseClasses = [this.type]
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                optional: true
            },
            {
                label: 'Mode',
                name: 'mode',
                type: 'options',
                options: [
                    {
                        label: 'Scrape',
                        name: 'scrape',
                        description: 'Scrape a single page'
                    },
                    {
                        label: 'Crawl',
                        name: 'crawl',
                        description: 'Crawl a website and extract pages within the same domain'
                    }
                ],
                default: 'scrape'
            },
            {
                label: 'Web Page URL',
                name: 'url',
                type: 'string',
                placeholder: 'https://spider.cloud'
            },
            {
                label: 'Additional Parameters',
                name: 'params',
                // The anchor attribute is `target="_blank"` so the documentation opens in a new tab.
                description:
                    'Find all the available parameters in the <a target="_blank" href="https://spider.cloud/docs/api">Spider API documentation</a>',
                additionalParams: true,
                placeholder: '{ "anti_bot": true }',
                type: 'json',
                optional: true
            }
        ]
        this.credential = {
            label: 'Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['spiderApi']
        }
    }

    /**
     * Loads documents for the configured URL.
     *
     * @param nodeData Node inputs (`textSplitter`, `url`, `mode`, `params`) and the credential reference.
     * @param _ Unused.
     * @param options Passed through to `getCredentialData`.
     * @returns The loaded documents, split with the text splitter when one is supplied.
     * @throws Error when the URL is empty, or when `params` is not valid JSON or not a JSON object.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const url = nodeData.inputs?.url as string
        const mode = nodeData.inputs?.mode as 'crawl' | 'scrape'
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const spiderApiKey = getCredentialParam('spiderApiKey', credentialData, nodeData)

        if (typeof url !== 'string' || !url.trim()) {
            throw new Error('Spider: Web Page URL is required')
        }

        // Ensure return_format is set to markdown. The parameters are copied first so the
        // caller's `nodeData.inputs.params` object is never mutated.
        const params: Record<string, unknown> = {
            ...this.parseParams(nodeData.inputs?.params),
            return_format: 'markdown'
        }

        const input: SpiderLoaderParameters = {
            url,
            mode,
            apiKey: spiderApiKey,
            params
        }

        const loader = new SpiderLoader(input)

        return textSplitter ? await loader.loadAndSplit(textSplitter) : await loader.load()
    }

    /** Normalises the raw `params` input (object, JSON string or empty) into a plain object. */
    private parseParams(raw: unknown): Record<string, unknown> {
        if (!raw) {
            return {}
        }

        let parsed: unknown = raw
        if (typeof raw === 'string') {
            try {
                parsed = JSON.parse(raw)
            } catch (e) {
                throw new Error('Invalid JSON string provided for params')
            }
        }

        // Values such as `null`, numbers or arrays cannot carry named request parameters.
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error('Invalid params provided: expected a JSON object such as { "anti_bot": true }')
        }

        return parsed as Record<string, unknown>
    }
}

// CommonJS export: exposes the node class under the `nodeClass` key.
module.exports = { nodeClass: Spider_DocumentLoaders }
116 changes: 116 additions & 0 deletions packages/components/nodes/documentloaders/Spider/SpiderApp.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from 'axios'

/**
 * Constructor options for SpiderApp.
 */
interface SpiderAppConfig {
// API key sent as a Bearer token; the SpiderApp constructor throws when it is empty.
apiKey?: string | null
// Base URL of the API; SpiderApp falls back to 'https://api.spider.cloud/v1' when empty.
apiUrl?: string | null
}

/**
 * Metadata attached to a SpiderDocument: a few named optional fields plus
 * arbitrary additional keys.
 */
interface SpiderDocumentMetadata {
title?: string
description?: string
language?: string
// Open index signature: any other key is accepted with an untyped value.
[key: string]: any
}

/**
 * Shape of a single page as used by ScrapeResponse and CrawlResponse.
 * NOTE(review): presumably mirrors an item of the Spider API response — confirm against the API docs.
 */
interface SpiderDocument {
id?: string
// Used as the `source` metadata of the resulting document in SpiderLoader.load.
url?: string
// Used as the page content of the resulting document in SpiderLoader.load.
content: string
// The remaining fields are not read by the code shown in this file.
markdown?: string
html?: string
createdAt?: Date
updatedAt?: Date
type?: string
metadata: SpiderDocumentMetadata
}

/**
 * Result of SpiderApp.scrapeUrl: a success flag with either a single page or an error message.
 */
interface ScrapeResponse {
success: boolean
// The scraped page; set by scrapeUrl on success.
data?: SpiderDocument
// Failure description; set when `success` is false.
error?: string
}

/**
 * Result of SpiderApp.crawlUrl: a success flag with either a list of pages or an error message.
 */
interface CrawlResponse {
success: boolean
// The crawled pages; crawlUrl stores the raw response body here on success.
data?: SpiderDocument[]
// Failure description; set when `success` is false.
error?: string
}

/**
 * Free-form request parameters; SpiderApp spreads them into the JSON body of the request.
 */
interface Params {
[key: string]: any
}

/**
 * Minimal HTTP client for the Spider API.
 *
 * Both `scrapeUrl` and `crawlUrl` POST to the `crawl` endpoint; scraping adds
 * `limit: 1` to the request body and returns only the first page of the response.
 */
class SpiderApp {
    private apiKey: string
    private apiUrl: string

    /**
     * @param config API key (required) and optional base URL (defaults to `https://api.spider.cloud/v1`).
     * @throws Error when no API key is provided.
     */
    constructor({ apiKey = null, apiUrl = null }: SpiderAppConfig = {}) {
        this.apiKey = apiKey || ''
        this.apiUrl = apiUrl || 'https://api.spider.cloud/v1'
        if (!this.apiKey) {
            throw new Error('No API key provided')
        }
    }

    /**
     * Fetches a single page.
     *
     * @param url Page to scrape.
     * @param params Extra request parameters; spread after `limit: 1`, so they may override it.
     * @returns `{ success: true, data }` where `data` is the first item of the response.
     * @throws Error when the request fails, or when the response has no first item with a truthy `status`.
     */
    async scrapeUrl(url: string, params: Params | null = null): Promise<ScrapeResponse> {
        const headers = this.prepareHeaders()
        const jsonData: Params = { url, limit: 1, ...params }

        const response = await this.requestOrThrow('crawl', jsonData, headers, 'scrape URL')
        const responseData = response.data
        // An empty or non-list body must produce a readable error, not a TypeError.
        const firstPage = Array.isArray(responseData) ? responseData[0] : undefined
        if (!firstPage?.status) {
            const reason = firstPage?.error ?? responseData?.error ?? 'Unknown error occurred'
            throw new Error(`Failed to scrape URL. Error: ${reason}`)
        }
        return { success: true, data: firstPage }
    }

    /**
     * Crawls a site starting at `url`.
     *
     * @param url Start URL.
     * @param params Extra request parameters merged into the request body.
     * @param idempotencyKey Optional value sent as the `x-idempotency-key` header.
     * @returns `{ success: true, data }` where `data` is the raw response body.
     * @throws Error when the request fails or the response status is not 200.
     */
    async crawlUrl(url: string, params: Params | null = null, idempotencyKey?: string): Promise<CrawlResponse | any> {
        const headers = this.prepareHeaders(idempotencyKey)
        const jsonData: Params = { url, ...params }

        const response = await this.requestOrThrow('crawl', jsonData, headers, 'start crawl job')
        return { success: true, data: response.data }
    }

    /** Builds the JSON/Bearer headers, adding `x-idempotency-key` only when a key is given. */
    private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
        return {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${this.apiKey}`,
            ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
    }

    /** Sends a POST request to `${apiUrl}/${url}`. */
    private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
        return axios.post(`${this.apiUrl}/${url}`, data, { headers })
    }

    /**
     * Sends the request and returns the response only when its status is 200.
     * Every failure path ends in a plain `Error`.
     */
    private async requestOrThrow(endpoint: string, data: Params, headers: AxiosRequestHeaders, action: string): Promise<AxiosResponse> {
        let response: AxiosResponse
        try {
            response = await this.postRequest(endpoint, data, headers)
        } catch (error: unknown) {
            // axios rejects on non-2xx statuses by default; the rejected error carries the
            // server response, which holds the API's own error message.
            const failedResponse = (error as { response?: AxiosResponse } | null | undefined)?.response
            if (failedResponse && typeof failedResponse.status === 'number') {
                this.handleError(failedResponse, action)
            }
            // Re-wrap instead of rethrowing: only the message is propagated, so the request
            // configuration (including the Authorization header) does not travel with the error.
            throw new Error(error instanceof Error ? error.message : String(error))
        }

        if (response.status !== 200) {
            this.handleError(response, action)
        }
        return response
    }

    /**
     * Converts an unsuccessful response into an `Error`; never returns.
     * For 402, 408, 409 and 500 the response body's `error` field is included in the message.
     */
    private handleError(response: AxiosResponse, action: string): never {
        if ([402, 408, 409, 500].includes(response.status)) {
            const errorMessage: string = response.data?.error || 'Unknown error occurred'
            throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`)
        } else {
            throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`)
        }
    }
}

// Default export of this module: the SpiderApp client class.
export default SpiderApp
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 656f6ca

Please sign in to comment.