feat(website-crawl): add Jina Reader as an alternative provider for website crawling #8761

Merged · 5 commits · Sep 30, 2024
6 changes: 4 additions & 2 deletions api/controllers/console/datasets/website.py
@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
     @account_initialization_required
     def post(self):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+        )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
         args = parser.parse_args()
@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
+        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
         args = parser.parse_args()
         # get crawl status
         try:
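With `jinareader` added to the `choices` list, the same console endpoint now fronts both crawl providers. A minimal sketch of a request against it (host, route, and token are placeholders for illustration; only the body schema comes from the parser above):

```python
import requests

# Hypothetical console-API call; URL and token are placeholders.
payload = {
    "provider": "jinareader",  # newly accepted alongside "firecrawl"
    "url": "https://example.com",
    "options": {"crawl_sub_pages": True, "limit": 10, "use_sitemap": True},
}
resp = requests.post(
    "http://localhost:5001/console/api/website/crawl",
    json=payload,
    headers={"Authorization": "Bearer <console-token>"},
)
print(resp.json())  # e.g. {"status": "active", "job_id": "<taskId>"}
```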
10 changes: 10 additions & 0 deletions api/core/rag/extractor/extract_processor.py
@@ -12,6 +12,7 @@
 from core.rag.extractor.excel_extractor import ExcelExtractor
 from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
 from core.rag.extractor.html_extractor import HtmlExtractor
+from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
 from core.rag.extractor.markdown_extractor import MarkdownExtractor
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor
@@ -171,6 +172,15 @@ def extract(
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "jinareader":
+                extractor = JinaReaderWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             else:
                 raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
         else:
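The new branch mirrors the firecrawl dispatch field for field. A hedged sketch of driving it end to end (the `ExtractSetting` import path, constructor shape, and datasource value are assumptions; the `website_info` attribute names are exactly those read above):

```python
from core.rag.extractor.extract_processor import ExtractProcessor
from core.rag.extractor.entities.extract_setting import ExtractSetting  # assumed path

# Assumed constructor; extract() only relies on the website_info fields shown above.
setting = ExtractSetting(
    datasource_type="website_crawl",  # assumed discriminator value
    website_info={
        "provider": "jinareader",
        "job_id": "<taskId>",
        "url": "https://example.com/docs",
        "tenant_id": "<tenant-id>",
        "mode": "crawl",
        "only_main_content": True,
    },
)
documents = ExtractProcessor.extract(extract_setting=setting)
```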
35 changes: 35 additions & 0 deletions api/core/rag/extractor/jina_reader_extractor.py
@@ -0,0 +1,35 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class JinaReaderWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean, LLM-ready markdown.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """Initialize with url, job_id, tenant_id and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("content", ""),
                metadata={
                    "source_url": crawl_data.get("url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        return documents
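A minimal usage sketch of the extractor (ids are placeholders; note that `only_main_content` is stored but the content filtering itself happens upstream during the crawl):

```python
extractor = JinaReaderWebExtractor(
    url="https://example.com/docs",
    job_id="<taskId>",       # task id returned when the crawl was started
    tenant_id="<tenant-id>",
    mode="crawl",
    only_main_content=True,
)
docs = extractor.extract()  # [] when the URL is not part of the finished crawl
for doc in docs:
    print(doc.metadata["title"], doc.metadata["source_url"])
```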
3 changes: 3 additions & 0 deletions api/services/auth/api_key_auth_factory.py
@@ -1,10 +1,13 @@
 from services.auth.firecrawl import FirecrawlAuth
+from services.auth.jina import JinaAuth


 class ApiKeyAuthFactory:
     def __init__(self, provider: str, credentials: dict):
         if provider == "firecrawl":
             self.auth = FirecrawlAuth(credentials)
+        elif provider == "jinareader":
+            self.auth = JinaAuth(credentials)
         else:
             raise ValueError("Invalid provider")
44 changes: 44 additions & 0 deletions api/services/auth/jina.py
@@ -0,0 +1,44 @@
import json

import requests

from services.auth.api_key_auth_base import ApiKeyAuthBase


class JinaAuth(ApiKeyAuthBase):
    def __init__(self, credentials: dict):
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "bearer":
            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
        self.api_key = credentials.get("config").get("api_key", None)

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
        }
        response = self._post_request("https://r.jina.ai", options, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers):
        return requests.post(url, headers=headers, json=data)

    def _handle_error(self, response):
        if response.status_code in {402, 409, 500}:
            error_message = response.json().get("error", "Unknown error occurred")
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        else:
            if response.text:
                error_message = json.loads(response.text).get("error", "Unknown error occurred")
                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
100 changes: 100 additions & 0 deletions api/services/website_service.py
@@ -1,6 +1,7 @@
 import datetime
 import json

+import requests
 from flask_login import current_user

 from core.helper import encrypter
@@ -65,6 +66,35 @@ def crawl_url(cls, args: dict) -> dict:
             time = str(datetime.datetime.now().timestamp())
             redis_client.setex(website_crawl_time_cache_key, 3600, time)
             return {"status": "active", "job_id": job_id}
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_sub_pages = options.get("crawl_sub_pages", False)
+            if not crawl_sub_pages:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "data": response.json().get("data")}
+            else:
+                response = requests.post(
+                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
+                    json={
+                        "url": url,
+                        "maxPages": options.get("limit", 1),
+                        "useSitemap": options.get("use_sitemap", True),
+                    },
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {api_key}",
+                    },
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
         else:
             raise ValueError("Invalid provider")

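Depending on `crawl_sub_pages`, the branch either fetches a single page synchronously through r.jina.ai or starts an asynchronous multi-page task whose `taskId` becomes the `job_id` used everywhere else. A hedged invocation sketch (tenant credentials are resolved earlier in the method, outside this hunk):

```python
# Hypothetical call; the options keys mirror those read above.
result = WebsiteService.crawl_url({
    "provider": "jinareader",
    "url": "https://example.com",
    "options": {"crawl_sub_pages": True, "limit": 5, "use_sitemap": True},
})
print(result)  # {"status": "active", "job_id": "<taskId>"}
```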
@@ -93,6 +123,42 @@ def get_crawl_status(cls, job_id: str, provider: str) -> dict:
             time_consuming = abs(end_time - float(start_time))
             crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
             redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            response = requests.post(
+                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                json={"taskId": job_id},
+            )
+            data = response.json().get("data", {})
+            crawl_status_data = {
+                "status": data.get("status", "active"),
+                "job_id": job_id,
+                "total": len(data.get("urls", [])),
+                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
+                "data": [],
+                "time_consuming": data.get("duration", 0) / 1000,
+            }
+
+            if crawl_status_data["status"] == "completed":
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                formatted_data = [
+                    {
+                        "title": item.get("data", {}).get("title"),
+                        "source_url": item.get("data", {}).get("url"),
+                        "description": item.get("data", {}).get("description"),
+                        "markdown": item.get("data", {}).get("content"),
+                    }
+                    for item in data.get("processed", {}).values()
+                ]
+                crawl_status_data["data"] = formatted_data
         else:
             raise ValueError("Invalid provider")
         return crawl_status_data
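Polling is then provider-agnostic for callers; for Jina Reader the totals come from the `urls`/`processed`/`failed` lists, and a second status call hydrates the page payloads once the task completes. A sketch (taskId is a placeholder):

```python
status = WebsiteService.get_crawl_status("<taskId>", "jinareader")
# while running:  {"status": "active", "total": 12, "current": 4, "data": [], ...}
# when finished:  "data" holds dicts with title / source_url / description / markdown
```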
@@ -119,6 +185,40 @@ def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str
                 if item.get("source_url") == url:
                     return item
             return None
+        elif provider == "jinareader":
+            # Decrypt the API key up front; both the single-page fetch and the
+            # status calls below need it.
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            file_key = "website_files/" + job_id + ".txt"
+            if storage.exists(file_key):
+                data = storage.load_once(file_key)
+                if data:
+                    data = json.loads(data.decode("utf-8"))
+            elif not job_id:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return response.json().get("data")
+            else:
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id},
+                )
+                data = response.json().get("data", {})
+                if data.get("status") != "completed":
+                    raise ValueError("Crawl job is not completed")
+
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                for item in data.get("processed", {}).values():
+                    if item.get("data", {}).get("url") == url:
+                        return item.get("data", {})
         else:
             raise ValueError("Invalid provider")

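This is the lookup `JinaReaderWebExtractor` relies on: given a finished task, it returns the per-page dict (`url`, `title`, `description`, `content`) for one URL, or None when the URL is absent. A sketch (ids are placeholders):

```python
page = WebsiteService.get_crawl_url_data(
    "<taskId>", "jinareader", "https://example.com/docs", "<tenant-id>"
)
if page:  # None if the URL was not part of the crawl
    print(page["title"], page["content"][:80])
```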
12 changes: 8 additions & 4 deletions web/app/components/datasets/create/index.tsx
@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
 import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
 import { fetchDataSource } from '@/service/common'
 import { fetchDatasetDetail } from '@/service/datasets'
-import type { NotionPage } from '@/models/common'
+import { DataSourceProvider, type NotionPage } from '@/models/common'
 import { useModalContext } from '@/context/modal-context'
 import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'

@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
   excludes: '',
   limit: 10,
   max_depth: '',
+  use_sitemap: true,
 }

const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   const updateFileList = (preparedFiles: FileItem[]) => {
     setFiles(preparedFiles)
   }
-  const [fireCrawlJobId, setFireCrawlJobId] = useState('')
+  const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
+  const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')

const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
             onStepChange={nextStep}
             websitePages={websitePages}
             updateWebsitePages={setWebsitePages}
-            onFireCrawlJobIdChange={setFireCrawlJobId}
+            onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
+            onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
             crawlOptions={crawlOptions}
             onCrawlOptionsChange={setCrawlOptions}
           />
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
             files={fileList.map(file => file.file)}
             notionPages={notionPages}
             websitePages={websitePages}
-            fireCrawlJobId={fireCrawlJobId}
+            websiteCrawlProvider={websiteCrawlProvider}
+            websiteCrawlJobId={websiteCrawlJobId}
             onStepChange={changeStep}
             updateIndexingTypeCache={updateIndexingTypeCache}
             updateResultCache={updateResultCache}
11 changes: 7 additions & 4 deletions web/app/components/datasets/create/step-one/index.tsx
@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
 import s from './index.module.css'
 import cn from '@/utils/classnames'
 import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
-import type { NotionPage } from '@/models/common'
+import type { DataSourceProvider, NotionPage } from '@/models/common'
 import { DataSourceType } from '@/models/datasets'
 import Button from '@/app/components/base/button'
 import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
@@ -33,7 +33,8 @@ type IStepOneProps = {
   changeType: (type: DataSourceType) => void
   websitePages?: CrawlResultItem[]
   updateWebsitePages: (value: CrawlResultItem[]) => void
-  onFireCrawlJobIdChange: (jobId: string) => void
+  onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
+  onWebsiteCrawlJobIdChange: (jobId: string) => void
   crawlOptions: CrawlOptions
   onCrawlOptionsChange: (payload: CrawlOptions) => void
 }
@@ -69,7 +70,8 @@ const StepOne = ({
   updateNotionPages,
   websitePages = [],
   updateWebsitePages,
-  onFireCrawlJobIdChange,
+  onWebsiteCrawlProviderChange,
+  onWebsiteCrawlJobIdChange,
   crawlOptions,
   onCrawlOptionsChange,
 }: IStepOneProps) => {
@@ -229,7 +231,8 @@
             onPreview={setCurrentWebsite}
             checkedCrawlResult={websitePages}
             onCheckedCrawlResultChange={updateWebsitePages}
-            onJobIdChange={onFireCrawlJobIdChange}
+            onCrawlProviderChange={onWebsiteCrawlProviderChange}
+            onJobIdChange={onWebsiteCrawlJobIdChange}
             crawlOptions={crawlOptions}
             onCrawlOptionsChange={onCrawlOptionsChange}
           />
11 changes: 7 additions & 4 deletions web/app/components/datasets/create/step-two/index.tsx
@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
 import Toast from '@/app/components/base/toast'
 import { formatNumber } from '@/utils/format'
 import type { NotionPage } from '@/models/common'
+import { DataSourceProvider } from '@/models/common'
 import { DataSourceType, DocForm } from '@/models/datasets'
 import NotionIcon from '@/app/components/base/notion-icon'
 import Switch from '@/app/components/base/switch'
@@ -63,7 +64,8 @@ type StepTwoProps = {
   notionPages?: NotionPage[]
   websitePages?: CrawlResultItem[]
   crawlOptions?: CrawlOptions
-  fireCrawlJobId?: string
+  websiteCrawlProvider?: DataSourceProvider
+  websiteCrawlJobId?: string
   onStepChange?: (delta: number) => void
   updateIndexingTypeCache?: (type: string) => void
   updateResultCache?: (res: createDocumentResponse) => void
@@ -94,7 +96,8 @@ const StepTwo = ({
   notionPages = [],
   websitePages = [],
   crawlOptions,
-  fireCrawlJobId = '',
+  websiteCrawlProvider = DataSourceProvider.fireCrawl,
+  websiteCrawlJobId = '',
   onStepChange,
   updateIndexingTypeCache,
   updateResultCache,
@@ -260,8 +263,8 @@

   const getWebsiteInfo = () => {
     return {
-      provider: 'firecrawl',
-      job_id: fireCrawlJobId,
+      provider: websiteCrawlProvider,
+      job_id: websiteCrawlJobId,
       urls: websitePages.map(page => page.source_url),
       only_main_content: crawlOptions?.only_main_content,
     }