diff --git a/web/app/components/datasets/create/index.tsx b/web/app/components/datasets/create/index.tsx index 4a25b6699e5e1c..7bfc3895210d93 100644 --- a/web/app/components/datasets/create/index.tsx +++ b/web/app/components/datasets/create/index.tsx @@ -8,7 +8,7 @@ import StepOne from './step-one' import StepTwo from './step-two' import StepThree from './step-three' import { DataSourceType } from '@/models/datasets' -import type { CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets' +import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets' import { fetchDataSource } from '@/service/common' import { fetchDatasetDetail } from '@/service/datasets' import type { NotionPage } from '@/models/common' @@ -19,6 +19,15 @@ type DatasetUpdateFormProps = { datasetId?: string } +const DEFAULT_CRAWL_OPTIONS: CrawlOptions = { + crawl_sub_pages: true, + only_main_content: true, + includes: '', + excludes: '', + limit: 10, + max_depth: 2, +} + const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { const { t } = useTranslation() const { setShowAccountSettingModal } = useModalContext() @@ -37,6 +46,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { } const [websitePages, setWebsitePages] = useState([]) + const [crawlOptions, setCrawlOptions] = useState(DEFAULT_CRAWL_OPTIONS) + const updateFileList = (preparedFiles: FileItem[]) => { setFiles(preparedFiles) } @@ -127,6 +138,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { websitePages={websitePages} updateWebsitePages={setWebsitePages} onFireCrawlJobIdChange={setFireCrawlJobId} + crawlOptions={crawlOptions} + onCrawlOptionsChange={setCrawlOptions} /> {(step === 2 && (!datasetId || (datasetId && !!detail))) && { onStepChange={changeStep} updateIndexingTypeCache={updateIndexingTypeCache} updateResultCache={updateResultCache} + crawlOptions={crawlOptions} />} {step === 3 && void onFireCrawlJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void } type NotionConnectorProps = { @@ -68,6 +70,8 @@ const StepOne = ({ websitePages = [], updateWebsitePages, onFireCrawlJobIdChange, + crawlOptions, + onCrawlOptionsChange, }: IStepOneProps) => { const { dataset } = useDatasetDetailContext() const [showModal, setShowModal] = useState(false) @@ -221,6 +225,8 @@ const StepOne = ({ checkedCrawlResult={websitePages} onCheckedCrawlResultChange={updateWebsitePages} onJobIdChange={onFireCrawlJobIdChange} + crawlOptions={crawlOptions} + onCrawlOptionsChange={onCrawlOptionsChange} /> {isShowVectorSpaceFull && ( diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 0ffc9da2a6be09..b62ed879ad522c 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -12,7 +12,7 @@ import RetrievalMethodInfo from '../../common/retrieval-method-info' import PreviewItem, { PreviewType } from './preview-item' import LanguageSelect from './language-select' import s from './index.module.css' -import type { CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' +import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' import { createDocument, createFirstDocument, @@ -58,6 +58,7 @@ type StepTwoProps = { files: CustomFile[] notionPages?: NotionPage[] websitePages?: CrawlResultItem[] + crawlOptions?: CrawlOptions fireCrawlJobId?: string onStepChange?: (delta: number) => void updateIndexingTypeCache?: (type: string) => void @@ -86,6 +87,7 @@ const StepTwo = ({ files, notionPages = [], websitePages = [], + crawlOptions, fireCrawlJobId = '', onStepChange, updateIndexingTypeCache, @@ -252,6 +254,7 @@ const StepTwo = ({ provider: 'firecrawl', job_id: fireCrawlJobId, urls: websitePages.map(page => page.source_url), + only_main_content: crawlOptions?.only_main_content, } } diff --git a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx b/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx index bf451e2eaa8a71..0a10ac52bac215 100644 --- a/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx +++ b/web/app/components/datasets/create/website/firecrawl/crawled-result.tsx @@ -61,7 +61,7 @@ const CrawledResult: FC = ({ />
{t(`${I18N_PREFIX}.scrapTimeInfo`, { total: list.length, - time: '12.4 seconds', + time: '12.4 seconds', // TODO toFixed(1) })}
diff --git a/web/app/components/datasets/create/website/firecrawl/index.tsx b/web/app/components/datasets/create/website/firecrawl/index.tsx index e87137a2916d38..eb985fe9957b82 100644 --- a/web/app/components/datasets/create/website/firecrawl/index.tsx +++ b/web/app/components/datasets/create/website/firecrawl/index.tsx @@ -23,15 +23,8 @@ type Props = { checkedCrawlResult: CrawlResultItem[] onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void onJobIdChange: (jobId: string) => void -} - -const DEFAULT_CRAWL_OPTIONS: CrawlOptions = { - crawl_sub_pages: true, - only_main_content: true, - includes: '', - excludes: '', - limit: 10, - max_depth: 2, + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void } enum Step { @@ -45,6 +38,8 @@ const FireCrawl: FC = ({ checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, + crawlOptions, + onCrawlOptionsChange, }) => { const { t } = useTranslation() const [step, setStep] = useState(Step.init) @@ -55,7 +50,6 @@ const FireCrawl: FC = ({ }) }, [setShowAccountSettingModal]) - const [crawlOptions, setCrawlOptions] = useState(DEFAULT_CRAWL_OPTIONS) const checkValid = useCallback((url: string) => { let errorMsg = '' if (!url) { @@ -172,7 +166,7 @@ const FireCrawl: FC = ({ isFilledFull={!isInit} hasError={isCrawlFinished && crawlHasError} > - {isInit && } + {isInit && } {isRunning && void onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void } const Website: FC = ({ @@ -20,6 +22,8 @@ const Website: FC = ({ checkedCrawlResult, onCheckedCrawlResultChange, onJobIdChange, + crawlOptions, + onCrawlOptionsChange, }) => { const { setShowAccountSettingModal } = useModalContext() const [isLoaded, setIsLoaded] = useState(false) @@ -55,6 +59,8 @@ const Website: FC = ({ checkedCrawlResult={checkedCrawlResult} onCheckedCrawlResultChange={onCheckedCrawlResultChange} onJobIdChange={onJobIdChange} + crawlOptions={crawlOptions} + onCrawlOptionsChange={onCrawlOptionsChange} /> ) : ( diff --git a/web/models/datasets.ts b/web/models/datasets.ts index 101ce2d7a3799f..d417ec4943b807 100644 --- a/web/models/datasets.ts +++ b/web/models/datasets.ts @@ -5,7 +5,7 @@ import type { Tag } from '@/app/components/base/tag-management/constant' export enum DataSourceType { FILE = 'upload_file', NOTION = 'notion_import', - WEB = 'website', + WEB = 'website_crawl', } export type DataSet = {