Skip to content

Commit

Permalink
chore: add only_main_content to api
Browse files Browse the repository at this point in the history
  • Loading branch information
iamjoel committed Jun 7, 2024
1 parent 6dd4aec commit 537f7ec
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 17 deletions.
16 changes: 15 additions & 1 deletion web/app/components/datasets/create/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import StepOne from './step-one'
import StepTwo from './step-two'
import StepThree from './step-three'
import { DataSourceType } from '@/models/datasets'
import type { CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
import { fetchDataSource } from '@/service/common'
import { fetchDatasetDetail } from '@/service/datasets'
import type { NotionPage } from '@/models/common'
Expand All @@ -19,6 +19,15 @@ type DatasetUpdateFormProps = {
datasetId?: string
}

// Initial website-crawl settings used to seed the `crawlOptions` state of the
// dataset creation form. That state is handed to StepOne together with a change
// handler, and to StepTwo as a read-only value.
// NOTE(review): the per-field meanings are not visible here; the names suggest
// crawl scope and filtering settings — confirm against the crawl options UI/API.
const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
crawl_sub_pages: true,
// StepTwo forwards this flag in the website crawl info it submits.
only_main_content: true,
includes: '',
excludes: '',
limit: 10,
max_depth: 2,
}

const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
const { t } = useTranslation()
const { setShowAccountSettingModal } = useModalContext()
Expand All @@ -37,6 +46,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
}

const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
const [crawlOptions, setCrawlOptions] = useState<CrawlOptions>(DEFAULT_CRAWL_OPTIONS)

const updateFileList = (preparedFiles: FileItem[]) => {
setFiles(preparedFiles)
}
Expand Down Expand Up @@ -127,6 +138,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
websitePages={websitePages}
updateWebsitePages={setWebsitePages}
onFireCrawlJobIdChange={setFireCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
</div>
{(step === 2 && (!datasetId || (datasetId && !!detail))) && <StepTwo
Expand All @@ -142,6 +155,7 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
onStepChange={changeStep}
updateIndexingTypeCache={updateIndexingTypeCache}
updateResultCache={updateResultCache}
crawlOptions={crawlOptions}
/>}
{step === 3 && <StepThree
datasetId={datasetId}
Expand Down
8 changes: 7 additions & 1 deletion web/app/components/datasets/create/step-one/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import EmptyDatasetCreationModal from '../empty-dataset-creation-modal'
import Website from '../website'
import WebsitePreview from '../website/preview'
import s from './index.module.css'
import type { CrawlResultItem, FileItem } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
import type { NotionPage } from '@/models/common'
import { DataSourceType } from '@/models/datasets'
import Button from '@/app/components/base/button'
Expand All @@ -34,6 +34,8 @@ type IStepOneProps = {
websitePages?: CrawlResultItem[]
updateWebsitePages: (value: CrawlResultItem[]) => void
onFireCrawlJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}

type NotionConnectorProps = {
Expand Down Expand Up @@ -68,6 +70,8 @@ const StepOne = ({
websitePages = [],
updateWebsitePages,
onFireCrawlJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}: IStepOneProps) => {
const { dataset } = useDatasetDetailContext()
const [showModal, setShowModal] = useState(false)
Expand Down Expand Up @@ -221,6 +225,8 @@ const StepOne = ({
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={updateWebsitePages}
onJobIdChange={onFireCrawlJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
</div>
{isShowVectorSpaceFull && (
Expand Down
5 changes: 4 additions & 1 deletion web/app/components/datasets/create/step-two/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import RetrievalMethodInfo from '../../common/retrieval-method-info'
import PreviewItem, { PreviewType } from './preview-item'
import LanguageSelect from './language-select'
import s from './index.module.css'
import type { CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import {
createDocument,
createFirstDocument,
Expand Down Expand Up @@ -58,6 +58,7 @@ type StepTwoProps = {
files: CustomFile[]
notionPages?: NotionPage[]
websitePages?: CrawlResultItem[]
crawlOptions?: CrawlOptions
fireCrawlJobId?: string
onStepChange?: (delta: number) => void
updateIndexingTypeCache?: (type: string) => void
Expand Down Expand Up @@ -86,6 +87,7 @@ const StepTwo = ({
files,
notionPages = [],
websitePages = [],
crawlOptions,
fireCrawlJobId = '',
onStepChange,
updateIndexingTypeCache,
Expand Down Expand Up @@ -252,6 +254,7 @@ const StepTwo = ({
provider: 'firecrawl',
job_id: fireCrawlJobId,
urls: websitePages.map(page => page.source_url),
only_main_content: crawlOptions?.only_main_content,
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ const CrawledResult: FC<Props> = ({
/>
<div>{t(`${I18N_PREFIX}.scrapTimeInfo`, {
total: list.length,
time: '12.4 seconds',
time: '12.4 seconds', // TODO toFixed(1)
})}</div>
</div>
<div className='p-2'>
Expand Down
16 changes: 5 additions & 11 deletions web/app/components/datasets/create/website/firecrawl/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,8 @@ type Props = {
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
}

const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
crawl_sub_pages: true,
only_main_content: true,
includes: '',
excludes: '',
limit: 10,
max_depth: 2,
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}

enum Step {
Expand All @@ -45,6 +38,8 @@ const FireCrawl: FC<Props> = ({
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
Expand All @@ -55,7 +50,6 @@ const FireCrawl: FC<Props> = ({
})
}, [setShowAccountSettingModal])

const [crawlOptions, setCrawlOptions] = useState<CrawlOptions>(DEFAULT_CRAWL_OPTIONS)
const checkValid = useCallback((url: string) => {
let errorMsg = ''
if (!url) {
Expand Down Expand Up @@ -172,7 +166,7 @@ const FireCrawl: FC<Props> = ({
isFilledFull={!isInit}
hasError={isCrawlFinished && crawlHasError}
>
{isInit && <Options className='mt-2' payload={crawlOptions} onChange={setCrawlOptions} />}
{isInit && <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />}
{isRunning
&& <Crawling
className='mt-2'
Expand Down
8 changes: 7 additions & 1 deletion web/app/components/datasets/create/website/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import React, { useCallback, useEffect, useState } from 'react'
import NoData from './no-data'
import Firecrawl from './firecrawl'
import { useModalContext } from '@/context/modal-context'
import type { CrawlResultItem } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchFirecrawlApiKey } from '@/service/datasets'
import { type DataSourceWebsiteItem, WebsiteProvider } from '@/models/common'

Expand All @@ -13,13 +13,17 @@ type Props = {
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}

const Website: FC<Props> = ({
onPreview,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { setShowAccountSettingModal } = useModalContext()
const [isLoaded, setIsLoaded] = useState(false)
Expand Down Expand Up @@ -55,6 +59,8 @@ const Website: FC<Props> = ({
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
Expand Down
2 changes: 1 addition & 1 deletion web/models/datasets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import type { Tag } from '@/app/components/base/tag-management/constant'
export enum DataSourceType {
FILE = 'upload_file',
NOTION = 'notion_import',
WEB = 'website',
WEB = 'website_crawl',
}

export type DataSet = {
Expand Down

0 comments on commit 537f7ec

Please sign in to comment.