Skip to content

Commit

Permalink
Sync INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH between API and Web (#11230)
Browse files Browse the repository at this point in the history
  • Loading branch information
fujita-h authored Dec 2, 2024
1 parent f8c966c commit 1d8385f
Show file tree
Hide file tree
Showing 29 changed files with 51 additions and 40 deletions.
4 changes: 2 additions & 2 deletions api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ LOG_DATEFORMAT=%Y-%m-%d %H:%M:%S
LOG_TZ=UTC

# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

# Workflow runtime configuration
WORKFLOW_MAX_EXECUTION_STEPS=500
Expand Down Expand Up @@ -413,4 +413,4 @@ RESET_PASSWORD_TOKEN_EXPIRY_MINUTES=5

CREATE_TIDB_SERVICE_JOB_ENABLED=false

RETRIEVAL_TOP_N=0
RETRIEVAL_TOP_N=0
2 changes: 1 addition & 1 deletion api/configs/feature/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ class IndexingConfig(BaseSettings):

INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: PositiveInt = Field(
description="Maximum token length for text segmentation during indexing",
default=1000,
default=4000,
)


Expand Down
3 changes: 2 additions & 1 deletion api/controllers/console/datasets/datasets_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def get(self):
# get default rules
mode = DocumentService.DEFAULT_RULES["mode"]
rules = DocumentService.DEFAULT_RULES["rules"]
limits = DocumentService.DEFAULT_RULES["limits"]
if document_id:
# get the latest process rule
document = Document.query.get_or_404(document_id)
Expand All @@ -132,7 +133,7 @@ def get(self):
mode = dataset_process_rule.mode
rules = dataset_process_rule.rules_dict

return {"mode": mode, "rules": rules}
return {"mode": mode, "rules": rules, "limits": limits}


class DatasetDocumentListApi(Resource):
Expand Down
3 changes: 3 additions & 0 deletions api/services/dataset_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,9 @@ class DocumentService:
],
"segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
},
"limits": {
"indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
},
}

DOCUMENT_METADATA_SCHEMA = {
Expand Down
2 changes: 1 addition & 1 deletion docker-legacy/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ services:
SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128'
SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128'
# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 4000
depends_on:
- db
- redis
Expand Down
2 changes: 1 addition & 1 deletion docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ SMTP_OPPORTUNISTIC_TLS=false
# ------------------------------

# Maximum length of segmentation tokens for indexing
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

# Member invitation link valid time (hours),
# Default: 72.
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ x-shared-env: &shared-api-worker-env
SMTP_OPPORTUNISTIC_TLS: ${SMTP_OPPORTUNISTIC_TLS:-false}
RESEND_API_KEY: ${RESEND_API_KEY:-your-resend-api-key}
RESEND_API_URL: ${RESEND_API_URL:-https://api.resend.com}
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-1000}
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-4000}
INVITE_EXPIRY_HOURS: ${INVITE_EXPIRY_HOURS:-72}
RESET_PASSWORD_TOKEN_EXPIRY_MINUTES: ${RESET_PASSWORD_TOKEN_EXPIRY_MINUTES:-5}
CODE_EXECUTION_ENDPOINT: ${CODE_EXECUTION_ENDPOINT:-http://sandbox:8194}
Expand Down
28 changes: 15 additions & 13 deletions web/app/components/datasets/create/step-two/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ const StepTwo = ({
const setSegmentIdentifier = useCallback((value: string) => {
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
}, [])
const [max, setMax] = useState(4000) // default chunk length
const [maxChunkLength, setMaxChunkLength] = useState(4000) // default chunk length
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
const [overlap, setOverlap] = useState(50)
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
Expand Down Expand Up @@ -196,7 +197,7 @@ const StepTwo = ({
const resetRules = () => {
if (defaultConfig) {
setSegmentIdentifier(defaultConfig.segmentation.separator)
setMax(defaultConfig.segmentation.max_tokens)
setMaxChunkLength(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap)
setRules(defaultConfig.pre_processing_rules)
}
Expand All @@ -212,8 +213,8 @@ const StepTwo = ({
}

const confirmChangeCustomConfig = () => {
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
return
}
setCustomFileIndexingEstimate(null)
Expand All @@ -234,7 +235,7 @@ const StepTwo = ({
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: max,
max_tokens: maxChunkLength,
chunk_overlap: overlap,
},
}
Expand Down Expand Up @@ -339,12 +340,12 @@ const StepTwo = ({
)
const getCreationParams = () => {
let params
if (segmentationType === SegmentType.CUSTOM && overlap > max) {
if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
return
}
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
return
}
if (isSetting) {
Expand Down Expand Up @@ -415,7 +416,8 @@ const StepTwo = ({
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
const separator = res.rules.segmentation.separator
setSegmentIdentifier(separator)
setMax(res.rules.segmentation.max_tokens)
setMaxChunkLength(res.rules.segmentation.max_tokens)
setLimitMaxChunkLength(res.limits.indexing_max_segmentation_tokens_length)
setOverlap(res.rules.segmentation.chunk_overlap)
setRules(res.rules.pre_processing_rules)
setDefaultConfig(res.rules)
Expand All @@ -432,7 +434,7 @@ const StepTwo = ({
const max = rules.segmentation.max_tokens
const overlap = rules.segmentation.chunk_overlap
setSegmentIdentifier(separator)
setMax(max)
setMaxChunkLength(max)
setOverlap(overlap)
setRules(rules.pre_processing_rules)
setDefaultConfig(rules)
Expand Down Expand Up @@ -670,10 +672,10 @@ const StepTwo = ({
type="number"
className='h-9'
placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
value={max}
max={4000}
value={maxChunkLength}
max={limitMaxChunkLength}
min={1}
onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
onChange={e => setMaxChunkLength(parseInt(e.target.value.replace(/^0+/, ''), 10))}
/>
</div>
</div>
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/de-DE/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ const translation = {
websiteSource: 'Preprocess-Website',
webpageUnit: 'Seiten',
separatorTip: 'Ein Trennzeichen ist das Zeichen, das zum Trennen von Text verwendet wird. \\n\\n und \\n sind häufig verwendete Trennzeichen zum Trennen von Absätzen und Zeilen. In Kombination mit Kommas (\\n\\n,\\n) werden Absätze nach Zeilen segmentiert, wenn die maximale Blocklänge überschritten wird. Sie können auch spezielle, von Ihnen selbst definierte Trennzeichen verwenden (z. B. ***).',
maxLengthCheck: 'Die maximale Stücklänge sollte weniger als 4000 betragen',
maxLengthCheck: 'Die maximale Stücklänge sollte weniger als {{limit}} betragen',
},
stepThree: {
creationTitle: '🎉 Wissen erstellt',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/en-US/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ const translation = {
separatorTip: 'A delimiter is the character used to separate text. \\n\\n and \\n are commonly used delimiters for separating paragraphs and lines. Combined with commas (\\n\\n,\\n), paragraphs will be segmented by lines when exceeding the maximum chunk length. You can also use special delimiters defined by yourself (e.g. ***).',
separatorPlaceholder: '\\n\\n for separating paragraphs; \\n for separating lines',
maxLength: 'Maximum chunk length',
maxLengthCheck: 'Maximum chunk length should be less than 4000',
maxLengthCheck: 'Maximum chunk length should be less than {{limit}}',
overlap: 'Chunk overlap',
overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieve effect. It is recommended to set 10%-25% of the maximum chunk size.',
overlapCheck: 'chunk overlap should not bigger than maximum chunk length',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/es-ES/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ const translation = {
retrievalSettingTip: 'Para cambiar el método de índice, por favor ve a la ',
datasetSettingLink: 'configuración del conocimiento.',
separatorTip: 'Un delimitador es el carácter que se utiliza para separar el texto. \\n\\n y \\n son delimitadores comúnmente utilizados para separar párrafos y líneas. Combinado con comas (\\n\\n,\\n), los párrafos se segmentarán por líneas cuando excedan la longitud máxima del fragmento. También puede utilizar delimitadores especiales definidos por usted mismo (por ejemplo, ***).',
maxLengthCheck: 'La longitud máxima del fragmento debe ser inferior a 4000',
maxLengthCheck: 'La longitud máxima del fragmento debe ser inferior a {{limit}}',
},
stepThree: {
creationTitle: '🎉 Conocimiento creado',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/fa-IR/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ const translation = {
retrievalSettingTip: 'برای تغییر روش شاخص، لطفاً به',
datasetSettingLink: 'تنظیمات دانش بروید.',
separatorTip: 'جداکننده نویسه ای است که برای جداسازی متن استفاده می شود. \\n\\n و \\n معمولا برای جداسازی پاراگراف ها و خطوط استفاده می شوند. همراه با کاما (\\n\\n,\\n)، پاراگراف ها زمانی که از حداکثر طول تکه فراتر می روند، با خطوط تقسیم بندی می شوند. همچنین می توانید از جداکننده های خاصی که توسط خودتان تعریف شده اند استفاده کنید (مثلا ***).',
maxLengthCheck: 'حداکثر طول تکه باید کمتر از 4000 باشد',
maxLengthCheck: 'حداکثر طول تکه باید کمتر از {{limit}} باشد',
},
stepThree: {
creationTitle: ' دانش ایجاد شد',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/fr-FR/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ const translation = {
webpageUnit: 'Pages',
websiteSource: 'Site web de prétraitement',
separatorTip: 'Un délimiteur est le caractère utilisé pour séparer le texte. \\n\\n et \\n sont des délimiteurs couramment utilisés pour séparer les paragraphes et les lignes. Combiné à des virgules (\\n\\n,\\n), les paragraphes seront segmentés par des lignes lorsqu’ils dépasseront la longueur maximale des morceaux. Vous pouvez également utiliser des délimiteurs spéciaux définis par vous-même (par exemple ***).',
maxLengthCheck: 'La longueur maximale des morceaux doit être inférieure à 4000',
maxLengthCheck: 'La longueur maximale des morceaux doit être inférieure à {{limit}}',
},
stepThree: {
creationTitle: '🎉 Connaissance créée',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/hi-IN/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ const translation = {
retrievalSettingTip: 'इंडेक्स विधि बदलने के लिए, कृपया जाएं ',
datasetSettingLink: 'ज्ञान सेटिंग्स।',
separatorTip: 'एक सीमांकक पाठ को अलग करने के लिए उपयोग किया जाने वाला वर्ण है। \\n\\n और \\n आमतौर पर पैराग्राफ और लाइनों को अलग करने के लिए उपयोग किए जाने वाले सीमांकक हैं। अल्पविराम (\\n\\n,\\n) के साथ संयुक्त, अधिकतम खंड लंबाई से अधिक होने पर अनुच्छेदों को पंक्तियों द्वारा खंडित किया जाएगा। आप स्वयं द्वारा परिभाषित विशेष सीमांकक का भी उपयोग कर सकते हैं (उदा. ***).',
maxLengthCheck: 'अधिकतम चंक लंबाई 4000 से कम होनी चाहिए',
maxLengthCheck: 'अधिकतम चंक लंबाई {{limit}} से कम होनी चाहिए',
},
stepThree: {
creationTitle: '🎉 ज्ञान बनाया गया',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/it-IT/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ const translation = {
retrievalSettingTip: 'Per cambiare il metodo di indicizzazione, vai alle ',
datasetSettingLink: 'impostazioni della Conoscenza.',
separatorTip: 'Un delimitatore è il carattere utilizzato per separare il testo. \\n\\n e \\n sono delimitatori comunemente usati per separare paragrafi e righe. In combinazione con le virgole (\\n\\n,\\n), i paragrafi verranno segmentati per righe quando superano la lunghezza massima del blocco. È inoltre possibile utilizzare delimitatori speciali definiti dall\'utente (ad es. ***).',
maxLengthCheck: 'La lunghezza massima del blocco deve essere inferiore a 4000',
maxLengthCheck: 'La lunghezza massima del blocco deve essere inferiore a {{limit}}',
},
stepThree: {
creationTitle: '🎉 Conoscenza creata',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/ja-JP/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ const translation = {
retrievalSettingTip: '検索方法を変更するには、',
datasetSettingLink: 'ナレッジ設定',
separatorTip: '区切り文字は、テキストを区切るために使用される文字です。\\n\\n と \\n は、段落と行を区切るために一般的に使用される区切り記号です。カンマ (\\n\\n,\\n) と組み合わせると、最大チャンク長を超えると、段落は行で区切られます。自分で定義した特別な区切り文字を使用することもできます(例:***)。',
maxLengthCheck: 'チャンクの最大長は 4000 未満にする必要があります',
maxLengthCheck: 'チャンクの最大長は {{limit}} 未満にする必要があります',
},
stepThree: {
creationTitle: '🎉 ナレッジが作成されました',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/ko-KR/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ const translation = {
webpageUnit: '페이지',
websiteSource: '웹 사이트 전처리',
separatorTip: '구분 기호는 텍스트를 구분하는 데 사용되는 문자입니다. \\n\\n 및 \\n은 단락과 줄을 구분하는 데 일반적으로 사용되는 구분 기호입니다. 쉼표(\\n\\n,\\n)와 함께 사용하면 최대 청크 길이를 초과할 경우 단락이 줄로 분할됩니다. 직접 정의한 특수 구분 기호(예: ***)를 사용할 수도 있습니다.',
maxLengthCheck: '최대 청크 길이는 4000 미만이어야 합니다.',
maxLengthCheck: '최대 청크 길이는 {{limit}} 미만이어야 합니다.',
},
stepThree: {
creationTitle: '🎉 지식이 생성되었습니다',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/pl-PL/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ const translation = {
webpageUnit: 'Stron',
websiteSource: 'Witryna internetowa przetwarzania wstępnego',
separatorTip: 'Ogranicznik to znak używany do oddzielania tekstu. \\n\\n i \\n są powszechnie używanymi ogranicznikami do oddzielania akapitów i wierszy. W połączeniu z przecinkami (\\n\\n,\\n), akapity będą segmentowane wierszami po przekroczeniu maksymalnej długości fragmentu. Możesz również skorzystać ze zdefiniowanych przez siebie specjalnych ograniczników (np. ***).',
maxLengthCheck: 'Maksymalna długość porcji powinna być mniejsza niż 4000',
maxLengthCheck: 'Maksymalna długość porcji powinna być mniejsza niż {{limit}}',
},
stepThree: {
creationTitle: '🎉 Utworzono Wiedzę',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/pt-BR/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ const translation = {
websiteSource: 'Site de pré-processamento',
webpageUnit: 'Páginas',
separatorTip: 'Um delimitador é o caractere usado para separar o texto. \\n\\n e \\n são delimitadores comumente usados para separar parágrafos e linhas. Combinado com vírgulas (\\n\\n,\\n), os parágrafos serão segmentados por linhas ao exceder o comprimento máximo do bloco. Você também pode usar delimitadores especiais definidos por você (por exemplo, ***).',
maxLengthCheck: 'O comprimento máximo do chunk deve ser inferior a 4000',
maxLengthCheck: 'O comprimento máximo do chunk deve ser inferior a {{limit}}',
},
stepThree: {
creationTitle: '🎉 Conhecimento criado',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/ro-RO/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ const translation = {
webpageUnit: 'Pagini',
websiteSource: 'Site-ul web de preprocesare',
separatorTip: 'Un delimitator este caracterul folosit pentru a separa textul. \\n\\n și \\n sunt delimitatori utilizați în mod obișnuit pentru separarea paragrafelor și liniilor. Combinate cu virgule (\\n\\n,\\n), paragrafele vor fi segmentate pe linii atunci când depășesc lungimea maximă a bucății. De asemenea, puteți utiliza delimitatori speciali definiți de dumneavoastră (de exemplu, ***).',
maxLengthCheck: 'Lungimea maximă a bucății trebuie să fie mai mică de 4000',
maxLengthCheck: 'Lungimea maximă a bucății trebuie să fie mai mică de {{limit}}',
},
stepThree: {
creationTitle: '🎉 Cunoștință creată',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/ru-RU/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ const translation = {
retrievalSettingTip: 'Чтобы изменить метод индексации, пожалуйста, перейдите в ',
datasetSettingLink: 'настройки базы знаний.',
separatorTip: 'Разделитель — это символ, используемый для разделения текста. \\n\\n и \\n — это часто используемые разделители для разделения абзацев и строк. В сочетании с запятыми (\\n\\n,\\n) абзацы будут сегментированы по строкам, если максимальная длина блока превышает их. Вы также можете использовать специальные разделители, определенные вами (например, ***).',
maxLengthCheck: 'Максимальная длина блока должна быть меньше 4000',
maxLengthCheck: 'Максимальная длина блока должна быть меньше {{limit}}',
},
stepThree: {
creationTitle: '🎉 База знаний создана',
Expand Down
2 changes: 1 addition & 1 deletion web/i18n/sl-SI/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ const translation = {
indexSettingTip: 'Če želite spremeniti način indeksiranja in model vdelave, pojdite na ',
retrievalSettingTip: 'Če želite spremeniti nastavitve iskanja, pojdite na ',
datasetSettingLink: 'nastavitve Znanja.',
maxLengthCheck: 'Največja dolžina kosa mora biti manjša od 4000',
maxLengthCheck: 'Največja dolžina kosa mora biti manjša od {{limit}}',
},
stepThree: {
creationTitle: '🎉 Znanje ustvarjeno',
Expand Down
Loading

0 comments on commit 1d8385f

Please sign in to comment.