From 8552a2cac107685d6dfc8bbe2c996f2d647bf935 Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Wed, 1 May 2024 09:14:17 +0800 Subject: [PATCH] Fix: change max chunk limit exception (#717) (#720) * change max chunk limit exception Signed-off-by: yuye-aws * fix integration tests for two chunking algorithms Signed-off-by: yuye-aws * update changelog Signed-off-by: yuye-aws * add run time parameter string_tobe_chunked_count Signed-off-by: yuye-aws * fix unit test for fixed token length and delimiter algorithm Signed-off-by: yuye-aws * implement unit test for string to be chunked in fixed token length and delimiter algorithm Signed-off-by: yuye-aws * update definition for string to be chunked parameter Signed-off-by: yuye-aws * fix text chunking processor ut Signed-off-by: yuye-aws * add string to be chunked count in text chunking processor Signed-off-by: yuye-aws * add string to be chunked count in text chunking processor Signed-off-by: yuye-aws * add more test cases for text chunking processor Signed-off-by: yuye-aws * remove chunker util Signed-off-by: yuye-aws * change chunk limit check in both algorithms Signed-off-by: yuye-aws * update ut for text chunking processor Signed-off-by: yuye-aws * update parameter name to chunk_string_count Signed-off-by: yuye-aws * run spotless apply Signed-off-by: yuye-aws --------- Signed-off-by: yuye-aws (cherry picked from commit 86b70e0b0fa2e150516f7b15f68bea1d2cf0b0b8) Co-authored-by: Yuye Zhu --- CHANGELOG.md | 1 + .../processor/TextChunkingProcessor.java | 76 +++- .../processor/chunker/Chunker.java | 13 + .../processor/chunker/ChunkerUtil.java | 41 -- .../processor/chunker/DelimiterChunker.java | 8 +- .../chunker/FixedTokenLengthChunker.java | 8 +- .../processor/TextChunkingProcessorTests.java | 354 ++++++++++++++++-- .../chunker/DelimiterChunkerTests.java | 54 ++- .../chunker/FixedTokenLengthChunkerTests.java | 61 +-- 9 files changed, 459 
insertions(+), 157 deletions(-) delete mode 100644 src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 9727bcc3c..ff1976866 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Removed map of subquery to subquery index in favor of storing index as part of disi wrapper to improve hybrid query latencies by 20% ([#711](https://github.com/opensearch-project/neural-search/pull/711)) ### Bug Fixes - Add support for request_cache flag in hybrid query ([#663](https://github.com/opensearch-project/neural-search/pull/663)) +- Avoid throwing the max_chunk_limit exceeded exception in text chunking processor ([#717](https://github.com/opensearch-project/neural-search/pull/717)) ### Infrastructure ### Documentation ### Maintenance diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java index 50a9d4b7b..555310627 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java @@ -12,6 +12,7 @@ import java.util.List; import java.util.Objects; +import org.apache.commons.lang3.StringUtils; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.env.Environment; import org.opensearch.index.IndexService; @@ -30,6 +31,7 @@ import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT; import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD; import static 
org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter; /** @@ -170,9 +172,11 @@ public IngestDocument execute(final IngestDocument ingestDocument) { // fixed token length algorithm needs runtime parameter max_token_count for tokenization Map runtimeParameters = new HashMap<>(); int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap); + int chunkStringCount = getChunkStringCountFromMap(sourceAndMetadataMap, fieldMap); runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); - chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, 0); + runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount); + chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters); return ingestDocument; } @@ -230,13 +234,51 @@ private void validateListTypeValue(final String sourceKey, final Object sourceVa } @SuppressWarnings("unchecked") - private int chunkMapType( + private int getChunkStringCountFromMap(Map sourceAndMetadataMap, final Map fieldMap) { + int chunkStringCount = 0; + for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { + String originalKey = fieldMapEntry.getKey(); + Object targetKey = fieldMapEntry.getValue(); + if (targetKey instanceof Map) { + // call this method recursively when target key is a map + Object sourceObject = sourceAndMetadataMap.get(originalKey); + if (sourceObject instanceof List) { + List sourceObjectList = (List) sourceObject; + for (Object source : sourceObjectList) { + if (source instanceof Map) { + chunkStringCount += getChunkStringCountFromMap((Map) source, (Map) targetKey); + } + } + } else if (sourceObject instanceof Map) { + chunkStringCount += getChunkStringCountFromMap((Map) sourceObject, (Map) targetKey); + } + } else { + // chunk the object when target key is of leaf type (null, string and list of string) + Object chunkObject = sourceAndMetadataMap.get(originalKey); + chunkStringCount += 
getChunkStringCountFromLeafType(chunkObject); + } + } + return chunkStringCount; + } + + @SuppressWarnings("unchecked") + private int getChunkStringCountFromLeafType(final Object value) { + // leaf type means null, String or List + // only non-empty strings are counted; the count is 0 when the input is null + if (value instanceof String) { + return StringUtils.isEmpty((String) value) ? 0 : 1; + } else if (isListOfString(value)) { + return (int) ((List) value).stream().filter(s -> !StringUtils.isEmpty(s)).count(); + } + return 0; + } + + @SuppressWarnings("unchecked") + private void chunkMapType( Map sourceAndMetadataMap, final Map fieldMap, - final Map runtimeParameters, - final int chunkCount + final Map runtimeParameters ) { - int updatedChunkCount = chunkCount; for (Map.Entry fieldMapEntry : fieldMap.entrySet()) { String originalKey = fieldMapEntry.getKey(); Object targetKey = fieldMapEntry.getValue(); @@ -247,21 +289,11 @@ private int chunkMapType( List sourceObjectList = (List) sourceObject; for (Object source : sourceObjectList) { if (source instanceof Map) { - updatedChunkCount = chunkMapType( - (Map) source, - (Map) targetKey, - runtimeParameters, - updatedChunkCount - ); + chunkMapType((Map) source, (Map) targetKey, runtimeParameters); } } } else if (sourceObject instanceof Map) { - updatedChunkCount = chunkMapType( - (Map) sourceObject, - (Map) targetKey, - runtimeParameters, - updatedChunkCount - ); + chunkMapType((Map) sourceObject, (Map) targetKey, runtimeParameters); } } else { // chunk the object when target key is of leaf type (null, string and list of string) @@ -270,15 +302,21 @@ private int chunkMapType( sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult); } } - return updatedChunkCount; } /** * Chunk the content, update the runtime max_chunk_limit and return the result */ private List chunkString(final String content, final Map runTimeParameters) { - // update runtime max_chunk_limit if not disabled + // return an empty list for empty string + if 
(StringUtils.isEmpty(content)) { + return List.of(); + } List contentResult = chunker.chunk(content, runTimeParameters); + // update chunk_string_count for each string + int chunkStringCount = parseIntegerParameter(runTimeParameters, CHUNK_STRING_COUNT_FIELD, 1); + runTimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount - 1); + // update runtime max_chunk_limit if not disabled int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) { runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size()); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java index fb6712c76..3fa2eeb7c 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/Chunker.java @@ -14,6 +14,7 @@ public interface Chunker { String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit"; + String CHUNK_STRING_COUNT_FIELD = "chunk_string_count"; int DEFAULT_MAX_CHUNK_LIMIT = 100; int DISABLED_MAX_CHUNK_LIMIT = -1; @@ -33,4 +34,16 @@ public interface Chunker { * @return chunked passages */ List chunk(String content, Map runtimeParameters); + + /** + * Checks whether the chunking results would reach or exceed the max chunk limit after adding a passage. + * Returns true if so; always returns false when max_chunk_limit is disabled. + * + * @param chunkResultSize the size of chunking result + * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize + * @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize + */ + static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) { + return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit; + } } diff --git 
a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java deleted file mode 100644 index d4406f33e..000000000 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/ChunkerUtil.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ -package org.opensearch.neuralsearch.processor.chunker; - -import java.util.Locale; - -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; -import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT; -import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; - -/** - * A util class used by chunking algorithms. - */ -public class ChunkerUtil { - - private ChunkerUtil() {} // no instance of this util class - - /** - * Checks whether the chunking results would exceed the max chunk limit. - * If exceeds, then Throw IllegalStateException - * - * @param chunkResultSize the size of chunking result - * @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize - * @param nonRuntimeMaxChunkLimit non-runtime max_chunk_limit, used to keep exception message consistent - */ - public static void checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int nonRuntimeMaxChunkLimit) { - if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize >= runtimeMaxChunkLimit) { - throw new IllegalArgumentException( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s]. 
This limit can be set by changing the [%s] parameter.", - TYPE, - nonRuntimeMaxChunkLimit, - MAX_CHUNK_LIMIT_FIELD - ) - ); - } - } -} diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java index c688af436..fe2418ee8 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunker.java @@ -49,25 +49,29 @@ public void parseParameters(Map parameters) { * @param content input string * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1. max_chunk_limit field level max chunk limit + * 2. chunk_string_count number of non-empty strings (including itself) which need to be chunked later */ @Override public List chunk(final String content, final Map runtimeParameters) { int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1); List chunkResult = new ArrayList<>(); int start = 0, end; int nextDelimiterPosition = content.indexOf(delimiter); while (nextDelimiterPosition != -1) { - ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); + if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) { + break; + } end = nextDelimiterPosition + delimiter.length(); chunkResult.add(content.substring(start, end)); start = end; nextDelimiterPosition = content.indexOf(delimiter, start); } + // add the rest content into the chunk result if (start < content.length()) { - ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); chunkResult.add(content.substring(start)); } diff --git 
a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index cd630adf1..276e41ac7 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -117,11 +117,13 @@ public void parseParameters(Map parameters) { * @param runtimeParameters a map for runtime parameters, containing the following runtime parameters: * 1. max_token_count the max token limit for the tokenizer * 2. max_chunk_limit field level max chunk limit + * 3. chunk_string_count number of non-empty strings (including itself) which need to be chunked later */ @Override public List chunk(final String content, final Map runtimeParameters) { int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT); int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, this.maxChunkLimit); + int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1); List tokens = tokenize(content, tokenizer, maxTokenCount); List chunkResult = new ArrayList<>(); @@ -131,13 +133,17 @@ public List chunk(final String content, final Map runtim int overlapTokenNumber = (int) Math.floor(tokenLimit * overlapRate); while (startTokenIndex < tokens.size()) { - ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, maxChunkLimit); if (startTokenIndex == 0) { // include all characters till the start if no previous passage startContentPosition = 0; } else { startContentPosition = tokens.get(startTokenIndex).getStartOffset(); } + if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) { + // include all characters till the end if exceeds max chunk limit + 
chunkResult.add(content.substring(startContentPosition)); + break; + } if (startTokenIndex + tokenLimit >= tokens.size()) { // include all characters till the end if no next passage endContentPosition = content.length(); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java index 934918e18..7109dcb41 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java @@ -14,6 +14,7 @@ import java.util.Locale; import java.util.Map; import java.util.Objects; +import java.util.Set; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -122,12 +123,18 @@ private Map createStringFieldMap() { return fieldMap; } - private Map createNestedFieldMap() { + private Map createNestedFieldMapSingleField() { Map fieldMap = new HashMap<>(); fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD, OUTPUT_FIELD)); return fieldMap; } + private Map createNestedFieldMapMultipleField() { + Map fieldMap = new HashMap<>(); + fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD + "_1", OUTPUT_FIELD + "_1", INPUT_FIELD + "_2", OUTPUT_FIELD + "_2")); + return fieldMap; + } + @SneakyThrows private TextChunkingProcessor createDefaultAlgorithmInstance(Map fieldMap) { Map config = new HashMap<>(); @@ -297,6 +304,7 @@ private List createSourceDataListStrings() { documents.add( "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); + documents.add(""); documents.add( "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); @@ -330,12 +338,19 @@ private List createSourceDataListWithNull() { return documents; } - private Map createSourceDataNestedMap() { + private Map createSourceDataNestedMapSingleField() { Map documents = new HashMap<>(); documents.put(INPUT_FIELD, createSourceDataString()); return documents; } + private Map createSourceDataNestedMapMultipleField() { + Map documents = new HashMap<>(); + documents.put(INPUT_FIELD + "_1", createSourceDataString()); + documents.put(INPUT_FIELD + "_2", createSourceDataString()); + return documents; + } + private Map createSourceDataInvalidNestedMap() { Map documents = new HashMap<>(); documents.put(INPUT_FIELD, Map.of(INPUT_NESTED_FIELD_KEY, 1)); @@ -416,51 +431,130 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkLimi } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLimit_thenFail() { + public void testExecute_withFixedTokenLength_andSourceDataStringExceedMaxChunkLimit_thenLastPassageGetConcatenated() { int maxChunkLimit = 1; TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> processor.execute(ingestDocument) + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = List.of( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", - TYPE, - maxChunkLimit - ) - )); + assertEquals(expectedPassages, passages); } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimit_thenFail() { + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimitFive_thenLastPassageGetConcatenated() { int maxChunkLimit = 5; TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> processor.execute(ingestDocument) + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + expectedPassages.add("This is the second document to be chunked. 
The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimitFour_thenLastPassageGetConcatenated() { + int maxChunkLimit = 4; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + expectedPassages.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", - TYPE, - maxChunkLimit - ) - )); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimitThree_thenLastPassageGetConcatenated() { + int maxChunkLimit = 3; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); + expectedPassages.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimitTwo_thenLastPassageGetConcatenated() { + int maxChunkLimit = 2; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + expectedPassages.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataListExceedMaxChunkLimitOne_thenLastPassageGetConcatenated() { + int maxChunkLimit = 1; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + expectedPassages.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + assertEquals(expectedPassages, passages); } @SneakyThrows - public void testExecute_withFixedTokenLength_andSourceDataListDisabledMaxChunkLimit_thenFail() { + public void testExecute_withFixedTokenLength_andSourceDataListDisabledMaxChunkLimit_thenSuccessful() { int maxChunkLimit = -1; TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(createStringFieldMap(), maxChunkLimit); IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataListStrings()); - processor.execute(ingestDocument); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + expectedPassages.add("This is the second document to be chunked. 
The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by "); + expectedPassages.add("standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); } @SneakyThrows @@ -559,9 +653,9 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail( @SuppressWarnings("unchecked") @SneakyThrows - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() { - TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); - IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap()); + public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapSingleField()); IngestDocument document = processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); @@ -577,9 +671,193 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() assertEquals(expectedPassages, passages); } + @SneakyThrows + @SuppressWarnings("unchecked") + public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapMultipleField()); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) 
nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by ", + "standard tokenizer in OpenSearch." + ); + assertEquals(expectedPassages, passages1); + assertEquals(expectedPassages, passages2); + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public + void + testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitFive_thenLastPassageGetConcatenated() { + int maxChunkLimit = 5; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit( + createNestedFieldMapMultipleField(), + maxChunkLimit + ); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages1 = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by ", + "standard tokenizer in OpenSearch." 
+ ); + List expectedPassages2 = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + Set> passages = Set.of((List) passages1, (List) passages2); + Set> expectedPassages = Set.of(expectedPassages1, expectedPassages2); + assertEquals(passages, expectedPassages); + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public + void + testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitFour_thenLastPassageGetConcatenated() { + int maxChunkLimit = 4; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit( + createNestedFieldMapMultipleField(), + maxChunkLimit + ); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages1 = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by ", + "standard tokenizer in OpenSearch." + ); + List expectedPassages2 = List.of( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + Set> passages = Set.of((List) passages1, (List) passages2); + Set> expectedPassages = Set.of(expectedPassages1, expectedPassages2); + assertEquals(passages, expectedPassages); + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public + void + testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitThree_thenLastPassageGetConcatenated() { + int maxChunkLimit = 3; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit( + createNestedFieldMapMultipleField(), + maxChunkLimit + ); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages1 = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + List expectedPassages2 = List.of( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + Set> passages = Set.of((List) passages1, (List) passages2); + Set> expectedPassages = Set.of(expectedPassages1, expectedPassages2); + assertEquals(passages, expectedPassages); + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitTwo_thenLastPassageGetConcatenated() { + int maxChunkLimit = 2; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit( + createNestedFieldMapMultipleField(), + maxChunkLimit + ); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages = List.of( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + assertEquals(passages1, expectedPassages); + assertEquals(passages2, expectedPassages); + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitOne_thenLastPassageGetConcatenated() { + int maxChunkLimit = 1; + TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit( + createNestedFieldMapMultipleField(), + maxChunkLimit + ); + IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); + Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY); + assert (nestedResult instanceof Map); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_1"); + assert ((Map) nestedResult).containsKey(OUTPUT_FIELD + "_2"); + Object passages1 = ((Map) nestedResult).get(OUTPUT_FIELD + "_1"); + Object passages2 = ((Map) nestedResult).get(OUTPUT_FIELD + "_2"); + assert (passages1 instanceof List); + assert (passages2 instanceof List); + + List expectedPassages = List.of( + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ ); + assertEquals(passages1, expectedPassages); + assertEquals(passages2, expectedPassages); + } + @SneakyThrows public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_thenFail() { - TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0)); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -592,8 +870,8 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_then } @SneakyThrows - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { - TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_thenFail() { + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap()); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, @@ -607,8 +885,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() { @SneakyThrows @SuppressWarnings("unchecked") - public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList_thenSucceed() { - TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap()); + public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_sourceDataList_thenSucceed() { + TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField()); IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap()); IngestDocument document = 
processor.execute(ingestDocument); assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 54e296861..e4c2a5c05 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor.chunker; +import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; @@ -11,8 +12,8 @@ import org.junit.Assert; import org.opensearch.test.OpenSearchTestCase; -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; public class DelimiterChunkerTests extends OpenSearchTestCase { @@ -76,23 +77,15 @@ public void testChunk_withDoubleNewlineDelimiter_thenSucceed() { assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } - public void testChunk_whenExceedMaxChunkLimit_thenFail() { + public void testChunk_whenExceedMaxChunkLimit_thenLastPassageGetConcatenated() { int maxChunkLimit = 2; DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); String content = "\n\na\n\n\n"; - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> chunker.chunk(content, Map.of()) - ); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of 
[%s].", - TYPE, - maxChunkLimit - ) - )); + List passages = chunker.chunk(content, Map.of()); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("\n\n"); + expectedPassages.add("a\n\n\n"); + assertEquals(expectedPassages, passages); } public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { @@ -103,23 +96,28 @@ public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { assertEquals(List.of("\n\n", "a\n\n", "\n"), chunkResult); } - public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() { + public void testChunk_whenExceedRuntimeMaxChunkLimit_thenLastPassageGetConcatenated() { int maxChunkLimit = 3; DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); String content = "\n\na\n\n\n"; int runtimeMaxChunkLimit = 2; - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> chunker.chunk(content, Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit)) + List passages = chunker.chunk(content, Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit)); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("\n\n"); + expectedPassages.add("a\n\n\n"); + assertEquals(expectedPassages, passages); + } + + public void testChunk_whenExceedRuntimeMaxChunkLimit_withTwoStringsTobeChunked_thenLastPassageGetConcatenated() { + int maxChunkLimit = 3; + DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit)); + String content = "\n\na\n\n\n"; + int runtimeMaxChunkLimit = 2, chunkStringCount = 2; + List passages = chunker.chunk( + content, + Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit, CHUNK_STRING_COUNT_FIELD, chunkStringCount) ); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", - TYPE, - maxChunkLimit - ) - )); + List 
expectedPassages = List.of("\n\na\n\n\n"); + assertEquals(expectedPassages, passages); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index bbcaa7069..d2a607a5b 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,8 +23,8 @@ import java.util.Map; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; -import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE; import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD; import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; @@ -235,7 +235,7 @@ public void testChunk_withOverlapRateHalf_thenSucceed() { assertEquals(expectedPassages, passages); } - public void testChunk_whenExceedMaxChunkLimit_thenFail() { + public void testChunk_whenExceedMaxChunkLimit_thenLastPassageGetConcatenated() { int maxChunkLimit = 2; Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); @@ -246,19 +246,11 @@ public void testChunk_whenExceedMaxChunkLimit_thenFail() { runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); String content = "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> fixedTokenLengthChunker.chunk(content, runtimeParameters) - ); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", - TYPE, - maxChunkLimit - ) - )); + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); } public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { @@ -280,7 +272,7 @@ public void testChunk_whenWithinMaxChunkLimit_thenSucceed() { assertEquals(expectedPassages, passages); } - public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() { + public void testChunk_whenExceedRuntimeMaxChunkLimit_thenLastPassageGetConcatenated() { int maxChunkLimit = 3, runtimeMaxChunkLimit = 2; Map parameters = new HashMap<>(); parameters.put(TOKEN_LIMIT_FIELD, 10); @@ -292,18 +284,31 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_thenFail() { runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit); String content = "This is an example document to be chunked. 
The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; - IllegalArgumentException illegalArgumentException = assertThrows( - IllegalArgumentException.class, - () -> fixedTokenLengthChunker.chunk(content, runtimeParameters) + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked. The document "); + expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); + } + + public void testChunk_whenExceedRuntimeMaxChunkLimit_withOneStringTobeChunked_thenLastPassageGetConcatenated() { + int maxChunkLimit = 3, runtimeMaxChunkLimit = 2, chunkStringCount = 1; + Map parameters = new HashMap<>(); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(TOKENIZER_FIELD, "standard"); + parameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit); + FixedTokenLengthChunker fixedTokenLengthChunker = createFixedTokenLengthChunker(parameters); + Map runtimeParameters = new HashMap<>(); + runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000); + runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit); + runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount); + String content = + "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + List passages = fixedTokenLengthChunker.chunk(content, runtimeParameters); + List expectedPassages = List.of( + "This is an example document to be chunked. The document ", + "contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
); - assert (illegalArgumentException.getMessage() - .contains( - String.format( - Locale.ROOT, - "The number of chunks produced by %s processor has exceeded the allowed maximum of [%s].", - TYPE, - maxChunkLimit - ) - )); + assertEquals(expectedPassages, passages); } }