diff --git a/service/Core/DataFormats/Text/TextChunker2.cs b/service/Core/DataFormats/Text/TextChunker2.cs
new file mode 100644
index 000000000..f8e7ce6d9
--- /dev/null
+++ b/service/Core/DataFormats/Text/TextChunker2.cs
@@ -0,0 +1,434 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Text;
+using Microsoft.KernelMemory.AI.OpenAI;
+
+namespace Microsoft.KernelMemory.DataFormats.Text;
+
+///
+/// Split text in chunks, attempting to leave meaning intact.
+/// For plain text, split looking at new lines first, then periods, and so on.
+/// For markdown, split looking at punctuation first, and so on.
+///
+[Experimental("KMEXP00")]
+public static class TextChunker2
+{
+ /// <summary>
+ /// A piece of text to be split, together with an optional tag. Content that cannot be divided
+ /// in pages can simply be sent as a single ChunkInfo holding all the text in one record.
+ /// </summary>
+ /// <param name="Content">The text of the chunk.</param>
+ /// <param name="Tag">A simple object that will be added on the extracted chunk, it is a simple object
+ /// because the caller can use Page Number or whatever data it needs.</param>
+ public record ChunkInfo(string Content, object? Tag)
+ {
+ /// <summary>
+ /// Converting a chunk to a string simply returns its content.
+ /// This makes it simpler to build TextChunker2 on top of TextChunker.
+ /// </summary>
+ /// <returns>The chunk content.</returns>
+ public override string ToString()
+ {
+ return this.Content;
+ }
+ };
+
+ private static readonly char[] s_spaceChar = { ' ' }; // word separator used when merging the last two paragraphs
+ private static readonly string?[] s_plaintextSplitOptions = { "\n\r", ".", "?!", ";", ":", ",", ")]}", " ", "-", null }; // separator sets tried in order; null (empty set) cuts at the midpoint
+ private static readonly string?[] s_markdownSplitOptions = { ".", "?!", ";", ":", ",", ")]}", " ", "-", "\n\r", null }; // same, but punctuation is tried before line breaks
+
+    /// <summary>
+    /// Split plain text into lines.
+    /// </summary>
+    /// <param name="text">Text to split</param>
+    /// <param name="tag">Tag to associate to the split</param>
+    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
+    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
+    /// <returns>List of lines.</returns>
+    public static List<ChunkInfo> SplitPlainTextLines(
+        string text,
+        object? tag,
+        int maxTokensPerLine,
+        TextChunker.TokenCounter? tokenCounter = null) =>
+        InternalSplitLines(
+            new ChunkInfo(text, tag),
+            maxTokensPerLine,
+            trim: true,
+            s_plaintextSplitOptions, tokenCounter);
+
+    /// <summary>
+    /// Split markdown text into lines.
+    /// </summary>
+    /// <param name="text">Text to split</param>
+    /// <param name="tag">Tag to associate to the split</param>
+    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
+    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
+    /// <returns>List of lines.</returns>
+    public static List<ChunkInfo> SplitMarkDownLines(
+        string text,
+        object? tag,
+        int maxTokensPerLine,
+        TextChunker.TokenCounter? tokenCounter = null) =>
+        InternalSplitLines(
+            new ChunkInfo(text, tag),
+            maxTokensPerLine,
+            trim: true,
+            s_markdownSplitOptions, tokenCounter);
+
+    /// <summary>
+    /// Split plain text into paragraphs.
+    /// Note: in the default KM implementation, one paragraph == one partition.
+    /// </summary>
+    /// <param name="lines">Lines of text.</param>
+    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
+    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
+    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
+    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
+    /// <returns>List of paragraphs.</returns>
+    public static IReadOnlyCollection<ChunkInfo> SplitPlainTextParagraphs(
+        List<ChunkInfo> lines,
+        int maxTokensPerParagraph,
+        int overlapTokens = 0,
+        string? chunkHeader = null,
+        TextChunker.TokenCounter? tokenCounter = null) =>
+        InternalSplitTextParagraphs(
+            lines,
+            maxTokensPerParagraph,
+            overlapTokens,
+            chunkHeader,
+            static (text, maxTokens, tokenCounter) => InternalSplitLines(
+                text,
+                maxTokens,
+                trim: false,
+                s_plaintextSplitOptions,
+                tokenCounter),
+            tokenCounter);
+
+    /// <summary>
+    /// Split markdown text into paragraphs.
+    /// </summary>
+    /// <param name="lines">Lines of text.</param>
+    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
+    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
+    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
+    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
+    /// <returns>List of paragraphs.</returns>
+    public static IReadOnlyCollection<ChunkInfo> SplitMarkdownParagraphs(
+        List<ChunkInfo> lines,
+        int maxTokensPerParagraph,
+        int overlapTokens = 0,
+        string? chunkHeader = null,
+        TextChunker.TokenCounter? tokenCounter = null) =>
+        InternalSplitTextParagraphs(
+            lines,
+            maxTokensPerParagraph,
+            overlapTokens,
+            chunkHeader,
+            static (text, maxTokens, tokenCounter) => InternalSplitLines(
+                text,
+                maxTokens,
+                trim: false,
+                s_markdownSplitOptions,
+                tokenCounter),
+            tokenCounter);
+
+    // Shared implementation behind SplitPlainTextParagraphs / SplitMarkdownParagraphs: splits over-long lines,
+    // groups the pieces into paragraphs, then applies the overlap and the optional chunk header.
+    private static IReadOnlyCollection<ChunkInfo> InternalSplitTextParagraphs(
+        List<ChunkInfo> lines,
+        int maxTokensPerParagraph,
+        int overlapTokens,
+        string? chunkHeader,
+        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        if (maxTokensPerParagraph <= 0)
+        {
+            throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph));
+        }
+
+        if (maxTokensPerParagraph <= overlapTokens)
+        {
+            throw new ArgumentException("overlapTokens must be smaller than maxTokensPerParagraph", nameof(maxTokensPerParagraph));
+        }
+
+        if (lines.Count == 0)
+        {
+            return Array.Empty<ChunkInfo>();
+        }
+
+        var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0;
+        // Token budget left for the paragraph text once the overlap and the header are reserved.
+        var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens;
+
+        // Split long lines first
+        var truncatedLines = lines
+            .SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter))
+            .ToArray();
+
+        var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter);
+
+        return ProcessParagraphs(
+            paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter);
+    }
+
+    // Groups already-split lines into paragraphs whose token count stays below maxTokensPerParagraph.
+    // Each paragraph is tagged with the tag of the first line that was appended to it.
+    private static List<ChunkInfo> BuildParagraph(
+        ChunkInfo[] truncatedLines,
+        int maxTokensPerParagraph,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        StringBuilder paragraphBuilder = new();
+        List<ChunkInfo> paragraphs = new();
+
+        if (truncatedLines == null || truncatedLines.Length == 0)
+        {
+            return paragraphs;
+        }
+
+        // The paragraph tag is the tag of the first line appended to the current paragraphBuilder.
+        object? paragraphTag = truncatedLines[0].Tag;
+        foreach (ChunkInfo line in truncatedLines)
+        {
+            if (paragraphBuilder.Length > 0)
+            {
+                // The pending paragraph is only measured when the line alone does not already reach the limit.
+                int currentCount = GetTokenCount(line, tokenCounter) + 1;
+                if (currentCount < maxTokensPerParagraph)
+                {
+                    currentCount += GetTokenCount(paragraphBuilder.ToString(), tokenCounter);
+                }
+
+                if (currentCount >= maxTokensPerParagraph)
+                {
+                    // Complete the paragraph and prepare for the next
+                    paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
+                    paragraphBuilder.Clear();
+                    // The next paragraph starts with this line, so it takes this line's tag.
+                    paragraphTag = line.Tag;
+                }
+            }
+
+            paragraphBuilder.AppendLine(line.Content);
+        }
+
+        if (paragraphBuilder.Length > 0)
+        {
+            // Flush whatever is left as the final paragraph; its tag is the tag of the first line
+            // that was appended to it.
+            paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
+        }
+
+        return paragraphs;
+    }
+
+    // Post-processes the paragraphs: merges a too-short last paragraph into the previous one, then prepends the
+    // chunk header and, when overlapTokens > 0, appends the first overlap-sized piece of the following paragraph.
+    private static List<ChunkInfo> ProcessParagraphs(
+        List<ChunkInfo> paragraphs,
+        int adjustedMaxTokensPerParagraph,
+        int overlapTokens,
+        string? chunkHeader,
+        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        // distribute text more evenly in the last paragraphs when the last paragraph is too short.
+        if (paragraphs.Count > 1)
+        {
+            var lastParagraph = paragraphs[paragraphs.Count - 1];
+            var secondLastParagraph = paragraphs[paragraphs.Count - 2];
+
+            if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
+            {
+                var lastParagraphTokens = lastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
+                var secondLastParagraphTokens = secondLastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
+
+                // NOTE(review): these are counts of space-separated words, compared against a token budget - confirm.
+                if (lastParagraphTokens.Length + secondLastParagraphTokens.Length <= adjustedMaxTokensPerParagraph)
+                {
+                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
+                    var newLastParagraph = string.Join(" ", lastParagraphTokens);
+
+                    paragraphs[paragraphs.Count - 2] = new ChunkInfo($"{newSecondLastParagraph} {newLastParagraph}", secondLastParagraph.Tag);
+                    paragraphs.RemoveAt(paragraphs.Count - 1);
+                }
+            }
+        }
+
+        var processedParagraphs = new List<ChunkInfo>();
+        var paragraphStringBuilder = new StringBuilder();
+
+        for (int i = 0; i < paragraphs.Count; i++)
+        {
+            paragraphStringBuilder.Clear();
+
+            if (chunkHeader is not null)
+            {
+                paragraphStringBuilder.Append(chunkHeader);
+            }
+
+            var paragraph = paragraphs[i];
+
+            if (overlapTokens > 0 && i < paragraphs.Count - 1)
+            {
+                var nextParagraph = paragraphs[i + 1];
+                var split = longLinesSplitter(nextParagraph, overlapTokens, tokenCounter);
+
+                paragraphStringBuilder.Append(paragraph.Content);
+
+                if (split.Count != 0)
+                {
+                    paragraphStringBuilder.Append(' ').Append(split[0].Content);
+                }
+            }
+            else
+            {
+                paragraphStringBuilder.Append(paragraph.Content);
+            }
+
+            processedParagraphs.Add(new ChunkInfo(paragraphStringBuilder.ToString(), paragraph.Tag));
+        }
+
+        return processedParagraphs;
+    }
+
+    // Splits a chunk with each separator set in turn, stopping once a pass leaves no piece over the limit; every piece keeps the input tag.
+    private static List<ChunkInfo> InternalSplitLines(
+        ChunkInfo chunkInput,
+        int maxTokensPerLine,
+        bool trim,
+        string?[] splitOptions,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        var result = new List<ChunkInfo>();
+        var text = chunkInput.Content.Replace("\r\n", "\n", StringComparison.Ordinal); // normalize line endings
+        result.Add(new ChunkInfo(text, chunkInput.Tag));
+        for (int i = 0; i < splitOptions.Length; i++)
+        {
+            int count = result.Count; // track where the original input left off
+            var (splits2, inputWasSplit2) = Split(result, maxTokensPerLine, splitOptions[i].AsSpan(), trim, tokenCounter);
+            result.AddRange(splits2);
+            result.RemoveRange(0, count); // remove the original input
+            if (!inputWasSplit2)
+            {
+                break;
+            }
+        }
+
+        return result;
+    }
+
+    // Applies the span-based Split to every chunk, keeping each chunk's tag on its pieces and OR-ing the flags.
+    private static (List<ChunkInfo>, bool) Split(
+        List<ChunkInfo> input,
+        int maxTokens,
+        ReadOnlySpan<char> separators,
+        bool trim,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        bool inputWasSplit = false;
+        List<ChunkInfo> result = new();
+        for (int i = 0; i < input.Count; i++)
+        {
+            var currentInput = input[i];
+            var (splits, split) = Split(currentInput.Content.AsSpan(), currentInput.Content, maxTokens, separators, trim, tokenCounter);
+            result.AddRange(splits.Select(s => new ChunkInfo(s, currentInput.Tag)));
+            inputWasSplit |= split;
+        }
+
+        return (result, inputWasSplit);
+    }
+
+    // Cuts the text in two at a separator near its midpoint (or at the midpoint itself when no separators are given)
+    // and recurses on both halves. The flag is true when a piece exceeded maxTokens and could not be cut further.
+    private static (List<string>, bool) Split(
+        ReadOnlySpan<char> input,
+        string? inputString,
+        int maxTokens,
+        ReadOnlySpan<char> separators,
+        bool trim,
+        TextChunker.TokenCounter? tokenCounter)
+    {
+        Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));
+        List<string> result = new();
+        var inputWasSplit = false;
+        int inputTokenCount = GetTokenCount(inputString ??= input.ToString(), tokenCounter);
+
+        if (inputTokenCount > maxTokens)
+        {
+            inputWasSplit = true;
+            int half = input.Length / 2;
+            int cutPoint = -1;
+
+            if (separators.IsEmpty)
+            {
+                cutPoint = half;
+            }
+            else if (input.Length > 2)
+            {
+                int pos = 0;
+                while (true)
+                {
+                    int index = input.Slice(pos, input.Length - 1 - pos).IndexOfAny(separators);
+                    if (index < 0)
+                    {
+                        break;
+                    }
+
+                    index += pos;
+
+                    if (Math.Abs(half - index) < Math.Abs(half - cutPoint))
+                    {
+                        cutPoint = index + 1;
+                    }
+
+                    pos = index + 1;
+                }
+            }
+
+            if (cutPoint > 0)
+            {
+                var firstHalf = input.Slice(0, cutPoint);
+                var secondHalf = input.Slice(cutPoint);
+                if (trim)
+                {
+                    firstHalf = firstHalf.Trim();
+                    secondHalf = secondHalf.Trim();
+                }
+
+                // Recursion
+                var (splits1, split1) = Split(firstHalf, null, maxTokens, separators, trim, tokenCounter);
+                result.AddRange(splits1);
+                var (splits2, split2) = Split(secondHalf, null, maxTokens, separators, trim, tokenCounter);
+                result.AddRange(splits2);
+
+                inputWasSplit = split1 || split2;
+                return (result, inputWasSplit);
+            }
+        }
+
+        result.Add((inputString is not null, trim) switch
+        {
+            (true, true) => inputString!.Trim(),
+            (true, false) => inputString!,
+            (false, true) => input.Trim().ToString(),
+            (false, false) => input.ToString(),
+        });
+
+        return (result, inputWasSplit);
+    }
+
+ private static int GetTokenCount(ChunkInfo input, TextChunker.TokenCounter? tokenCounter) => GetTokenCount(input.Content, tokenCounter); // counts the tokens of the chunk content only; the tag is ignored
+
+    private static int GetTokenCount(string input, TextChunker.TokenCounter? tokenCounter)
+    {
+        // Use the caller's counter when one is supplied, otherwise the default GPT tokenizer.
+        return tokenCounter is not null ? tokenCounter(input) : DefaultGPTTokenizer.StaticCountTokens(input);
+    }
+}
diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index d6c3f9e25..5235ca84c 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -2,12 +2,14 @@
using System;
using System.Collections.Generic;
+using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.AI.OpenAI;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.Context;
+using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Text;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
@@ -66,6 +68,8 @@ public TextPartitioningHandler(
}
}
+ private record PartitionInfo(string Content, int? PageNumber); // partition text plus the page number it came from, null when not available
+
///
public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(
DataPipeline pipeline, CancellationToken cancellationToken = default)
@@ -97,103 +101,144 @@ public TextPartitioningHandler(
// Track new files being generated (cannot edit originalFile.GeneratedFiles while looping it)
Dictionary newFiles = new();
- foreach (KeyValuePair generatedFile in uploadedFile.GeneratedFiles)
+ List? partitions = null;
+ List sentences;
+ string partitionsMimeType = MimeTypes.PlainText;
+ DataPipeline.GeneratedFileDetails? file = null;
+
+ // Prefer the structured extracted content when available, because it lets us keep the page number of each partition.
+ var extractedContent = uploadedFile.GeneratedFiles.FirstOrDefault(uploadedFile => uploadedFile.Value.ArtifactType == DataPipeline.ArtifactTypes.ExtractedContent);
+ if (extractedContent.Value != null)
{
- var file = generatedFile.Value;
- if (file.AlreadyProcessedBy(this))
- {
- this._log.LogTrace("File {0} already processed by this handler", file.Name);
- continue;
- }
+ BinaryData dataExtractedContent = await this._orchestrator.ReadFileAsync(pipeline, extractedContent.Value.Name, cancellationToken).ConfigureAwait(false);
- // Partition only the original text
- if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
+ var fileContent = dataExtractedContent.ToObjectFromJson();
+
+ if (fileContent != null)
{
- this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
- continue;
- }
+ // The structured content was deserialized successfully: partition it section by section.
+ this._log.LogTrace("File {0} was processed with ExtractedContent {1}", uploadedFile.Name, extractedContent.Value.Name);
+
+ // Split with TextChunker2, which carries the section (page) number along with each chunk.
+ file = extractedContent.Value;
- // Use a different partitioning strategy depending on the file type
- List partitions;
- List sentences;
- BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
- string partitionsMimeType = MimeTypes.PlainText;
+ List chunks = new();
+ foreach (var content in fileContent.Sections)
+ {
+ var stringContent = content.Content;
- // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
- if (partitionContent.ToArray().Length == 0) { continue; }
+ var lines = TextChunker.SplitPlainTextLines(stringContent, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
+ chunks.AddRange(lines.Select(l => new TextChunker2.ChunkInfo(l, content.Number)));
+ }
- switch (file.MimeType)
+ var stringPartitions = TextChunker2.SplitPlainTextParagraphs(chunks, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter);
+ partitions = stringPartitions.Select(c => new PartitionInfo(c.Content, (int?)c.Tag)).ToList();
+ }
+ }
+
+ if (partitions == null)
+ {
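+ // Fallback when no extracted content is usable. NOTE(review): 'file' and 'partitions' are reassigned on every iteration, so only the last match is saved and marked - confirm.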
+ foreach (KeyValuePair generatedFile in uploadedFile.GeneratedFiles)
{
- case MimeTypes.PlainText:
+ file = generatedFile.Value;
+ if (file.AlreadyProcessedBy(this))
{
- this._log.LogDebug("Partitioning text file {0}", file.Name);
- string content = partitionContent.ToString();
- sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
- partitions = TextChunker.SplitPlainTextParagraphs(
- sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter, chunkHeader: chunkHeader);
- break;
+ this._log.LogTrace("File {0} already processed by this handler", file.Name);
+ continue;
}
- case MimeTypes.MarkDown:
+ // Partition only the original text
+ if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
{
- this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
- string content = partitionContent.ToString();
- partitionsMimeType = MimeTypes.MarkDown;
- sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
- partitions = TextChunker.SplitMarkdownParagraphs(
- sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter);
- break;
+ this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
+ continue;
}
- // TODO: add virtual/injectable logic
- // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about
+ // Use a different partitioning strategy depending on the file type
+ BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
- default:
- this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
- // Don't partition other files
- continue;
+ // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
+ if (partitionContent.ToArray().Length == 0) { continue; }
+
+ switch (file.MimeType)
+ {
+ case MimeTypes.PlainText:
+ {
+ this._log.LogDebug("Partitioning text file {0}", file.Name);
+ string content = partitionContent.ToString();
+ sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
+ var stringPartitions = TextChunker.SplitPlainTextParagraphs(
+ sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter);
+
+ partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList();
+ break;
+ }
+
+ case MimeTypes.MarkDown:
+ {
+ this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
+ string content = partitionContent.ToString();
+ partitionsMimeType = MimeTypes.MarkDown;
+ sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
+ var stringPartitions = TextChunker.SplitMarkdownParagraphs(
+ sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter);
+
+ partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList();
+ break;
+ }
+
+ // TODO: add virtual/injectable logic
+ // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about
+
+ default:
+ this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
+ // Don't partition other files
+ continue;
+ }
}
+ }
- if (partitions.Count == 0) { continue; }
+ if (partitions == null || partitions.Count == 0 || file == null) { continue; }
- this._log.LogDebug("Saving {0} file partitions", partitions.Count);
- for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
- {
- // TODO: turn partitions in objects with more details, e.g. page number
- string text = partitions[partitionNumber];
- int sectionNumber = 0; // TODO: use this to store the page number (if any)
- BinaryData textData = new(text);
-
- int tokenCount = this._tokenCounter(text);
- this._log.LogDebug("Partition size: {0} tokens", tokenCount);
+ this._log.LogDebug("Saving {0} file partitions", partitions.Count);
+ for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++)
+ {
+ // Each partition carries its text and, when known, the page number stored below as the section number.
+ var partition = partitions[partitionNumber];
+ string text = partition.Content;
+ int sectionNumber = partition.PageNumber ?? 0;
+ BinaryData textData = new(text);
- var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
- await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);
+ int tokenCount = this._tokenCounter(text);
+ this._log.LogDebug("Partition size: {0} tokens", tokenCount);
- var destFileDetails = new DataPipeline.GeneratedFileDetails
- {
- Id = Guid.NewGuid().ToString("N"),
- ParentId = uploadedFile.Id,
- Name = destFile,
- Size = text.Length,
- MimeType = partitionsMimeType,
- ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
- PartitionNumber = partitionNumber,
- SectionNumber = sectionNumber,
- Tags = pipeline.Tags,
- ContentSHA256 = textData.CalculateSHA256(),
- };
- newFiles.Add(destFile, destFileDetails);
- destFileDetails.MarkProcessedBy(this);
- }
+ var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
+ await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);
- file.MarkProcessedBy(this);
+ var destFileDetails = new DataPipeline.GeneratedFileDetails
+ {
+ Id = Guid.NewGuid().ToString("N"),
+ ParentId = uploadedFile.Id,
+ Name = destFile,
+ Size = text.Length,
+ MimeType = partitionsMimeType,
+ ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
+ PartitionNumber = partitionNumber,
+ SectionNumber = sectionNumber,
+ Tags = pipeline.Tags,
+ ContentSHA256 = textData.CalculateSHA256(),
+ };
+ newFiles.Add(destFile, destFileDetails);
+ destFileDetails.MarkProcessedBy(this);
}
+ file.MarkProcessedBy(this);
+
// Add new files to pipeline status
- foreach (var file in newFiles)
+ foreach (var newFile in newFiles)
{
- uploadedFile.GeneratedFiles.Add(file.Key, file.Value);
+ uploadedFile.GeneratedFiles.Add(newFile.Key, newFile.Value);
}
}
diff --git a/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs
new file mode 100644
index 000000000..7bff2becc
--- /dev/null
+++ b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs
@@ -0,0 +1,882 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Text;
+using Microsoft.KernelMemory.DataFormats.Text;
+
+namespace Microsoft.KM.Core.UnitTests.DataFormats.Text;
+
+public sealed class TextChunker2Tests
+{
+ // Use this as the default chunker, to decouple the test from GPT3 tokenizer
+ private static readonly TextChunker.TokenCounter s_tokenCounter = s => (s.Length >> 2);
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitPlainTextLines()
+    {
+        const string Text = "This is a test of the emergency broadcast system. This is only a test.";
+        string[] expectedLines =
+        [
+            "This is a test of the emergency broadcast system.",
+            "This is only a test."
+        ];
+        // With the 4-chars-per-token counter the 70-char input is 17 tokens, so a 15-token limit cuts it in two.
+        var lines = TextChunker2.SplitPlainTextLines(Text, tag: null, maxTokensPerLine: 15, tokenCounter: s_tokenCounter);
+        string[] actualLines = lines.Select(chunk => chunk.Content).ToArray();
+        Assert.Equal(expectedLines, actualLines);
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitMarkdownParagraphs()
+    {
+        List<TextChunker2.ChunkInfo> input = new()
+        {
+            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
+            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
+        };
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "This is only a test.",
+            "We repeat, this is only a test. A unit test."
+        };
+
+        var expectedTag = new[]
+        {
+            1,
+            1,
+            2
+        };
+        // Each resulting paragraph must keep the tag of the input chunk it originates from.
+        var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitMarkdownParagraphsWithOverlap()
+    {
+        List<TextChunker2.ChunkInfo> input = new()
+        {
+            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
+            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "emergency broadcast system. This is only a test.",
+            "This is only a test. We repeat, this is only a test.",
+            "We repeat, this is only a test. A unit test.",
+            "A unit test."
+        };
+
+        var expectedTag = new[]
+        {
+            1,
+            1,
+            1,
+            2,
+            2
+        };
+        // A paragraph that overlaps into the next chunk keeps the tag of the chunk it started in.
+        var result = TextChunker2.SplitMarkdownParagraphs(input, 15, 8, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphs()
+    {
+        List<TextChunker2.ChunkInfo> input = new()
+        {
+            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
+            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "This is only a test.",
+            "We repeat, this is only a test. A unit test."
+        };
+
+        var expectedTag = new[]
+        {
+            1,
+            1,
+            2
+        };
+        // Each resulting paragraph must keep the tag of the input chunk it originates from.
+        var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsWithOverlap()
+    {
+        List<TextChunker2.ChunkInfo> input = new()
+        {
+            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
+            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "emergency broadcast system. This is only a test.",
+            "This is only a test. We repeat, this is only a test.",
+            "We repeat, this is only a test. A unit test.",
+            "A unit test."
+        };
+
+        var expectedTag = new[]
+        {
+            1,
+            1,
+            1,
+            2,
+            2
+        };
+        // A paragraph that overlaps into the next chunk keeps the tag of the chunk it started in.
+        var result = TextChunker2.SplitPlainTextParagraphs(input, 15, 8, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitMarkDownLines()
+    {
+        const string Input = "This is a test of the emergency broadcast system. This is only a test.";
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "This is only a test."
+        };
+
+        var result = TextChunker2.SplitMarkDownLines(Input, tag: 42, maxTokensPerLine: 15, tokenCounter: s_tokenCounter);
+        // Every line must carry the tag passed in; the check has to be a real assertion, not a discarded bool.
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.All(result, c => Assert.Equal(42, c.Tag));
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsWithEmptyInput()
+    {
+        List<TextChunker2.ChunkInfo> input = new();
+        // An empty list of lines must produce no paragraphs.
+        var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter);
+
+        Assert.Empty(result);
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitMarkdownParagraphsWithEmptyInput()
+    {
+        List<TextChunker2.ChunkInfo> input = new();
+        // An empty list of lines must produce no paragraphs.
+        var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter);
+
+        Assert.Empty(result);
+    }
+
+    // Wraps each string in a ChunkInfo whose tag is the 1-based position of the string in the list,
+    // so the tests can check which input line every output paragraph came from.
+    private List<TextChunker2.ChunkInfo> ConvertToChunkInput(List<string> input)
+    {
+        var chunks = input
+            .Select((text, index) => new TextChunker2.ChunkInfo(text, index + 1))
+            .ToList();
+        return chunks;
+    }
+
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsEvenly()
+    {
+        List<string> input = new()
+        {
+            "This is a test of the emergency broadcast system. This is only a test.",
+            "We repeat, this is only a test. A unit test.",
+            "A small note. And another. And once again. Seriously, this is the end. We're finished. All set. Bye.",
+            "Done."
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "This is only a test.",
+            "We repeat, this is only a test. A unit test.",
+            "A small note. And another. And once again.",
+            "Seriously, this is the end. We're finished. All set. Bye. Done."
+        };
+        // The short trailing "Done." is merged into the previous paragraph, which keeps tag 3.
+        var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+    }
+
+    // A plaintext example that splits on \r or \n
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsOnNewlines()
+    {
+        List<string> input = new()
+        {
+            "This is a test of the emergency broadcast system\r\nThis is only a test",
+            "We repeat this is only a test\nA unit test",
+            "A small note\nAnd another\r\nAnd once again\rSeriously this is the end\nWe're finished\nAll set\nBye\n",
+            "Done"
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system",
+            "This is only a test",
+            "We repeat this is only a test\nA unit test",
+            "A small note\nAnd another\nAnd once again",
+            "Seriously this is the end\nWe're finished\nAll set\nBye Done",
+        };
+
+        var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+    }
+
+    // A plaintext example that splits on ? or !
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsOnPunctuation()
+    {
+        List<string> input = new()
+        {
+            "This is a test of the emergency broadcast system. This is only a test",
+            "We repeat, this is only a test? A unit test",
+            "A small note! And another? And once again! Seriously, this is the end. We're finished. All set. Bye.",
+            "Done."
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system.",
+            "This is only a test",
+            "We repeat, this is only a test? A unit test",
+            "A small note! And another? And once again!",
+            "Seriously, this is the end.",
+            $"We're finished. All set. Bye.{Environment.NewLine}Done.",
+        };
+
+        var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal([1, 1, 2, 3, 3, 3], result.Select(o => o.Tag).Cast<int>());
+    }
+
+    // A plaintext example that splits on ;
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    public void CanSplitTextParagraphsOnSemicolons()
+    {
+        List<string> input = new()
+        {
+            "This is a test of the emergency broadcast system; This is only a test",
+            "We repeat; this is only a test; A unit test",
+            "A small note; And another; And once again; Seriously, this is the end; We're finished; All set; Bye.",
+            "Done."
+        };
+
+        var expected = new[]
+        {
+            "This is a test of the emergency broadcast system;",
+            "This is only a test",
+            "We repeat; this is only a test; A unit test",
+            "A small note; And another; And once again;",
+            "Seriously, this is the end; We're finished; All set; Bye. Done.",
+        };
+
+        var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+    }
+
+ // Plain text, limit 15: inputs that are too long are expected to be cut right after a ':'.
+ // The 4th input ("Done.") is expected to be appended to the last chunk after a space; that chunk carries tag 3.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsOnColons()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system: This is only a test",
+         "We repeat: this is only a test: A unit test",
+         "A small note: And another: And once again: Seriously, this is the end: We're finished: All set: Bye.",
+         "Done."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system:",
+         "This is only a test",
+         "We repeat: this is only a test: A unit test",
+         "A small note: And another: And once again:",
+         "Seriously, this is the end: We're finished: All set: Bye. Done.",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain text, limit 15: inputs that are too long are expected to be cut right after a ','.
+ // The 4th input ("Done.") is expected to join the last chunk via Environment.NewLine; that chunk carries tag 3.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsOnCommas()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system, This is only a test",
+         "We repeat, this is only a test, A unit test",
+         "A small note, And another, And once again, Seriously, this is the end, We're finished, All set, Bye.",
+         "Done."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system,",
+         "This is only a test",
+         "We repeat, this is only a test, A unit test",
+         "A small note, And another, And once again, Seriously,",
+         $"this is the end, We're finished, All set, Bye.{Environment.NewLine}Done.",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain text, limit 15: inputs that are too long are expected to be cut right after a closing bracket.
+ // The 4th input ("Done.") is expected to be appended to the last chunk after a space; that chunk carries tag 3.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsOnClosingBrackets()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system) This is only a test",
+         "We repeat) this is only a test) A unit test",
+         "A small note] And another) And once again] Seriously this is the end} We're finished} All set} Bye.",
+         "Done."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system)",
+         "This is only a test",
+         "We repeat) this is only a test) A unit test",
+         "A small note] And another) And once again]",
+         "Seriously this is the end} We're finished} All set} Bye. Done.",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain text, limit 15, words separated only by spaces: cuts are expected after "emergency" and "Seriously".
+ // The 4th input ("Done.") is expected to join the last chunk via Environment.NewLine; that chunk carries tag 3.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsOnSpaces()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system This is only a test",
+         "We repeat this is only a test A unit test",
+         "A small note And another And once again Seriously this is the end We're finished All set Bye.",
+         "Done."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency",
+         "broadcast system This is only a test",
+         "We repeat this is only a test A unit test",
+         "A small note And another And once again Seriously",
+         $"this is the end We're finished All set Bye.{Environment.NewLine}Done.",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain text, limit 15, phrases joined with '-': note that no expected cut falls at a hyphen;
+ // the expected cuts come after "emergency" (a space) and after "Seriously," (a comma). Last chunk carries tag 3.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsOnHyphens()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system-This is only a test",
+         "We repeat-this is only a test-A unit test",
+         "A small note-And another-And once again-Seriously, this is the end-We're finished-All set-Bye.",
+         "Done."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency",
+         "broadcast system-This is only a test",
+         "We repeat-this is only a test-A unit test",
+         "A small note-And another-And once again-Seriously,",
+         $"this is the end-We're finished-All set-Bye.{Environment.NewLine}Done.",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain text without any separator character, limit 15: the first two inputs are expected to merge into one
+ // chunk (tag 1), and the over-long last input is expected to be cut mid-word into two chunks (both tag 5).
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithNoDelimiters()
+ {
+     List<string> input = new()
+     {
+         "Thisisatestoftheemergencybroadcastsystem",
+         "Thisisonlyatest",
+         "WerepeatthisisonlyatestAunittest",
+         "AsmallnoteAndanotherAndonceagain",
+         "SeriouslythisistheendWe'refinishedAllsetByeDoneThisOneWillBeSplitToMeetTheLimit",
+     };
+
+     var expected = new[]
+     {
+         $"Thisisatestoftheemergencybroadcastsystem{Environment.NewLine}Thisisonlyatest",
+         "WerepeatthisisonlyatestAunittest",
+         "AsmallnoteAndanotherAndonceagain",
+         "SeriouslythisistheendWe'refinishedAllse",
+         "tByeDoneThisOneWillBeSplitToMeetTheLimit",
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 3, 4, 5, 5], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // TODO: a markdown example that splits on .
+
+ // TODO: a markdown example that splits on ? or !
+
+ // TODO: a markdown example that splits on ;
+
+ // TODO: a markdown example that splits on :
+
+ // TODO: a markdown example that splits on ,
+
+ // TODO: a markdown example that splits on ) or ] or }
+
+ // TODO: a markdown example that splits on ' '
+
+ // TODO: a markdown example that splits on '-'
+
+ // Markdown, limit 15: words are joined with '_' and the expected cuts fall at '\r' / '\n' line breaks.
+ // Inside a chunk "\r\n" is expected to come back as "\n"; the 4th input ("Done") is appended after a space.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsOnNewlines()
+ {
+     List<string> input = new()
+     {
+         "This_is_a_test_of_the_emergency_broadcast_system\r\nThis_is_only_a_test",
+         "We_repeat_this_is_only_a_test\nA_unit_test",
+         "A_small_note\nAnd_another\r\nAnd_once_again\rSeriously_this_is_the_end\nWe're_finished\nAll_set\nBye\n",
+         "Done"
+     };
+
+     var expected = new[]
+     {
+         "This_is_a_test_of_the_emergency_broadcast_system",
+         "This_is_only_a_test",
+         "We_repeat_this_is_only_a_test\nA_unit_test",
+         "A_small_note\nAnd_another\nAnd_once_again",
+         "Seriously_this_is_the_end\nWe're_finished\nAll_set\nBye Done",
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // TODO: a markdown example that does not have any of the above characters (the test below covers a very large plain-text input instead)
+ // Stress test: 100,000 pseudo-random words (fixed seed) are split into lines, then paragraphs; only non-emptiness is asserted.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()
+ {
+#pragma warning disable CA5394 // this test relies on repeatable pseudo-random numbers
+     var rand = new Random(42); // fixed seed keeps the generated document identical between runs
+     var sb = new StringBuilder(100_000 * 11);
+     for (int wordNum = 0; wordNum < 100_000; wordNum++)
+     {
+         int wordLength = rand.Next(1, 10); // upper bound is exclusive: 1 to 9 letters
+         for (int charNum = 0; charNum < wordLength; charNum++)
+         {
+             sb.Append((char)('a' + rand.Next(0, 26))); // random letter 'a'..'z'
+         }
+
+         sb.Append(' ');
+     }
+
+     string text = sb.ToString();
+     var lines = TextChunker2.SplitPlainTextLines(text, tag: 42, 20, tokenCounter: s_tokenCounter);
+     var paragraphs = TextChunker2.SplitPlainTextParagraphs(lines, 200, tokenCounter: s_tokenCounter);
+     Assert.NotEmpty(paragraphs);
+#pragma warning restore CA5394
+ }
+
+ // Plain-text lines, string length as token counter, limit 60: the input is expected to split into its two sentences, both tagged 42.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitPlainTextLinesWithCustomTokenCounter()
+ {
+     const string input = "This is a test of the emergency broadcast system. This is only a test.";
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "This is only a test."
+     };
+
+     var result = TextChunker2.SplitPlainTextLines(input, tag: 42, 60, s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([42, 42], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, string length as token counter, limit 52: the first input is expected to split into its two sentences, the second to stay whole.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithCustomTokenCounter()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, string length as token counter, limit 75 and overlap 40: every chunk after the first is expected to start with the tail of the previous one.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithOverlapAndCustomTokenCounter()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "emergency broadcast system. This is only a test.",
+         "This is only a test. We repeat, this is only a test.",
+         "We repeat, this is only a test. A unit test.",
+         "A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, string length as token counter, limit 52: the first input is expected to split into its two sentences, the second to stay whole.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithCustomTokenCounter()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, string length as token counter, limit 75 and overlap 40: every chunk after the first is expected to start with the tail of the previous one.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter()
+ {
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "emergency broadcast system. This is only a test.",
+         "This is only a test. We repeat, this is only a test.",
+         "We repeat, this is only a test. A unit test.",
+         "A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown lines, string length as token counter, limit 60: the input is expected to split into its two sentences, both tagged 42.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkDownLinesWithCustomTokenCounter()
+ {
+     const string input = "This is a test of the emergency broadcast system. This is only a test.";
+     var expected = new[]
+     {
+         "This is a test of the emergency broadcast system.",
+         "This is only a test."
+     };
+
+     var result = TextChunker2.SplitMarkDownLines(input, tag: 42, 60, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([42, 42], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, limit 20, with a chunk header: every expected chunk is the header text followed by the chunk content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithHeader()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}This is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, limit 22 and overlap 8, with a chunk header: each expected chunk is the header followed by overlapping content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithOverlapAndHeader()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}emergency broadcast system. This is only a test.",
+         $"{ChunkHeader}This is only a test. We repeat, this is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test.",
+         $"{ChunkHeader}A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, limit 20, with a chunk header: every expected chunk is the header text followed by the chunk content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithHeader()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}This is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, limit 22 and overlap 8, with a chunk header: each expected chunk is the header followed by overlapping content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithOverlapAndHeader()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}emergency broadcast system. This is only a test.",
+         $"{ChunkHeader}This is only a test. We repeat, this is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test.",
+         $"{ChunkHeader}A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, string length as token counter, limit 77, with a chunk header: every expected chunk is the header followed by the content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithHeaderAndCustomTokenCounter()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}This is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Markdown paragraphs, string length as token counter, limit 100 and overlap 40, with a chunk header prefixed to every expected chunk.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitMarkdownParagraphsWithOverlapAndHeaderAndCustomTokenCounter()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}emergency broadcast system. This is only a test.",
+         $"{ChunkHeader}This is only a test. We repeat, this is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test.",
+         $"{ChunkHeader}A unit test."
+     };
+
+     var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, string length as token counter, limit 77, with a chunk header: every expected chunk is the header followed by the content.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithHeaderAndCustomTokenCounter()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}This is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+
+ // Plain-text paragraphs, string length as token counter, limit 100 and overlap 40, with a chunk header prefixed to every expected chunk.
+ [Fact, Trait("Category", "UnitTest")]
+ public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter()
+ {
+     const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n";
+     List<string> input = new()
+     {
+         "This is a test of the emergency broadcast system. This is only a test.",
+         "We repeat, this is only a test. A unit test."
+     };
+
+     var expected = new[]
+     {
+         $"{ChunkHeader}This is a test of the emergency broadcast system.",
+         $"{ChunkHeader}emergency broadcast system. This is only a test.",
+         $"{ChunkHeader}This is only a test. We repeat, this is only a test.",
+         $"{ChunkHeader}We repeat, this is only a test. A unit test.",
+         $"{ChunkHeader}A unit test."
+     };
+
+     var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length);
+
+     Assert.Equal(expected, result.Select(o => o.Content).ToArray());
+     Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast<int>());
+ }
+}