Skip to content

Commit

Permalink
Merge pull request #562 from solliancenet/cj-fix-vectorization-bug
Browse files Browse the repository at this point in the history
Fix bug with text splitting based on tokens
  • Loading branch information
ciprianjichici authored Feb 6, 2024
2 parents ded9df7 + 05aabff commit 042ce9b
Showing 1 changed file with 2 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ public class TokenTextSplitterService(
.Select(t => _tokenizerService.Decode(t, _settings.TokenizerEncoder))
.ToList();

var lastChunkStart = (chunksCount - 1) * _settings.ChunkSizeTokens;
var lastChunkStart = (chunksCount - 1) * (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens);
var lastChunkSize = tokens.Count - lastChunkStart + 1;
var resultMessage = string.Empty;

if (lastChunkSize < 2 * _settings.OverlapSizeTokens)
{
// The last chunk is too small, so just incorporate it into the second-to-last one.
var secondToLastChunkStart = (chunksCount - 2) * _settings.ChunkSizeTokens;
var secondToLastChunkStart = (chunksCount - 2) * (_settings.ChunkSizeTokens - _settings.OverlapSizeTokens);
var newLastChunkSize = tokens.Count - secondToLastChunkStart + 1;
var newLastChunk = _tokenizerService.Decode(
tokens
Expand Down

0 comments on commit 042ce9b

Please sign in to comment.