Skip to content

Commit

Permalink
fix splitter length missed (langgenius#7987)
Browse files (browse the repository at this point in the history)
  • Loading branch information
JohnJyong authored and JunXu01 committed Nov 9, 2024
1 parent 0a2e832 commit 4a05a4d
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
10 changes: 7 additions & 3 deletions api/core/rag/splitter/fixed_text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,21 @@ def recursive_split_text(self, text: str) -> list[str]:
splits = list(text)
# Now go merging things, recursively splitting longer texts.
_good_splits = []
_good_splits_lengths = [] # cache the lengths of the splits
for s in splits:
if self._length_function(s) < self._chunk_size:
s_len = self._length_function(s)
if s_len < self._chunk_size:
_good_splits.append(s)
_good_splits_lengths.append(s_len)
else:
if _good_splits:
merged_text = self._merge_splits(_good_splits, separator)
merged_text = self._merge_splits(_good_splits, separator, _good_splits_lengths)
final_chunks.extend(merged_text)
_good_splits = []
_good_splits_lengths = []
other_info = self.recursive_split_text(s)
final_chunks.extend(other_info)
if _good_splits:
merged_text = self._merge_splits(_good_splits, separator)
merged_text = self._merge_splits(_good_splits, separator, _good_splits_lengths)
final_chunks.extend(merged_text)
return final_chunks
5 changes: 4 additions & 1 deletion api/core/rag/splitter/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,10 @@ def split_text(self, text: str) -> list[str]:
# First we naively split the large input into a bunch of smaller ones.
splits = _split_text_with_regex(text, self._separator, self._keep_separator)
_separator = "" if self._keep_separator else self._separator
return self._merge_splits(splits, _separator)
_good_splits_lengths = [] # cache the lengths of the splits
for split in splits:
_good_splits_lengths.append(self._length_function(split))
return self._merge_splits(splits, _separator, _good_splits_lengths)


class LineType(TypedDict):
Expand Down

0 comments on commit 4a05a4d

Please sign in to comment.