Skip to content

Commit

Permalink
Merge pull request #42 from github/aneubeck/count_till_limit
Browse files Browse the repository at this point in the history
fix count_till_limit function
  • Loading branch information
aneubeck authored Dec 6, 2024
2 parents 6e03fd0 + c0a3cb7 commit 3519bff
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
@@ -92,12 +92,11 @@ impl Tokenizer {
/// Otherwise, it returns none. This function can be faster than [`Self::count`]` when the
/// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
self.split(text)
.try_fold(token_limit, |token_limit, piece| {
self.bpe
.count_till_limit(piece.as_bytes(), token_limit)
.map(|piece_count| token_limit - piece_count)
})
self.split(text).try_fold(0, |consumed, piece| {
self.bpe
.count_till_limit(piece.as_bytes(), token_limit - consumed)
.map(|piece_count| consumed + piece_count)
})
}

/// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
@@ -231,4 +230,12 @@ mod tests {
}
}
}

#[test]
fn test_count_till_limit() {
    // The expected values imply each "abc" repetition encodes to one cl100k
    // token: with a limit of 3, up to three repetitions fit and a fourth
    // pushes the count over the limit, yielding None.
    let cases = [(1, Some(1)), (2, Some(2)), (3, Some(3)), (4, None)];
    for (repeats, expected) in cases {
        let text = "abc".repeat(repeats);
        assert_eq!(cl100k_base().count_till_limit(&text, 3), expected);
    }
}
}

0 comments on commit 3519bff

Please sign in to comment.