From 79f0a02eefc2639d3b1c0e205392d312e7c176b0 Mon Sep 17 00:00:00 2001
From: Hendrik van Antwerpen
Date: Thu, 3 Oct 2024 18:47:36 +0200
Subject: [PATCH 1/2] Add correctness test for o200k

---
 crates/bpe/src/byte_pair_encoding.rs | 31 ++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs
index 72fa946..1f278bf 100644
--- a/crates/bpe/src/byte_pair_encoding.rs
+++ b/crates/bpe/src/byte_pair_encoding.rs
@@ -567,12 +567,39 @@ mod tests {
     use std::time::Instant;
 
     use itertools::Itertools;
-    use tiktoken_rs::cl100k_base_singleton;
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton};
 
     use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
 
     #[test]
-    fn test_correctness() {
+    fn test_correctness_o200k() {
+        // This is quite a challenging test case...
+        let test_string = std::str::from_utf8(&[
+            125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
+            112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
+            69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
+            102, 102, 101, 110, 100,
+        ])
+        .unwrap();
+        let time = Instant::now();
+        let bpe = BytePairEncoding::o200k();
+        println!("{:?}", time.elapsed());
+        let encoded1 = o200k_base_singleton()
+            .lock()
+            .encode_ordinary(test_string)
+            .into_iter()
+            .map(|t| t as u32)
+            .collect_vec();
+        let encoded2 = bpe.encode_via_backtracking(test_string.as_bytes());
+        assert_eq!(encoded1, encoded2);
+        let encoded3 = bpe.encode_via_table(test_string.as_bytes());
+        assert_eq!(encoded1, encoded3);
+        let encoded4 = bpe.encode_via_bitfield(test_string.as_bytes());
+        assert_eq!(encoded1, encoded4);
+    }
+
+    #[test]
+    fn test_correctness_cl100k() {
         // This is quite a challenging test case...
         let test_string = std::str::from_utf8(&[
             125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,

From af3f23c89b3281c0cef47f221cd7083770eecf31 Mon Sep 17 00:00:00 2001
From: Hendrik van Antwerpen
Date: Thu, 3 Oct 2024 18:48:31 +0200
Subject: [PATCH 2/2] Add worst-case benchmark and add results to README

---
 crates/bpe/README.md                          |  6 ++
 crates/bpe/benches/performance.rs             | 41 +++++++++-
 crates/bpe/benches/result/appending-o200k.svg | 20 ++---
 crates/bpe/benches/result/counting-o200k.svg  | 20 ++---
 crates/bpe/benches/result/encoding-o200k.svg  | 60 +++++++--------
 crates/bpe/benches/result/worstcase-o200k.svg | 77 +++++++++++++++++++
 crates/bpe/script/copy-benchmark-results      |  2 +-
 7 files changed, 173 insertions(+), 53 deletions(-)
 create mode 100644 crates/bpe/benches/result/worstcase-o200k.svg

diff --git a/crates/bpe/README.md b/crates/bpe/README.md
index 0cd4c58..a43c56c 100644
--- a/crates/bpe/README.md
+++ b/crates/bpe/README.md
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac
 
 ![encoding runtime comparison](./benches/result/encoding-o200k.svg)
 
+The graph below shows encoding results for input that is particularly challenging for tiktoken.
+The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
+This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.
+
+![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
+
 ### Incremental encoding
 
 Incremental encoding tokenizes a text while appending bytes.
diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benches/performance.rs
index 4cff09c..9fe2704 100644
--- a/crates/bpe/benches/performance.rs
+++ b/crates/bpe/benches/performance.rs
@@ -160,6 +160,31 @@ fn appending_benchmark(c: &mut Criterion) {
     }
 }
 
+fn worstcase_benchmark(c: &mut Criterion) {
+    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
+        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
+        let input = text.as_bytes();
+
+        let mut group = c.benchmark_group(format!("worstcase-{name}"));
+        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
+            group.throughput(criterion::Throughput::Bytes(bytes as u64));
+            group.bench_with_input(
+                BenchmarkId::new("backtracking", bytes),
+                &bytes,
+                |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))),
+            );
+            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
+                b.iter_batched(
+                    || select_test_bytes(input, *bytes),
+                    |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        group.finish();
+    }
+}
+
 fn is_char_boundary(b: u8) -> bool {
     // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
     // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
@@ -188,12 +213,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
     text
 }
 
+fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
+    let mut start = thread_rng().gen_range(0..input.len() - bytes);
+    while start > 0 && !is_char_boundary(input[start]) {
+        start -= 1;
+    }
+    let mut end = start + bytes;
+    while end < input.len() && !is_char_boundary(input[end]) {
+        end += 1;
+    }
+    &input[start..end]
+}
+
 criterion_group!(
     name = benches;
     config = Criterion::default()
         .warm_up_time(Duration::from_millis(500))
-        .measurement_time(Duration::from_millis(1000))
+        .measurement_time(Duration::from_millis(4000))
         .nresamples(1000);
-    targets = counting_benchmark, encoding_benchmark, appending_benchmark
+    targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark
 );
 criterion_main!(benches);
diff --git a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/benches/result/appending-o200k.svg
index f358527..5474718 100644
--- a/crates/bpe/benches/result/appending-o200k.svg
+++ b/crates/bpe/benches/result/appending-o200k.svg
@@ -34,17 +34,17 @@
[... updated SVG plot markup not shown ...]
diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/benches/result/counting-o200k.svg
index deaf497..9b93d5f 100644
--- a/crates/bpe/benches/result/counting-o200k.svg
+++ b/crates/bpe/benches/result/counting-o200k.svg
@@ -30,17 +30,17 @@
[... updated SVG plot markup not shown ...]
diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/benches/result/encoding-o200k.svg
index 468755c..d0ffc09 100644
--- a/crates/bpe/benches/result/encoding-o200k.svg
+++ b/crates/bpe/benches/result/encoding-o200k.svg
@@ -34,41 +34,41 @@
[... updated SVG plot markup not shown ...]
diff --git a/crates/bpe/benches/result/worstcase-o200k.svg b/crates/bpe/benches/result/worstcase-o200k.svg
new file mode 100644
index 0000000..7da8fca
--- /dev/null
+++ b/crates/bpe/benches/result/worstcase-o200k.svg
@@ -0,0 +1,77 @@
[... new SVG plot markup (77 lines) not shown ...]
diff --git a/crates/bpe/script/copy-benchmark-results b/crates/bpe/script/copy-benchmark-results
index df9e97f..ae045ed 100755
--- a/crates/bpe/script/copy-benchmark-results
+++ b/crates/bpe/script/copy-benchmark-results
@@ -6,6 +6,6 @@ result_dir="benches/result"
 
 mkdir -p "$result_dir"
 
-for i in {counting,encoding,appending}-o200k; do
+for i in {counting,encoding,appending,worstcase}-o200k; do
     rsvg-convert --format svg --output "$result_dir/$i.svg" --background-color white "target/criterion/reports/$i/lines.svg"
 done
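
Usage sketch (not part of the patch series above): assuming a standard Rust toolchain, the crates/bpe working directory, and librsvg's rsvg-convert on the PATH, the new test, the new benchmark group, and the updated result-copying script could be exercised roughly like this:

    cd crates/bpe
    cargo test test_correctness_o200k   # run the newly added o200k correctness test
    cargo bench -- worstcase            # criterion filter: run only the new worst-case benchmark groups
    script/copy-benchmark-results       # refresh benches/result/*.svg, now including worstcase-o200k.svg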