github · hendrikvanantwerpen · Oct 4, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/crates/bpe/README.md b/crates/bpe/README.md
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac
 
 ![encoding runtime comparison](./benches/result/encoding-o200k.svg)
 
+The graph below shows encoding results for input that is particularly challenging for tiktoken.
+The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
+This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.
+
+![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
+
 ### Incremental encoding
 
 Incremental encoding tokenizes a text while appending bytes.

diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benches/performance.rs
@@ -160,6 +160,31 @@ fn appending_benchmark(c: &mut Criterion) {
     }
 }
 
+/// Benchmarks encoding of input that is particularly challenging for tiktoken.
+///
+/// The corpus is every non-whitespace `char` below `char::MAX`, concatenated in code point
+/// order. For each requested size a random window of that corpus is encoded with the
+/// backtracking encoder and with tiktoken, and both are reported in the same
+/// `worstcase-{name}` benchmark group.
+fn worstcase_benchmark(c: &mut Criterion) {
+    // The corpus does not depend on the tokenizer, so build it once instead of per iteration.
+    let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
+    let input = text.as_bytes();
+
+    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
+        let mut group = c.benchmark_group(format!("worstcase-{name}"));
+        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
+            group.throughput(criterion::Throughput::Bytes(bytes as u64));
+            group.bench_with_input(
+                BenchmarkId::new("backtracking", bytes),
+                &bytes,
+                |b, bytes| {
+                    // Sample the window in the untimed setup phase, exactly like the tiktoken
+                    // measurement below, so that both timings cover only the encoding itself.
+                    b.iter_batched(
+                        || select_test_bytes(input, *bytes),
+                        |sample| bpe.encode_via_backtracking(sample),
+                        criterion::BatchSize::SmallInput,
+                    )
+                },
+            );
+            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
+                b.iter_batched(
+                    || select_test_bytes(input, *bytes),
+                    // The window starts and ends on character boundaries of a valid `String`,
+                    // so the UTF-8 conversion is expected to succeed.
+                    |sample| tiktoken.encode_ordinary(std::str::from_utf8(sample).unwrap()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        group.finish();
+    }
+}
+
 fn is_char_boundary(b: u8) -> bool {
     // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
     // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
@@ -188,12 +213,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
     text
 }
 
+/// Picks a random window of at least `bytes` bytes from `input`.
+///
+/// A random start offset is moved backwards until it is 0 or `is_char_boundary` accepts the
+/// byte there; the end offset (`start + bytes`) is moved forwards until it reaches the end of
+/// `input` or a byte accepted by `is_char_boundary`.
+///
+/// When `input` holds no more than `bytes` bytes the whole input is returned, because there is
+/// no valid range of start offsets to sample from in that case.
+fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
+    // Guard: `input.len() - bytes` would underflow, or yield an empty range that `gen_range`
+    // rejects, when the request is at least as large as the input.
+    if bytes >= input.len() {
+        return input;
+    }
+    let mut start = thread_rng().gen_range(0..input.len() - bytes);
+    while start > 0 && !is_char_boundary(input[start]) {
+        start -= 1;
+    }
+    let mut end = start + bytes;
+    while end < input.len() && !is_char_boundary(input[end]) {
+        end += 1;
+    }
+    &input[start..end]
+}
+
 criterion_group!(
     name = benches;
     config = Criterion::default()
                 .warm_up_time(Duration::from_millis(500))
-                .measurement_time(Duration::from_millis(1000))
+                .measurement_time(Duration::from_millis(4000)) // measure each benchmark for 4 s
                 .nresamples(1000);
-    targets = counting_benchmark, encoding_benchmark, appending_benchmark
+    targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark // worstcase_benchmark runs alongside the existing groups
 );
 criterion_main!(benches);
diff --git a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/benches/result/appending-o200k.svg
diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/benches/result/counting-o200k.svg
diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/benches/result/encoding-o200k.svg
diff --git a/crates/bpe/benches/result/worstcase-o200k.svg b/crates/bpe/benches/result/worstcase-o200k.svg
diff --git a/crates/bpe/script/copy-benchmark-results b/crates/bpe/script/copy-benchmark-results
@@ -6,6 +6,6 @@ result_dir="benches/result"
 
 mkdir -p "$result_dir"
 
-for i in {counting,encoding,appending}-o200k; do
+for i in {counting,encoding,appending,worstcase}-o200k; do
     rsvg-convert --format svg --output "$result_dir/$i.svg" --background-color white "target/criterion/reports/$i/lines.svg" # render each Criterion lines.svg report into $result_dir on a white background
 done
diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs
@@ -567,12 +567,39 @@ mod tests {
     use std::time::Instant;
 
     use itertools::Itertools;
-    use tiktoken_rs::cl100k_base_singleton;
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton};
 
     use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
 
     #[test]
-    fn test_correctness() {
+    fn test_correctness_cl100k() {
+        // This is quite a challenging test case...
+        let test_string = std::str::from_utf8(&[
+            125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
+            112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
+            69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
+            102, 102, 101, 110, 100,
+        ])
+        .unwrap();
+        let time = Instant::now();
+        let bpe = BytePairEncoding::o200k(); // NOTE(review): o200k encoder inside a test named cl100k — confirm which is intended
+        println!("{:?}", time.elapsed()); // prints how long constructing the encoder took
+        let encoded1 = o200k_base_singleton() // reference tokenization produced by tiktoken_rs
+            .lock()
+            .encode_ordinary(test_string)
+            .into_iter()
+            .map(|t| t as u32)
+            .collect_vec();
+        let encoded2 = bpe.encode_via_backtracking(test_string.as_bytes()); // each strategy below must match the reference
+        assert_eq!(encoded1, encoded2);
+        let encoded3 = bpe.encode_via_table(test_string.as_bytes());
+        assert_eq!(encoded1, encoded3);
+        let encoded4 = bpe.encode_via_bitfield(test_string.as_bytes());
+        assert_eq!(encoded1, encoded4);
+    }
+
+    #[test]
+    fn test_correctness_o200k() {
         // This is quite a challenging test case...
         let test_string = std::str::from_utf8(&[
             125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,