github · hendrikvanantwerpen · Oct 8, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bpe-openai"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2021"
 description = "Prebuilt fast byte-pair encoders for OpenAI."
 repository = "https://github.com/github/rust-gems"
@@ -17,6 +17,10 @@ bpe = { version = "0.1.0", path = "../bpe" }
 rmp-serde = "1"
 serde = { version = "1" }
 
+[dev-dependencies]
+fancy-regex = "0.13"
+tiktoken-rs = { version = "0.5" }
+
 [build-dependencies]
 bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
 rmp-serde = "1"

diff --git a/crates/bpe-openai/README.md b/crates/bpe-openai/README.md
@@ -12,6 +12,10 @@ Supported token sets:
 - cl100k
 - o200k
 
+> **⚠ CAUTION ⚠**
+> This crate does not implement the regex-based input splitting that tiktoken applies before byte-pair encoding.
+> As a result, the tokens produced by this crate may differ from those produced by tiktoken.
+
 ## Usage
 
 Add a dependency by running

diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
@@ -42,6 +42,8 @@ pub fn o200k() -> &'static BytePairEncoding {
 
 #[cfg(test)]
 mod tests {
+    use tiktoken_rs::cl100k_base_singleton;
+
     use super::*;
 
     #[test]
@@ -63,4 +65,33 @@ mod tests {
     fn can_load_o200k() {
         o200k().count("".as_bytes());
     }
+
+    /// Shows that our tokens differ from tiktoken's unless the input is first split into pieces with a regex.
+    #[test]
+    fn splitting_difference() {
+        let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t    Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
+        // Reference tokens as produced by tiktoken-rs; ids are narrowed with a checked conversion.
+        let expected: Vec<_> = cl100k_base_singleton()
+            .lock()
+            .encode_ordinary(text)
+            .into_iter()
+            .map(|i| u32::try_from(i).expect("token id does not fit in u32"))
+            .collect();
+
+        // Encoding the whole input in one go must NOT reproduce the reference tokens for this text.
+        let without_splitting = BPE_CL100K.encode_via_backtracking(text.as_bytes());
+        assert_ne!(without_splitting, expected);
+
+        // The pattern contains a negative look-ahead (`(?!\S)`), hence fancy-regex.
+        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+        let re = fancy_regex::Regex::new(pat).expect("splitting pattern failed to compile");
+        let with_splitting: Vec<_> = re
+            .find_iter(text)
+            .flat_map(|piece| {
+                let piece = piece.expect("regex matching failed on test input");
+                BPE_CL100K.encode_via_backtracking(piece.as_str().as_bytes())
+            })
+            .collect();
+        assert_eq!(with_splitting, expected);
+    }
 }