Reorganize benchmark to include fairer comparisons #27

Merged · 14 commits · Oct 14, 2024
Changes from 8 commits
1 change: 1 addition & 0 deletions Cargo.toml
@@ -2,6 +2,7 @@

 members = [
     "crates/*",
+    "crates/bpe/benchmarks",
 ]
 resolver = "2"
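(Aside: the explicit `crates/bpe/benchmarks` entry is needed because Cargo's `crates/*` glob only matches direct subdirectories of `crates/`, so the nested benchmarks crate would not be picked up otherwise.)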
3 changes: 2 additions & 1 deletion crates/bpe-openai/Cargo.toml
@@ -14,11 +14,12 @@ bench = false

 [dependencies]
 bpe = { version = "0.1.0", path = "../bpe" }
+either = "1.13"
+fancy-regex = "0.13"
 rmp-serde = "1"
 serde = { version = "1" }

 [dev-dependencies]
-fancy-regex = "0.13"
 tiktoken-rs = { version = "0.5" }

 [build-dependencies]
6 changes: 1 addition & 5 deletions crates/bpe-openai/README.md
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime.
 The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
 For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.

-Supported token sets:
+Supported tokenizers:

 - r50k
 - p50k
 - cl100k
 - o200k

-> **⚠ CAUTION ⚠**
-> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
-> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
-
 ## Usage

 Add a dependency by running
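To make the README's description concrete, here is a minimal usage sketch based on the API this PR introduces (the `cl100k` accessor and the `count`/`encode`/`decode` methods appear in the lib.rs diff below; the surrounding details are assumptions, not part of the PR):

```rust
// Minimal sketch, assuming the bpe-openai crate as changed in this PR.
fn main() {
    // Loading is lazy and happens once per process: the first access
    // deserializes the pre-built BPE data behind a LazyLock.
    let tok = bpe_openai::cl100k();

    // Count tokens without materializing the token vector.
    let count = tok.count("Hello, world!");

    // Encode splits the input with the tiktoken regex, then
    // byte-pair encodes each piece.
    let tokens = tok.encode("Hello, world!");
    assert_eq!(count, tokens.len());

    // Decode returns None if the tokens do not decode to valid UTF-8.
    assert_eq!(tok.decode(&tokens).as_deref(), Some("Hello, world!"));
}
```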
109 changes: 80 additions & 29 deletions crates/bpe-openai/src/lib.rs
@@ -1,42 +1,103 @@
 use std::sync::LazyLock;

 use bpe::byte_pair_encoding::BytePairEncoding;
+use either::Either;
+use fancy_regex::Regex;

-static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

-static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

-static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

-static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = [
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ].join("|");
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });

 pub use bpe::*;

-pub fn r50k() -> &'static BytePairEncoding {
+pub struct Tokenizer {
+    /// The byte-pair encoding for this tokenizer.
+    pub bpe: BytePairEncoding,
+    /// The pattern regex used to split the input.
+    pub pat: Option<Regex>,
+}
+
+impl Tokenizer {
+    #[allow(clippy::result_large_err)]
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {

[Review thread on `Tokenizer::new`]
Collaborator: Question: did you test different regex libraries? Is this the fastest?

hendrikvanantwerpen (author), Oct 10, 2024: I didn't; this is the same library tiktoken uses. The regex uses negative lookahead, though, which isn't supported by many libraries. The internet typically recommends this crate for regexes that use that.

hendrikvanantwerpen (author): Looks like someone has a PR on tiktoken to get rid of fancy-regex, but at the expense of pushing some of that logic into the code. I wonder how complex the state machine for these regexes is. Perhaps not too complex if you can reuse the regex logic for the character classes?

+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
+        Ok(Self { bpe, pat })
+    }
+
+    pub fn count(&self, text: &str) -> usize {
+        self.split(text)
+            .map(|piece| self.bpe.count(piece.as_bytes()))
+            .sum()
+    }
+
+    pub fn encode(&self, text: &str) -> Vec<u32> {
+        self.split(text)
+            .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
+            .collect()
+    }
+
+    pub fn decode(&self, tokens: &[u32]) -> Option<String> {
+        String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        match &self.pat {
+            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
+                let m = m.expect("match succeeded");
+                assert_eq!(*start, m.start(), "pattern should match all input text");
+                *start = m.end();
+                Some(m.as_str())
+            })),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}

+pub fn r50k() -> &'static Tokenizer {
     &BPE_R50K
 }

-pub fn p50k() -> &'static BytePairEncoding {
+pub fn p50k() -> &'static Tokenizer {
     &BPE_P50K
 }

-pub fn cl100k() -> &'static BytePairEncoding {
+pub fn cl100k() -> &'static Tokenizer {
     &BPE_CL100K
 }

-pub fn o200k() -> &'static BytePairEncoding {
+pub fn o200k() -> &'static Tokenizer {
     &BPE_O200K
 }

@@ -48,25 +48,25 @@ mod tests {

     #[test]
     fn can_load_r50k() {
-        r50k().count("".as_bytes());
+        r50k().count("");
     }

     #[test]
     fn can_load_p50k() {
-        p50k().count("".as_bytes());
+        p50k().count("");
     }

     #[test]
     fn can_load_cl100k() {
-        cl100k().count("".as_bytes());
+        cl100k().count("");
     }

     #[test]
     fn can_load_o200k() {
-        o200k().count("".as_bytes());
+        o200k().count("");
     }

-    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    /// Test demonstrating a case where input splitting makes a difference.
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +78,10 @@
             .map(|i| i as u32)
             .collect();

-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);

-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-        let re = fancy_regex::Regex::new(pat).unwrap();
-        println!("{}", re.find_iter(text).count());
-        let with_splitting: Vec<_> = re
-            .find_iter(text)
-            .flat_map(|piece| {
-                BPE_CL100K
-                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
-                    .into_iter()
-            })
-            .collect();
+        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
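As an aside on the review thread above, here is a small self-contained sketch (not part of this PR) of the negative-lookahead point: the splitting patterns use `\s+(?!\S)`, which the plain `regex` crate rejects at pattern-compile time because it supports no lookaround, while `fancy-regex` compiles and matches it:

```rust
// Sketch only; assumes the fancy-regex and regex crates are available
// (e.g. as dev-dependencies). Not part of this PR's diff.
fn main() {
    let pat = r"\s+(?!\S)";

    // The regex crate deliberately supports no lookaround,
    // so compiling this pattern fails.
    assert!(regex::Regex::new(pat).is_err());

    // fancy-regex handles the lookahead: match a whitespace run that is
    // not followed by non-whitespace, i.e. trailing whitespace.
    let re = fancy_regex::Regex::new(pat).expect("valid pattern");
    let m = re.find("word   ").expect("no match error").expect("match");
    assert_eq!(m.as_str(), "   ");
}
```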
7 changes: 0 additions & 7 deletions crates/bpe/Cargo.toml
@@ -12,12 +12,6 @@ categories = ["algorithms", "data-structures", "encoding", "science"]
 crate-type = ["lib", "staticlib"]
 bench = false

-[[bench]]
-name = "performance"
-path = "benches/performance.rs"
-harness = false
-test = false
-
 [features]
 rand = ["dep:rand"]
 tiktoken-rs = ["dep:tiktoken-rs"]
@@ -33,4 +27,3 @@ tiktoken-rs = { version = "0.5", optional = true }

 [dev-dependencies]
 bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
-criterion = "0.5"