Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial release #22

Merged
merged 11 commits into from
Oct 7, 2024
24 changes: 24 additions & 0 deletions crates/bpe-openai/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Crate manifest for bpe-openai: prebuilt fast byte-pair encoders for OpenAI token sets.
[package]
name = "bpe-openai"
version = "0.1.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
# Built both as a Rust library and as a static library for non-Rust consumers.
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

# Build-time only: build.rs converts tiktoken token sets into serialized BPE
# dictionaries that the library embeds with include_bytes!.
[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
42 changes: 42 additions & 0 deletions crates/bpe-openai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# OpenAI Byte Pair Encoders

Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there should be a warning that this crate is NOT replicating the regex "word splitting" used by OpenAI.
Therefore, results will differ!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the warning and also a test that shows an example of the issue.

Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
For convenience, it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.

Supported token sets:

- r50k
- p50k
- cl100k
- o200k

## Usage

Add a dependency by running

```sh
cargo add bpe-openai
```

or by adding the following to `Cargo.toml`

```toml
[dependencies]
bpe-openai = "0.1"
```

Counting tokens is as simple as:

```rust
use bpe_openai::cl100k;

fn main() {
let bpe = cl100k();
let count = bpe.count("Hello, world!".as_bytes());
println!("{count}");
}
```

For more detailed documentation we refer to [bpe](https://crates.io/crates/bpe).
51 changes: 51 additions & 0 deletions crates/bpe-openai/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use std::env;
use std::fs::File;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;
use tiktoken_rs::CoreBPE;

/// Build script: serializes each supported OpenAI token set into `OUT_DIR` so
/// the library crate can embed the dictionaries with `include_bytes!`.
///
/// The numeric arguments are the number of "real" tokens in each set (special
/// tokens at the end are excluded) and a precomputed hash factor for the
/// perfect hashing used by `BytePairEncoding`.
fn main() {
    serialize_tokens(
        "r50k",
        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
        50256,
        1,
    );
    serialize_tokens(
        "p50k",
        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
        50280,
        1,
    );
    // Note: the original script serialized cl100k twice (copy-paste duplicate);
    // each token set only needs to be serialized once.
    serialize_tokens(
        "cl100k",
        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
        100256,
        17846336922010275747,
    );
    serialize_tokens(
        "o200k",
        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
        199998,
        17846336922010275747,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

/// Converts a tiktoken `CoreBPE` into a `BytePairEncoding` with `num_tokens`
/// tokens and the given `hash_factor`, then writes it MessagePack-serialized
/// to `OUT_DIR/bpe_{name}.dict`.
///
/// Panics on any failure: this runs at build time, where aborting the build
/// with a clear message is the desired behavior.
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set during build");
    let dict_path = PathBuf::from(out_dir).join(format!("bpe_{name}.dict"));
    let dict_file = File::create(dict_path).expect("can create output file");
    let encoding = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
    let mut serializer = rmp_serde::Serializer::new(dict_file);
    encoding
        .serialize(&mut serializer)
        .expect("serialization succeeds");
}
66 changes: 66 additions & 0 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

/// r50k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded r50k dictionary is valid")
});

/// p50k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded p50k dictionary is valid")
});

/// cl100k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded cl100k dictionary is valid")
});

/// o200k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded o200k dictionary is valid")
});

pub use bpe::*;

/// Returns the tokenizer for OpenAI's r50k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn r50k() -> &'static BytePairEncoding {
    &BPE_R50K
}

/// Returns the tokenizer for OpenAI's p50k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn p50k() -> &'static BytePairEncoding {
    &BPE_P50K
}

/// Returns the tokenizer for OpenAI's cl100k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn cl100k() -> &'static BytePairEncoding {
    &BPE_CL100K
}

/// Returns the tokenizer for OpenAI's o200k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn o200k() -> &'static BytePairEncoding {
    &BPE_O200K
}

#[cfg(test)]
mod tests {
    use super::*;

    // Each test forces lazy deserialization of one embedded dictionary by
    // counting the tokens of an empty input; a panic here means the dictionary
    // produced by build.rs is malformed.

    #[test]
    fn can_load_r50k() {
        let _ = r50k().count(b"");
    }

    #[test]
    fn can_load_p50k() {
        let _ = p50k().count(b"");
    }

    #[test]
    fn can_load_cl100k() {
        let _ = cl100k().count(b"");
    }

    #[test]
    fn can_load_o200k() {
        let _ = o200k().count(b"");
    }
}
6 changes: 6 additions & 0 deletions crates/bpe/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
name = "bpe"
version = "0.1.0"
edition = "2021"
description = "Fast byte-pair encoding implementation."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
crate-type = ["lib", "staticlib"]
Expand All @@ -11,6 +16,7 @@ bench = false
name = "performance"
path = "benches/performance.rs"
harness = false
test = false

[features]
rand = ["dep:rand"]
Expand Down
33 changes: 20 additions & 13 deletions crates/bpe/benches/performance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,28 @@ use criterion::{
use rand::{thread_rng, Rng};
use tiktoken_rs::CoreBPE;

static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
LazyLock::new(|| {
[
(
"cl100k",
BytePairEncoding::cl100k(),
tiktoken_rs::cl100k_base().unwrap(),
static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
[
(
"cl100k",
BytePairEncoding::from_tiktoken(
&tiktoken_rs::cl100k_base_singleton().lock(),
100256,
Some(17846336922010275747),
),
(
"o200k",
BytePairEncoding::o200k(),
tiktoken_rs::o200k_base().unwrap(),
tiktoken_rs::cl100k_base().unwrap(),
),
(
"o200k",
BytePairEncoding::from_tiktoken(
&tiktoken_rs::o200k_base_singleton().lock(),
199998,
Some(17846336922010275747),
),
]
});
tiktoken_rs::o200k_base().unwrap(),
),
]
});

fn counting_benchmark(c: &mut Criterion) {
for (name, bpe, _) in TOKENIZERS.iter() {
Expand Down
4 changes: 2 additions & 2 deletions crates/bpe/src/appendable_encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ impl<'a> AppendableEncoder<'a> {

#[cfg(test)]
mod tests {
use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};

use super::AppendableEncoder;

#[test]
fn test_appendable_encoder() {
let bpe = BytePairEncoding::cl100k();
let bpe = &BPE_CL100K;
let mut enc = AppendableEncoder::new(bpe);
let input_string = create_test_bytes(bpe, 100);
for (i, c) in input_string.iter().enumerate() {
Expand Down
Loading