Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial release #22

Merged
merged 11 commits into from
Oct 7, 2024
24 changes: 24 additions & 0 deletions crates/bpe-openai/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Crate manifest for bpe-openai: prebuilt fast byte-pair encoders for OpenAI token sets.
[package]
name = "bpe-openai"
version = "0.1.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
# Built both as a Rust library and as a static library for non-Rust consumers.
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

# Build-time only: build.rs converts tiktoken token sets into serialized BPE
# dictionaries that the library embeds with include_bytes!.
[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
42 changes: 42 additions & 0 deletions crates/bpe-openai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# OpenAI Byte Pair Encoders

Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there should be a warning that this crate is NOT replicating the regex "word splitting" used by OpenAI.
Therefore, results will differ!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the warning and also a test that shows an example of the issue.

Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
For convenience, it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.

Supported token sets:

- r50k
- p50k
- cl100k
- o200k

## Usage

Add a dependency by running

```sh
cargo add bpe-openai
```

or by adding the following to `Cargo.toml`

```toml
[dependencies]
bpe-openai = "0.1"
```

Counting tokens is as simple as:

```rust
use bpe_openai::cl100k;

fn main() {
let bpe = cl100k();
let count = bpe.count("Hello, world!".as_bytes());
println!("{count}");
}
```

For more detailed documentation we refer to [bpe](https://crates.io/crates/bpe).
51 changes: 51 additions & 0 deletions crates/bpe-openai/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use std::env;
use std::fs::File;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;
use tiktoken_rs::CoreBPE;

/// Build script: serializes each supported OpenAI token set into `OUT_DIR` so
/// the library crate can embed the dictionaries with `include_bytes!`.
///
/// The numeric arguments are the number of "real" tokens in each set (special
/// tokens at the end are excluded) and a precomputed hash factor for the
/// perfect hashing used by `BytePairEncoding`.
fn main() {
    serialize_tokens(
        "r50k",
        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
        50256,
        1,
    );
    serialize_tokens(
        "p50k",
        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
        50280,
        1,
    );
    // Note: the original script serialized cl100k twice (copy-paste duplicate);
    // each token set only needs to be serialized once.
    serialize_tokens(
        "cl100k",
        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
        100256,
        17846336922010275747,
    );
    serialize_tokens(
        "o200k",
        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
        199998,
        17846336922010275747,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

/// Converts a tiktoken `CoreBPE` into a `BytePairEncoding` with `num_tokens`
/// tokens and the given `hash_factor`, then writes it MessagePack-serialized
/// to `OUT_DIR/bpe_{name}.dict`.
///
/// Panics on any failure: this runs at build time, where aborting the build
/// with a clear message is the desired behavior.
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set during build");
    let dict_path = PathBuf::from(out_dir).join(format!("bpe_{name}.dict"));
    let dict_file = File::create(dict_path).expect("can create output file");
    let encoding = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
    let mut serializer = rmp_serde::Serializer::new(dict_file);
    encoding
        .serialize(&mut serializer)
        .expect("serialization succeeds");
}
66 changes: 66 additions & 0 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

/// r50k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded r50k dictionary is valid")
});

/// p50k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded p50k dictionary is valid")
});

/// cl100k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded cl100k dictionary is valid")
});

/// o200k BPE dictionary, embedded at compile time and deserialized lazily on
/// first use. The data is produced by build.rs, so deserialization failure
/// indicates a corrupted build rather than a user error.
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
    rmp_serde::from_slice(bytes).expect("embedded o200k dictionary is valid")
});

pub use bpe::*;

/// Returns the tokenizer for OpenAI's r50k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn r50k() -> &'static BytePairEncoding {
    &BPE_R50K
}

/// Returns the tokenizer for OpenAI's p50k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn p50k() -> &'static BytePairEncoding {
    &BPE_P50K
}

/// Returns the tokenizer for OpenAI's cl100k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn cl100k() -> &'static BytePairEncoding {
    &BPE_CL100K
}

/// Returns the tokenizer for OpenAI's o200k token set.
///
/// The underlying dictionary is deserialized lazily on the first call and
/// shared by all subsequent calls.
pub fn o200k() -> &'static BytePairEncoding {
    &BPE_O200K
}

#[cfg(test)]
mod tests {
    use super::*;

    // Each test forces lazy deserialization of one embedded dictionary by
    // counting the tokens of an empty input; a panic here means the dictionary
    // produced by build.rs is malformed.

    #[test]
    fn can_load_r50k() {
        let _ = r50k().count(b"");
    }

    #[test]
    fn can_load_p50k() {
        let _ = p50k().count(b"");
    }

    #[test]
    fn can_load_cl100k() {
        let _ = cl100k().count(b"");
    }

    #[test]
    fn can_load_o200k() {
        let _ = o200k().count(b"");
    }
}
6 changes: 6 additions & 0 deletions crates/bpe/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
name = "bpe"
version = "0.1.0"
edition = "2021"
description = "Fast byte-pair encoding implementation."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
crate-type = ["lib", "staticlib"]
Expand All @@ -11,6 +16,7 @@ bench = false
name = "performance"
path = "benches/performance.rs"
harness = false
test = false

[features]
rand = ["dep:rand"]
Expand Down
33 changes: 20 additions & 13 deletions crates/bpe/benches/performance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,28 @@ use criterion::{
use rand::{thread_rng, Rng};
use tiktoken_rs::CoreBPE;

static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
LazyLock::new(|| {
[
(
"cl100k",
BytePairEncoding::cl100k(),
tiktoken_rs::cl100k_base().unwrap(),
static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
[
(
"cl100k",
BytePairEncoding::from_tiktoken(
&tiktoken_rs::cl100k_base_singleton().lock(),
100256,
Some(17846336922010275747),
),
(
"o200k",
BytePairEncoding::o200k(),
tiktoken_rs::o200k_base().unwrap(),
tiktoken_rs::cl100k_base().unwrap(),
),
(
"o200k",
BytePairEncoding::from_tiktoken(
&tiktoken_rs::o200k_base_singleton().lock(),
199998,
Some(17846336922010275747),
),
]
});
tiktoken_rs::o200k_base().unwrap(),
),
]
});

fn counting_benchmark(c: &mut Criterion) {
for (name, bpe, _) in TOKENIZERS.iter() {
Expand Down
4 changes: 2 additions & 2 deletions crates/bpe/src/appendable_encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ impl<'a> AppendableEncoder<'a> {

#[cfg(test)]
mod tests {
use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};

use super::AppendableEncoder;

#[test]
fn test_appendable_encoder() {
let bpe = BytePairEncoding::cl100k();
let bpe = &BPE_CL100K;
let mut enc = AppendableEncoder::new(bpe);
let input_string = create_test_bytes(bpe, 100);
for (i, c) in input_string.iter().enumerate() {
Expand Down
Loading