-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generate serialized data in build script
- Loading branch information
1 parent
1c2506d
commit bae9f01
Showing
10 changed files
with
144 additions
and
110 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[package] | ||
name = "bpe-openai" | ||
version = "0.0.1" | ||
edition = "2021" | ||
description = "Prebuilt fast byte-pair encoders for OpenAI." | ||
repository = "https://github.com/github/rust-gems" | ||
license = "MIT" | ||
keywords = ["tokenizer", "algorithm", "encoding", "bpe"] | ||
categories = ["algorithms", "data-structures", "encoding", "science"] | ||
|
||
[lib] | ||
crate-type = ["lib", "staticlib"] | ||
bench = false | ||
|
||
[dependencies] | ||
bpe = { version = "0.0.1", path = "../bpe" } | ||
rmp-serde = "1" | ||
serde = { version = "1" } | ||
|
||
[build-dependencies] | ||
bpe = { version = "0.0.1", path = "../bpe", features = ["tiktoken-rs"] } | ||
rmp-serde = "1" | ||
tiktoken-rs = { version = "0.5" } | ||
serde = { version = "1" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
use std::env; | ||
use std::fs::File; | ||
use std::path::PathBuf; | ||
|
||
use bpe::byte_pair_encoding::BytePairEncoding; | ||
use serde::Serialize; | ||
use tiktoken_rs::CoreBPE; | ||
|
||
fn main() { | ||
serialize_tokens( | ||
"cl100k", | ||
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"), | ||
100256, | ||
17846336922010275747, | ||
); | ||
serialize_tokens( | ||
"o200k", | ||
&tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"), | ||
199998, | ||
17846336922010275747, | ||
); | ||
println!("cargo::rerun-if-changed=build.rs"); | ||
} | ||
|
||
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) { | ||
let mut path = PathBuf::from(env::var("OUT_DIR").unwrap()); | ||
path.push(format!("bpe_{name}.dict")); | ||
let file = File::create(path).unwrap(); | ||
let mut serializer = rmp_serde::Serializer::new(file); | ||
let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor)); | ||
bpe.serialize(&mut serializer).unwrap(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
use std::sync::LazyLock; | ||
|
||
use bpe::byte_pair_encoding::BytePairEncoding; | ||
|
||
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| { | ||
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict")); | ||
rmp_serde::from_slice(bytes).expect("") | ||
}); | ||
|
||
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| { | ||
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict")); | ||
rmp_serde::from_slice(bytes).expect("") | ||
}); | ||
|
||
pub use bpe::*; | ||
|
||
pub fn cl100k() -> &'static BytePairEncoding { | ||
&BPE_CL100K | ||
} | ||
|
||
pub fn o200k() -> &'static BytePairEncoding { | ||
&BPE_O200K | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::{cl100k, o200k};

    /// Smoke test: forces the `cl100k` encoder to load and runs `count` on
    /// empty input. Passing only means nothing panicked.
    #[test]
    fn can_load_cl100k() {
        let encoder = cl100k();
        encoder.count("".as_bytes());
    }

    /// Smoke test: forces the `o200k` encoder to load and runs `count` on
    /// empty input. Passing only means nothing panicked.
    #[test]
    fn can_load_o200k() {
        let encoder = o200k();
        encoder.count("".as_bytes());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters