Commit: lint

hendrikvanantwerpen committed Oct 9, 2024
1 parent 7f627d5 commit 599e11d
Showing 2 changed files with 14 additions and 13 deletions.

crates/bpe-openai/src/lib.rs (10 additions, 9 deletions)

@@ -6,28 +6,28 @@ use fancy_regex::Regex;
 static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

 static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

 static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-    Tokenizer::new(bpe, Some(pat)).unwrap()
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });

 static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    let bpe = rmp_serde::from_slice(bytes).expect("");
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = [
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
@@ -37,7 +37,7 @@ static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\s+(?!\\S)",
         "\\s+",
     ].join("|");
-    Tokenizer::new(bpe, Some(&pat)).unwrap()
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });

 pub use bpe::*;
@@ -50,8 +50,9 @@ pub struct Tokenizer {
 }

 impl Tokenizer {
+    #[allow(clippy::result_large_err)]
     pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
-        let pat = pat.map(|pat| fancy_regex::Regex::new(pat)).transpose()?;
+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
         Ok(Self { bpe, pat })
     }
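
Two small idioms are at work in this file. Each `unwrap()` becomes an `expect("...")` so a panic names the violated assumption, and `pat.map(fancy_regex::Regex::new).transpose()?` turns the `Option<Result<Regex, Error>>` produced by `map` into a `Result<Option<Regex>, Error>` that `?` can propagate. Below is a minimal, self-contained sketch of the `transpose` idiom; it is not part of the commit, and `str::parse::<u32>` stands in for `fancy_regex::Regex::new` so the example runs without external crates:

```rust
// Minimal sketch of the Option::map(..).transpose() idiom (illustration only,
// not from this repository). str::parse::<u32> stands in for Regex::new:
// both take a &str and return a Result.

fn parse_optional(input: Option<&str>) -> Result<Option<u32>, std::num::ParseIntError> {
    // `map` yields Option<Result<u32, ParseIntError>>; `transpose` flips it
    // into Result<Option<u32>, ParseIntError>, so `?` can bail out on the
    // error while a `None` input simply stays `None`.
    let value = input.map(str::parse::<u32>).transpose()?;
    Ok(value)
}

fn main() {
    assert_eq!(parse_optional(None), Ok(None)); // no input: no error possible
    assert_eq!(parse_optional(Some("42")), Ok(Some(42))); // valid input
    assert!(parse_optional(Some("nope")).is_err()); // invalid input propagates
    println!("transpose idiom works as expected");
}
```

The added `#[allow(clippy::result_large_err)]` silences Clippy's warning that the error variant of the returned `Result` (here `fancy_regex::Error`) is large enough that callers pay for it on every return; boxing the error would be the usual alternative to suppressing the lint.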

crates/bpe/benchmarks/lib.rs (4 additions, 4 deletions)

@@ -19,14 +19,14 @@ pub static TOKENIZERS: LazyLock<
         (
             "cl100k",
             bpe_openai::cl100k(),
-            tiktoken_rs::cl100k_base().unwrap(),
-            HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap(),
+            tiktoken_rs::cl100k_base().expect("tokenizer available"),
+            HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"),
         ),
         (
             "o200k",
             bpe_openai::o200k(),
-            tiktoken_rs::o200k_base().unwrap(),
-            HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap(),
+            tiktoken_rs::o200k_base().expect("tokenizer available"),
+            HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"),
         ),
     ]
 });
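
The benchmark fixture gets the same treatment: each `unwrap()` becomes an `expect(...)` naming the assumption being made. A toy, self-contained sketch (not from the repository) of what that buys in a panic message:

```rust
// Toy demonstration (not from this repository) of expect vs. unwrap panic
// messages. `expect` prefixes the error's Debug output with the stated
// assumption, so a failing setup step says *what* was assumed, not just
// which error value occurred.

use std::panic;

fn main() {
    let result = panic::catch_unwind(|| {
        // .unwrap() would panic with only the error value:
        //   ... panicked ...: ParseIntError { kind: InvalidDigit }
        // .expect(..) leads with the assumption:
        //   ... panicked ...: valid numeric input: ParseIntError { kind: InvalidDigit }
        let _n: u32 = "not-a-number".parse().expect("valid numeric input");
    });
    assert!(result.is_err(), "the parse above is expected to fail");
}
```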
