Skip to content

Commit

Permalink
Apply CR suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
torymur committed Nov 19, 2024
1 parent 1e8c60c commit c5db1dd
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 19 deletions.
9 changes: 3 additions & 6 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use thiserror::Error;

pub type Result<T, E = crate::Error> = std::result::Result<T, E>;

/// Newtype wrapper around `tokenizers::Error`.
///
/// `Display` is generated by `thiserror`: the `#[error("{0}")]` attribute
/// forwards formatting to the wrapped error in field `0`.
#[derive(Error, Debug)]
#[error("{0}")]
pub struct TokenizersError(pub tokenizers::Error);

impl PartialEq for TokenizersError {
Expand All @@ -9,12 +12,6 @@ impl PartialEq for TokenizersError {
}
}

impl std::fmt::Display for TokenizersError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

#[derive(Error, Debug, PartialEq)]
pub enum Error {
#[error("The vocabulary does not allow us to build a sequence that matches the input")]
Expand Down
4 changes: 1 addition & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@ pub mod primitives;
pub mod regex;
pub mod vocabulary;

use error::Error;

pub type Result<T, E = Error> = std::result::Result<T, E>;
pub use error::{Error, Result};

#[cfg(feature = "python-bindings")]
mod python_bindings;
4 changes: 4 additions & 0 deletions src/vocabulary/locator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use tokenizers::{FromPretrainedParameters, Tokenizer};

use crate::primitives::*;

/// List of common eos token locations appearing on Hugging Face Hub, ordered by priority.
const COMMON_LOCATIONS: &[EosTokenLocation] = &[
// Most projects have `generation_config.json` that looks like:
Expand Down Expand Up @@ -71,6 +72,7 @@ struct Object {
eos_token: Content,
}

/// `eos_token` provided in a `Content`.
#[derive(Debug, Serialize, Deserialize)]
struct Content {
content: String,
Expand All @@ -91,6 +93,7 @@ struct EosTokenLocation {

/// Locates eos token id.
pub(crate) trait Locator {
/// Locates eos token id in defined locations by `Locator`.
fn locate_eos_token_id(
model: &str,
tokenizer: &Tokenizer,
Expand All @@ -102,6 +105,7 @@ pub(crate) trait Locator {
pub(crate) struct HFLocator;

impl Locator for HFLocator {
/// Locates eos token id in defined locations.
fn locate_eos_token_id(
model: &str,
tokenizer: &Tokenizer,
Expand Down
24 changes: 14 additions & 10 deletions src/vocabulary/processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,6 @@ static CHAR_MAP: Lazy<HashMap<char, u8>> = Lazy::new(|| {
char_map
});

/// Token processor to adjust tokens according to the tokenizer's level.
#[derive(Debug)]
pub(crate) struct TokenProcessor {
level: TokenProcessorLevel,
}

/// Recognizes different tokenizer's levels.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum TokenProcessorLevel {
Expand All @@ -99,13 +93,17 @@ pub(crate) struct Mods {
spacechar: char,
}

/// Default string modification to be applied by `TokenProcessor` of `ByteFallback` level.
static DEFAULT_MODS: Mods = Mods { spacechar: ' ' };
impl Default for Mods {
/// Default string modification to be applied by `TokenProcessor` of `ByteFallback` level.
fn default() -> Self {
Self { spacechar: ' ' }
}
}

impl Mods {
/// Apply default modifications.
/// Apply default modifications to each token.
fn apply_default(&self, token: String) -> String {
let to = DEFAULT_MODS.spacechar.to_string();
let to = Self::default().spacechar.to_string();
token.replace(self.spacechar, &to)
}
}
Expand Down Expand Up @@ -142,6 +140,12 @@ enum ReplacePattern {
String(String),
}

/// Token processor to adjust tokens according to the tokenizer's level.
#[derive(Debug)]
pub(crate) struct TokenProcessor {
/// Tokenizer level this processor adjusts tokens for.
level: TokenProcessorLevel,
}

impl TokenProcessor {
/// Create new `TokenProcessor` with the level defined based on tokenizer's decoders.
pub(crate) fn new(tokenizer: &Tokenizer) -> Result<Self> {
Expand Down

0 comments on commit c5db1dd

Please sign in to comment.