From c5db1dd78b9f6254898683f03f34f9b7d8df6270 Mon Sep 17 00:00:00 2001
From: "Victoria Terenina (torymur)" <torymur@gmail.com>
Date: Tue, 19 Nov 2024 11:28:09 +0000
Subject: [PATCH] Apply CR suggestions

---
 src/error.rs                |  9 +++------
 src/lib.rs                  |  4 +---
 src/vocabulary/locator.rs   |  4 ++++
 src/vocabulary/processor.rs | 24 ++++++++++++++----------
 4 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 652fa74..f589731 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -1,6 +1,9 @@
 use thiserror::Error;
 
+pub type Result<T, E = crate::Error> = std::result::Result<T, E>;
+
 #[derive(Error, Debug)]
+#[error("{0}")]
 pub struct TokenizersError(pub tokenizers::Error);
 
 impl PartialEq for TokenizersError {
@@ -9,12 +12,6 @@ impl PartialEq for TokenizersError {
     }
 }
 
-impl std::fmt::Display for TokenizersError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
 #[derive(Error, Debug, PartialEq)]
 pub enum Error {
     #[error("The vocabulary does not allow us to build a sequence that matches the input")]
diff --git a/src/lib.rs b/src/lib.rs
index 4c45de4..6155b71 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,9 +6,7 @@ pub mod primitives;
 pub mod regex;
 pub mod vocabulary;
 
-use error::Error;
-
-pub type Result<T, E = Error> = std::result::Result<T, E>;
+pub use error::{Error, Result};
 
 #[cfg(feature = "python-bindings")]
 mod python_bindings;
diff --git a/src/vocabulary/locator.rs b/src/vocabulary/locator.rs
index 782b621..d3f8bcf 100644
--- a/src/vocabulary/locator.rs
+++ b/src/vocabulary/locator.rs
@@ -4,6 +4,7 @@ use tokenizers::{FromPretrainedParameters, Tokenizer};
 
 use crate::primitives::*;
 
+/// Well-known places to look for a model's eos token.
 /// List of common eos token locations appearing on hugging face hub, ordered by priority.
 const COMMON_LOCATIONS: &[EosTokenLocation] = &[
     // Most projects have `generation_config.json` that looks like:
@@ -71,6 +72,7 @@ struct Object {
     eos_token: Content,
 }
 
+/// `eos_token` provided in a `Content`.
 #[derive(Debug, Serialize, Deserialize)]
 struct Content {
     content: String,
@@ -91,6 +93,7 @@ struct EosTokenLocation {
 
 /// Locates eos token id.
 pub(crate) trait Locator {
+    /// Locates the eos token id in the locations defined by the `Locator`.
     fn locate_eos_token_id(
         model: &str,
         tokenizer: &Tokenizer,
@@ -102,6 +105,7 @@ pub(crate) trait Locator {
 pub(crate) struct HFLocator;
 
 impl Locator for HFLocator {
+    /// Locates the eos token id in the defined locations.
     fn locate_eos_token_id(
         model: &str,
         tokenizer: &Tokenizer,
diff --git a/src/vocabulary/processor.rs b/src/vocabulary/processor.rs
index 55b6cde..7426f24 100644
--- a/src/vocabulary/processor.rs
+++ b/src/vocabulary/processor.rs
@@ -77,12 +77,6 @@ static CHAR_MAP: Lazy<HashMap<char, u8>> = Lazy::new(|| {
     char_map
 });
 
-/// Token processor to adjust tokens according to the tokenizer's level.
-#[derive(Debug)]
-pub(crate) struct TokenProcessor {
-    level: TokenProcessorLevel,
-}
-
 /// Recognizes different tokenizer's levels.
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) enum TokenProcessorLevel {
@@ -99,13 +93,17 @@ pub(crate) struct Mods {
     spacechar: char,
 }
 
-/// Default string modification to be applied by `TokenProcessor` of `ByteFallback` level.
-static DEFAULT_MODS: Mods = Mods { spacechar: ' ' };
+impl Default for Mods {
+    /// Default string modification to be applied by `TokenProcessor` of `ByteFallback` level.
+    fn default() -> Self {
+        Self { spacechar: ' ' }
+    }
+}
 
 impl Mods {
-    /// Apply default modifications.
+    /// Apply default modifications to each token.
     fn apply_default(&self, token: String) -> String {
-        let to = DEFAULT_MODS.spacechar.to_string();
+        let to = Self::default().spacechar.to_string();
         token.replace(self.spacechar, &to)
     }
 }
@@ -142,6 +140,12 @@ enum ReplacePattern {
     String(String),
 }
 
+/// Token processor to adjust tokens according to the tokenizer's level.
+#[derive(Debug)]
+pub(crate) struct TokenProcessor {
+    level: TokenProcessorLevel,
+}
+
 impl TokenProcessor {
     /// Create new `TokenProcessor` with the level defined based on tokenizer's decoders.
     pub(crate) fn new(tokenizer: &Tokenizer) -> Result<Self> {