From fa36be99a49387a3bbf8de8844744942958a8d76 Mon Sep 17 00:00:00 2001
From: "Victoria Terenina (torymur)"
Date: Wed, 18 Dec 2024 15:48:36 +0000
Subject: [PATCH] Use bytes as Token type, more tests for Index
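Token content is not guaranteed to be valid UTF-8, and the String-based
interface forced a lossy `String::from_utf8_lossy` conversion that the
removed vocabulary code below marked as temporary. Switch the `Token`
alias to raw bytes (`Vec<u8>`) and let `Index` walk token bytes directly.
Also add `Index` tests covering allowed tokens at the initial state and
multibyte (emoji) tokens.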
---
 src/index.rs          |  55 ++++-
 src/primitives.rs     |   2 +-
 src/vocabulary/mod.rs | 479 ------------------------------------------
 3 files changed, 51 insertions(+), 485 deletions(-)

diff --git a/src/index.rs b/src/index.rs
index a915766c..3df6e742 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -138,7 +138,7 @@ impl Index {
         }
 
         let mut next_state = current_state;
-        for transition_byte in token.as_bytes() {
+        for transition_byte in token {
             next_state = dfa.next_state(next_state, *transition_byte);
             if dfa.is_dead_state(next_state) || dfa.is_quit_state(next_state) {
                 continue 'token_loop;
@@ -230,19 +230,64 @@ mod tests {
             .insert("blah", 0)
             .insert("1a", 1)
             .insert("2", 2)
-            .insert("0", 3)
-            .insert("", 4);
+            .insert("0", 3);
 
         let index = Index::from_regex(regex, &vocabulary).expect("Index failed");
         assert_eq!(index.initial(), 40);
         assert_eq!(index.final_states(), &HashSet::from_iter([24, 48, 56]));
 
-        let expected: HashMap<u32, HashMap<u32, u32>> = HashMap::from_iter([
+        let expected = HashMap::from_iter([
             (24, HashMap::from_iter([(3, 24), (4, 24), (2, 24)])),
             (48, HashMap::from_iter([(4, 48)])),
             (40, HashMap::from_iter([(3, 48), (2, 56)])),
             (56, HashMap::from_iter([(3, 24), (4, 56), (2, 24)])),
         ]);
-        assert_eq!(&expected, index.transitions());
+        assert_eq!(index.transitions(), &expected);
+    }
+
+    #[test]
+    fn index_from_regex_initial_in_allowed() {
+        let regex = "`\\n(\\.\\n)?`\\n";
+        let vocabulary = Vocabulary::new(Some(104))
+            .insert("\n", 103)
+            .insert(".", 102)
+            .insert("`", 101);
+
+        let index = Index::from_regex(regex, &vocabulary).expect("Index failed");
+        let allowed = index
+            .allowed_tokens(index.initial())
+            .expect("No allowed tokens");
+        assert!(allowed.contains(&101));
+    }
+
+    #[test]
+    fn index_from_regex_multibyte() {
+        let regex = "😇| [😈-😍][😇-😎]*";
+        let vocabulary = Vocabulary::new(Some(8))
+            .insert(" 😍", 5)
+            .insert("blah", 0)
+            .insert("😇", 2)
+            .insert("😈a", 1)
+            .insert("😍", 3)
+            .insert(vec![32, 240, 159, 152], 7)
+            .insert(vec![32, 240, 159, 152, 141], 6)
+            .insert(vec![240, 159, 152, 141], 4);
+
+        let index = Index::from_regex(regex, &vocabulary).expect("Index failed");
+
+        assert_eq!(index.final_states(), &HashSet::from_iter([208, 128]));
+
+        let expected = HashMap::from_iter([
+            (
+                208,
+                HashMap::from_iter([(3, 208), (8, 208), (4, 208), (2, 208)]),
+            ),
+            (
+                80,
+                HashMap::from_iter([(2, 128), (7, 192), (5, 208), (6, 208)]),
+            ),
+            (128, HashMap::from_iter([(8, 128)])),
+        ]);
+        assert_eq!(index.transitions(), &expected);
+    }
 }
diff --git a/src/primitives.rs b/src/primitives.rs
index e12bf036..0976f76d 100644
--- a/src/primitives.rs
+++ b/src/primitives.rs
@@ -2,7 +2,7 @@
 pub type TransitionKey = u32;
 
 /// Token content.
-pub type Token = String;
+pub type Token = Vec<u8>;
 
 /// Token identifier.
 pub type TokenId = u32;
diff --git a/src/vocabulary/mod.rs b/src/vocabulary/mod.rs
index 13156ade..8b137891 100644
--- a/src/vocabulary/mod.rs
+++ b/src/vocabulary/mod.rs
@@ -1,480 +1 @@
-use rustc_hash::FxHashMap as HashMap;
-use tokenizers::normalizers::Sequence;
-use tokenizers::{FromPretrainedParameters, NormalizerWrapper, Tokenizer};
-
-use crate::prelude::*;
-use crate::{Error, Result};
-
-use locator::{HFLocator, Locator};
-use processor::TokenProcessor;
-
-mod locator;
-mod processor;
-
-/// Vocabulary of an LLM.
-///
-/// ## Examples
-///
-/// ```rust
-/// # use outlines_core::prelude::*;
-/// #
-/// let vocabulary = Vocabulary::new(None)
-///     .insert("blah", 0)
-///     .insert("1a", 1)
-///     .insert("2", 2)
-///     .insert("0", 3);
-/// ```
-#[derive(Clone, Debug, Default)]
-pub struct Vocabulary {
-    // TODO: Option is temp for back compatibility
-    eos_token_id: Option<TokenId>,
-    tokens: HashMap<Token, Vec<TokenId>>,
-}
-
-impl Vocabulary {
-    /// Creates an empty vocabulary.
-    pub fn new(eos_token_id: Option<TokenId>) -> Self {
-        Self {
-            eos_token_id,
-            tokens: HashMap::default(),
-        }
-    }
-
-    pub fn with_eos_token_id(self, eos_token_id: Option<TokenId>) -> Self {
-        Self {
-            eos_token_id,
-            ..self
-        }
-    }
-
-    /// Creates the vocabulary of a pre-trained model from the Hugging Face Hub.
-    pub fn from_pretrained(
-        model: &str,
-        parameters: Option<FromPretrainedParameters>,
-    ) -> Result<Self> {
-        Self::from_pretrained_with_locator::<HFLocator>(model, parameters)
-    }
-
-    #[doc(hidden)]
-    #[inline(always)]
-    fn from_pretrained_with_locator<L: Locator>(
-        model: &str,
-        parameters: Option<FromPretrainedParameters>,
-    ) -> Result<Self> {
-        let mut tokenizer = Tokenizer::from_pretrained(model, parameters.clone())?;
-        Self::filter_prepend_normalizers(&mut tokenizer);
-
-        // Locate eos_token_id in defined locations.
-        let eos_token_id = L::locate_eos_token_id(model, &tokenizer, &parameters);
-        let Some(eos_token_id) = eos_token_id else {
-            return Err(Error::UnsupportedTokenizer {
-                model: model.to_string(),
-                reason: "EOS token id".to_string(),
-            });
-        };
-
-        // Start building the vocabulary from eos_token_id and added tokens.
-        let mut vocabulary = Vocabulary::new(Some(eos_token_id));
-        for (id, added_token) in tokenizer.get_added_tokens_decoder().iter() {
-            if !added_token.special {
-                vocabulary = vocabulary.insert(added_token.content.clone(), *id);
-            }
-        }
-
-        // Process each vocabulary token according to the tokenizer's level.
-        let Ok(processor) = TokenProcessor::new(&tokenizer) else {
-            return Err(Error::UnsupportedTokenizer {
-                model: model.to_string(),
-                reason: "Token processor".to_string(),
-            });
-        };
-        for (token, token_id) in tokenizer.get_vocab(false) {
-            let token_bytes = processor.process(token)?;
-            // TODO: lossy is temp:
-            // - in Python it was handled by the byte_symbol function
-            // - the interface needs to be redefined to treat the Token type as bytes: Vec<u8>
-            let processed_token = String::from_utf8_lossy(&token_bytes);
-            vocabulary = vocabulary.insert(processed_token, token_id);
-        }
-
-        Ok(vocabulary)
-    }
-
-    /// Returns all tokens with their token ids in the vocabulary.
-    pub fn tokens_to_ids(&self) -> &HashMap<Token, Vec<TokenId>> {
-        &self.tokens
-    }
-
-    /// Returns the vector of `TokenId`s for the given token, if it is present in the vocabulary.
-    pub fn token_to_ids(&self, token: &str) -> Option<&Vec<TokenId>> {
-        self.tokens.get(token)
-    }
-
-    /// Gets the identifier of the special end-of-sentence token.
-    pub fn eos_token_id(&self) -> Option<TokenId> {
-        self.eos_token_id
-    }
-
-    /// Filters out the `Prepend` kind of tokenizer's normalizers.
-    fn filter_prepend_normalizers(tokenizer: &mut Tokenizer) {
-        // The main concern is prepend normalizers, for example https://github.com/google/sentencepiece
-        // In the `sentencepiece` tokenizer, `▁` is used to denote spaces in the source text,
-        // e.g. `Hello World.` could be tokenized as: [Hello] [▁Wor] [ld] [.]
-        //
-        // We don't want to deal with the special characters, so we remove `Prepend` normalizers.
-        if let Some(normalizer) = tokenizer.get_normalizer() {
-            match normalizer {
-                NormalizerWrapper::Sequence(normalization_sequence) => {
-                    let new_sequence = Sequence::new(
-                        normalization_sequence
-                            .get_normalizers()
-                            .iter()
-                            .filter_map(|normalizer| match normalizer {
-                                NormalizerWrapper::Prepend(_) => None,
-                                _ => Some(normalizer.clone()),
-                            })
-                            .collect(),
-                    );
-                    tokenizer.with_normalizer(new_sequence.into());
-                }
-                NormalizerWrapper::Prepend(_) => {
-                    tokenizer.with_normalizer(None::<NormalizerWrapper>);
-                }
-                _ => {}
-            }
-        }
-    }
-}
-
-impl Vocabulary {
-    /// Inserts a token to the vocabulary with the specified identifier.
-    pub fn insert(mut self, token: impl Into<Token>, id: TokenId) -> Vocabulary {
-        self.insert_in_place(token, id);
-        self
-    }
-
-    /// Extends the vocabulary with tokens and their identifiers.
-    pub fn extend<T: Into<Token>, I: IntoIterator<Item = TokenId>>(
-        mut self,
-        tokens_and_ids: impl IntoIterator<Item = (T, I)>,
-    ) -> Vocabulary {
-        self.extend_in_place(tokens_and_ids);
-        self
-    }
-}
-
-impl Vocabulary {
-    /// Inserts a token to the vocabulary with the specified identifier, in place.
-    pub fn insert_in_place(&mut self, token: impl Into<Token>, id: TokenId) {
-        // TODO: return error if eos token id is inserted
-        let token = token.into();
-        self.tokens.entry(token).or_default().push(id);
-    }
-
-    /// Extends the vocabulary with tokens and their identifiers, in place.
-    pub fn extend_in_place<T: Into<Token>, I: IntoIterator<Item = TokenId>>(
-        &mut self,
-        tokens_and_ids: impl IntoIterator<Item = (T, I)>,
-    ) {
-        for (token, ids) in tokens_and_ids.into_iter() {
-            let token = token.into();
-            self.tokens.entry(token).or_default().extend(ids);
-        }
-    }
-}
-
-impl std::ops::Deref for Vocabulary {
-    type Target = HashMap<Token, Vec<TokenId>>;
-
-    fn deref(&self) -> &HashMap<Token, Vec<TokenId>> {
-        &self.tokens
-    }
-}
-
-impl std::fmt::Display for Vocabulary {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        for (index, (token, token_ids)) in self.iter().enumerate() {
-            if index != (self.len() - 1) {
-                writeln!(f, "{:?} -> {:?}", token, token_ids)?;
-            } else {
-                write!(f, "{:?} -> {:?}", token, token_ids)?;
-            }
-        }
-        Ok(())
-    }
-}
-
-impl From<HashMap<Token, Vec<TokenId>>> for Vocabulary {
-    fn from(tokens: HashMap<Token, Vec<TokenId>>) -> Vocabulary {
-        Vocabulary {
-            eos_token_id: None,
-            tokens,
-        }
-    }
-}
-
-impl<T, I> FromIterator<(T, I)> for Vocabulary
-where
-    T: Into<Token>,
-    I: IntoIterator<Item = TokenId>,
-{
-    fn from_iter<A: IntoIterator<Item = (T, I)>>(tokens_and_ids: A) -> Self {
-        Vocabulary::new(None).extend(tokens_and_ids)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn insert() {
-        let vocabulary = Vocabulary::new(None)
-            .insert("blah", 0)
-            .insert("1a", 1)
-            .insert("2", 2)
-            .insert("0", 3);
-
-        assert_eq!(vocabulary.len(), 4);
-        assert_eq!(vocabulary["blah"], &[0]);
-        assert_eq!(vocabulary["1a"], &[1]);
-        assert_eq!(vocabulary["2"], &[2]);
-        assert_eq!(vocabulary["0"], &[3]);
-    }
-
-    #[test]
-    fn extend() {
-        let vocabulary = Vocabulary::new(None).extend([
-            ("blah", vec![0]),
-            ("1a", vec![1]),
-            ("2", vec![2]),
-            ("0", vec![3]),
-        ]);
-
-        assert_eq!(vocabulary.len(), 4);
-        assert_eq!(vocabulary["blah"], &[0]);
-        assert_eq!(vocabulary["1a"], &[1]);
-        assert_eq!(vocabulary["2"], &[2]);
-        assert_eq!(vocabulary["0"], &[3]);
-    }
-
-    #[test]
-    fn new_empty_vocabulary() {
-        let vocabulary = Vocabulary::new(None);
-        assert!(vocabulary.eos_token_id.is_none());
-        assert!(vocabulary.tokens.is_empty());
-    }
-
-    #[test]
-    fn new_empty_vocabulary_from_hashmap() {
-        let map = HashMap::default();
-        let vocabulary = Vocabulary::from(map);
-        assert!(vocabulary.eos_token_id.is_none());
-        assert!(vocabulary.tokens.is_empty());
-    }
-
-    #[test]
-    fn new_vocabulary_from_iterator() {
-        let token: Token = "abc".to_string();
-        let id: Vec<TokenId> = vec![1];
-        let it = vec![(token, id)];
-        let vocabulary = Vocabulary::from_iter(it);
-        assert!(vocabulary.eos_token_id.is_none());
-        assert!(!vocabulary.tokens.is_empty());
-    }
-
-    #[test]
-    fn supported_pretrained_models() {
-        // Support is expected for these:
-        for model in [
-            // GPT 2
-            "openai-community/gpt2",
-            // Llama 2
-            "hf-internal-testing/Llama-2-7B-GPTQ",
-            // Llama 3
-            // OpenCoder: shares llama tokenizers
-            "hf-internal-testing/llama-3-8b-internal",
-            // Qwen
-            "Qwen/Qwen2-7B-Instruct",
-            // Salamandra
-            "BSC-LT/salamandra-2b",
-        ] {
-            let vocabulary = Vocabulary::from_pretrained(model, None);
-            match vocabulary {
-                Ok(v) => {
-                    assert!(v.eos_token_id().is_some());
-                    assert_eq!(v.eos_token_id, v.eos_token_id());
-                    assert!(!v.tokens.is_empty());
-                }
-                Err(_) => unreachable!(),
-            }
-        }
-    }
-
-    #[test]
-    fn pretrained_from_gpt2() {
-        let model = "openai-community/gpt2";
-        let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
-        let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
-
-        let v_eos = vocabulary.eos_token_id;
-        assert_eq!(v_eos, vocabulary.eos_token_id());
-        assert!(v_eos.is_some());
-
-        let v_eos = v_eos.unwrap();
-        assert_eq!(v_eos, 50256);
-        assert_eq!(
-            tokenizer.id_to_token(v_eos).expect("Token not found"),
-            "<|endoftext|>"
-        );
-
-        let token = "Ġal";
-        assert!(vocabulary.token_to_ids(token).is_none());
-        assert!(tokenizer.token_to_id(token).is_some());
-
-        for (v_token, t_token_expected) in [("abc", "abc"), (" O", "ĠO")] {
-            let v_ids = vocabulary.token_to_ids(v_token);
-            assert!(v_ids.is_some());
-            for v_id in v_ids.unwrap() {
-                let t_token = tokenizer
-                    .id_to_token(*v_id)
-                    .expect("Token id not found in tokenizer");
-                assert_eq!(&t_token, t_token_expected);
-            }
-        }
-    }
-
-    #[test]
-    fn pretrained_from_llama() {
-        let model = "hf-internal-testing/llama-tokenizer";
-        let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
-        let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
-
-        let v_eos = vocabulary.eos_token_id;
-        assert_eq!(v_eos, vocabulary.eos_token_id());
-        assert!(v_eos.is_some());
-
-        let v_eos = v_eos.unwrap();
-        assert_eq!(v_eos, 2);
-        assert_eq!(
-            tokenizer.id_to_token(v_eos).expect("Token not found"),
-            "</s>"
-        );
-
-        for (v_token, t_token_expected) in [
-            ("abc", "abc"),
-            (" al", "▁al"),
-            (" O", "▁O"),
-            ("   ", "▁▁▁"),
-            // TODO: won't pass until the Token type is changed to bytes
-            // ("<0xFF>", "ÿ"),
-            // ("<0x20>", "▁"),
-        ] {
-            let v_ids = vocabulary.token_to_ids(v_token);
-            assert!(v_ids.is_some());
-            for v_id in v_ids.unwrap() {
-                let t_token = tokenizer
-                    .id_to_token(*v_id)
-                    .expect("Token id not found in tokenizer");
-                assert_eq!(&t_token, t_token_expected);
-            }
-        }
-    }
-
-    #[test]
-    fn token_processor_error() {
-        let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
-        let vocabulary = Vocabulary::from_pretrained(model, None);
-
-        match vocabulary {
-            Err(Error::UnsupportedTokenizer { model: m, reason }) => {
-                assert_eq!(m, model.to_string());
-                assert_eq!(&reason, "Token processor");
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    #[test]
-    fn tokenizer_error() {
-        let model = "hf-internal-testing/some-non-existent-model";
-        let vocabulary = Vocabulary::from_pretrained(model, None);
-
-        match vocabulary {
-            Err(Error::TokenizersError(e)) => assert!(!e.to_string().is_empty()),
-            _ => unreachable!(),
-        }
-    }
-
-    struct NoneLocator;
-
-    impl Locator for NoneLocator {
-        fn locate_eos_token_id(
-            _model: &str,
-            _tokenizer: &Tokenizer,
-            _parameters: &Option<FromPretrainedParameters>,
-        ) -> Option<TokenId> {
-            None
-        }
-    }
-
-    #[test]
-    fn unable_to_locate_eos_token_id_error() {
-        let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
-        let vocabulary = Vocabulary::from_pretrained_with_locator::<NoneLocator>(model, None);
-
-        match vocabulary {
-            Err(Error::UnsupportedTokenizer { model: m, reason }) => {
-                assert_eq!(m, model.to_string());
-                assert_eq!(&reason, "EOS token id");
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    #[test]
-    fn prepend_normalizers_filtered_out() {
-        use tokenizers::normalizers::{Prepend, Sequence};
-
-        let prepend = Prepend::new("_".to_string());
-        let prepend_normalizer = NormalizerWrapper::Prepend(prepend);
-        let sequence = Sequence::new(vec![prepend_normalizer.clone()]);
-        let sequence_normalizer = NormalizerWrapper::Sequence(sequence);
-
-        let model = "hf-internal-testing/llama-tokenizer";
-        let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
-
-        for normalizer in [prepend_normalizer, sequence_normalizer] {
-            let mut normalized_t = tokenizer.clone();
-            normalized_t.with_normalizer(Some(normalizer));
-            Vocabulary::filter_prepend_normalizers(&mut normalized_t);
-            if let Some(n) = normalized_t.get_normalizer() {
-                match n {
-                    NormalizerWrapper::Sequence(seq) => {
-                        for n in seq.get_normalizers() {
-                            if let NormalizerWrapper::Prepend(_) = n {
-                                unreachable!()
-                            }
-                        }
-                    }
-                    NormalizerWrapper::Prepend(_) => unreachable!(),
-                    _ => {}
-                }
-            }
-        }
-    }
-
-    #[test]
-    fn other_normalizers_being_kept() {
-        use tokenizers::normalizers::BertNormalizer;
-
-        let model = "hf-internal-testing/llama-tokenizer";
-        let normalizer = NormalizerWrapper::BertNormalizer(BertNormalizer::default());
-        let mut tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
-        tokenizer.with_normalizer(Some(normalizer));
-
-        Vocabulary::filter_prepend_normalizers(&mut tokenizer);
-
-        assert!(tokenizer.get_normalizer().is_some());
-    }
-}
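-- 
Illustration only, not part of the patch: with `Token = Vec<u8>`,
`Vocabulary::insert` accepts anything convertible into bytes, so plain
`&str` tokens keep working while partial UTF-8 sequences can be expressed
directly, as the multibyte test above does. A minimal sketch of the
byte-oriented usage, assuming only the public API exercised by the tests
(`Vocabulary::new`/`insert`, `Index::from_regex`, `allowed_tokens`); the
EOS token id and regex are made up for the example:

    use outlines_core::prelude::*;

    fn main() {
        // &str converts via the std `From<&str> for Vec<u8>` impl;
        // incomplete UTF-8 sequences are representable as explicit bytes.
        let vocabulary = Vocabulary::new(Some(3)) // 3 = hypothetical EOS token id
            .insert("😍", 0)
            .insert(vec![240, 159, 152], 1) // first three bytes of "😍"
            .insert(vec![141], 2); // trailing byte of "😍"

        let index = Index::from_regex("😍+", &vocabulary).expect("Index failed");

        // The DFA walks tokens byte by byte: token 1 alone reaches a live
        // intermediate state, and only token 2 completes the emoji from there.
        let allowed = index
            .allowed_tokens(index.initial())
            .expect("No allowed tokens");
        assert!(allowed.contains(&0));
    }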