Extend Vocabulary #88

Merged: 14 commits, Nov 19, 2024
6 changes: 5 additions & 1 deletion Cargo.toml
@@ -7,12 +7,16 @@ license = "Apache-2.0"
repository = "https://github.com/dottxt-ai/outlines-core"

[dependencies]
+once_cell = "1.20"
anyhow = "1.0.86"
thiserror = "1.0"
pyo3 = { version = "0.22.0", features = ["extension-module"], optional = true }
regex = "1.10.6"
serde-pyobject = "0.4.0"
-serde_json = { version = "1.0.125", features = ["preserve_order"] }
+serde_json = { version = "1.0", features = ["preserve_order"] }
+serde = { version = "1", features = ["derive"] }
+hf-hub = "=0.3.2"
+tokenizers = { version = "=0.20.0", features = ["http"] }

[features]
python-bindings = ["pyo3"]
37 changes: 36 additions & 1 deletion src/lib.rs
@@ -10,12 +10,47 @@ mod python_bindings;

use thiserror::Error;

-#[derive(Error, Debug)]
+#[derive(Error, Debug, PartialEq)]
pub enum Error {
    #[error("The vocabulary does not allow us to build a sequence that matches the input")]
    IndexError,
}
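
Deriving PartialEq here makes error values directly comparable, which is mainly useful in tests. A minimal sketch, with a hypothetical fallible helper:

// Hypothetical helper, for illustration only.
fn build_sequence() -> Result<(), Error> {
    Err(Error::IndexError)
}

#[test]
fn index_error_is_comparable() {
    assert_eq!(build_sequence().unwrap_err(), Error::IndexError);
}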

+#[derive(Error, Debug)]
+#[error("Tokenizer error")]
+pub struct TokenizerError(tokenizers::Error);
+
+impl PartialEq for TokenizerError {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.to_string() == other.0.to_string()
+    }
+}
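
The manual PartialEq above is needed because tokenizers::Error is a boxed trait object (the crate's Box<dyn Error + Send + Sync> alias), which cannot derive equality; comparing the rendered messages is a pragmatic substitute. A minimal sketch of the resulting semantics, using hypothetical message strings:

// Two TokenizerErrors compare equal exactly when their Display output matches.
#[test]
fn tokenizer_errors_compare_by_message() {
    let a = TokenizerError("mismatched vocab".into());
    let b = TokenizerError("mismatched vocab".into());
    assert_eq!(a, b);
}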

+#[derive(Error, Debug, PartialEq)]
+pub enum VocabularyError {
+    #[error("Unable to create tokenizer for {model}, source {source}")]
+    UnableToCreateTokenizer {
+        model: String,
+        source: TokenizerError,
+    },
+    #[error("Unable to locate EOS token for {model}")]
+    UnableToLocateEosTokenId { model: String },
+    #[error("Unable to process token")]
+    TokenProcessorError(#[from] TokenProcessorError),
+}

+#[derive(Error, Debug, PartialEq)]
+pub enum TokenProcessorError {
+    #[error("Tokenizer is not supported")]
+    UnsupportedTokenizer,
+    #[error("Decoder unpacking failed")]
+    DecoderUnpackingFailed,
+    #[error("Token processing failed for byte level processor")]
+    ByteProcessorFailed,
+    #[error("Token processing failed for byte fallback level processor")]
+    ByteFallbackProcessorFailed,
+}

#[cfg(feature = "python-bindings")]
impl From<Error> for pyo3::PyErr {
    fn from(e: Error) -> Self {
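
Because the TokenProcessorError variant carries #[from], thiserror also derives From<TokenProcessorError> for VocabularyError, so processor failures propagate with the ? operator. A minimal sketch, with hypothetical helper names:

// Hypothetical processor step that fails.
fn unpack_decoder() -> Result<(), TokenProcessorError> {
    Err(TokenProcessorError::DecoderUnpackingFailed)
}

// `?` converts the processor error into VocabularyError::TokenProcessorError
// through the #[from]-derived From impl.
fn build_vocabulary() -> Result<(), VocabularyError> {
    unpack_decoder()?;
    Ok(())
}

#[test]
fn processor_errors_convert() {
    assert_eq!(
        build_vocabulary().unwrap_err(),
        VocabularyError::TokenProcessorError(TokenProcessorError::DecoderUnpackingFailed)
    );
}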
6 changes: 0 additions & 6 deletions src/prelude.rs
@@ -2,9 +2,3 @@ pub use super::{
    primitives::{State, Token, TokenId, TransitionKey},
    vocabulary::Vocabulary,
};

-pub(crate) use std::{
-    collections::{HashMap, HashSet},
-    fmt::{self, Display},
-    ops::Deref,
-};
1 change: 1 addition & 0 deletions src/regex.rs
@@ -1,4 +1,5 @@
use crate::prelude::*;
+use std::collections::{HashMap, HashSet};

pub fn walk_fsm(
    fsm_transitions: &HashMap<(State, TransitionKey), State>,
133 changes: 0 additions & 133 deletions src/vocabulary.rs

This file was deleted.
