Extend Vocabulary #88

Merged: 14 commits, Nov 19, 2024
2 changes: 1 addition & 1 deletion .gitignore
@@ -11,7 +11,7 @@ benchmarks/results
# Remove doc build folders
.cache/
build/

rust-coverage/
target/
*.so
*.pyd
12 changes: 10 additions & 2 deletions Cargo.toml
@@ -7,12 +7,17 @@ license = "Apache-2.0"
repository = "https://github.com/dottxt-ai/outlines-core"

[dependencies]
once_cell = "1.20"
anyhow = "1.0.86"
thiserror = "1.0"
thiserror = "2.0"
pyo3 = { version = "0.22.0", features = ["extension-module"], optional = true }
regex = "1.10.6"
serde-pyobject = "0.4.0"
serde_json = { version = "1.0.125", features = ["preserve_order"] }
serde_json = { version = "1.0", features = ["preserve_order"] }
serde = {version = "1.0", features = ["derive"]}
# Fragile dependencies, minor updates often break the code
hf-hub = "=0.3.2"
tokenizers = { version = "=0.20.3", features = ["http"] }

[features]
python-bindings = ["pyo3"]
@@ -31,3 +36,6 @@ panic = 'abort'
[package.metadata.scripts]
build-python-extension = "python setup.py build_rust --inplace --debug"
build-python-extension-release = "python setup.py build_rust --inplace --release"

[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tarpaulin_include)'] }
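
For context, a minimal sketch (not part of this diff) of the kind of code that motivates the [lints.rust] entry above: cargo-tarpaulin recognizes a custom tarpaulin_include cfg for excluding items from coverage, and registering it via check-cfg keeps rustc's unexpected_cfgs lint from warning about an unknown configuration name.

// Hypothetical example only: an item excluded from coverage reports.
// It still compiles normally, because `tarpaulin_include` is never set
// as an actual cfg; tarpaulin just reads the attribute.
#[cfg(not(tarpaulin_include))]
fn coverage_excluded_helper() {
    // Skipped when computing line coverage.
}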
23 changes: 22 additions & 1 deletion Makefile
@@ -1,8 +1,9 @@
# Optional target to test/benchmark.
TARGET ?=
TARPAULIN_INSTALLED := $(shell command -v cargo-tarpaulin > /dev/null && echo 1 || echo 0)

.ONESHELL:
.PHONY: venv setup install install-release build-extension-debug build-extension-release watch-extension watch-extension-release pcc test test-rust test-python bench pybench doc dist clean check-clean-git
.PHONY: venv setup install install-release build-extension-debug build-extension-release watch-extension watch-extension-release pcc test test-rust test-python bench pybench doc dist clean check-clean-git check-tarpaulin test-rust-cov
.SILENT:

# Create a fresh virtual environment with the latest pip.
@@ -59,6 +60,26 @@ test-python: build-extension-debug
--cov=outlines_core \
--cov-report=term-missing:skip-covered

# Check if tarpaulin needs to be installed first.
check-tarpaulin:
ifeq ($(TARPAULIN_INSTALLED), 0)
	@echo "cargo-tarpaulin is not found, installing..."
	cargo install cargo-tarpaulin
else
	@echo "cargo-tarpaulin is already installed"
endif

# Run rust tests with coverage report.
test-rust-cov: check-tarpaulin
	RUSTFLAGS="-C instrument-coverage" cargo tarpaulin \
		--out=Lcov \
		--output-dir=rust-coverage \
		--engine=llvm \
		--exclude-files=src/python_bindings/* \
		--no-dead-code \
		--workspace \
		--verbose

# Run rust benchmarks.
bench:
ifeq ($(TARGET),)
1 change: 1 addition & 0 deletions pyproject.toml
@@ -80,6 +80,7 @@ filterwarnings = [
"error",
"ignore::pydantic.warnings.PydanticDeprecatedSince20",
"ignore::UserWarning",
"ignore::DeprecationWarning",
]
addopts = [
"--import-mode=importlib"
41 changes: 41 additions & 0 deletions src/error.rs
@@ -0,0 +1,41 @@
use thiserror::Error;

pub type Result<T, E = crate::Error> = std::result::Result<T, E>;

#[derive(Error, Debug)]
#[error("{0}")]
pub struct TokenizersError(pub tokenizers::Error);

impl PartialEq for TokenizersError {
    fn eq(&self, other: &Self) -> bool {
        self.0.to_string() == other.0.to_string()
    }
}

#[derive(Error, Debug, PartialEq)]
pub enum Error {
#[error("The vocabulary does not allow us to build a sequence that matches the input")]
IndexError,
    #[error(transparent)]
    TokenizersError(#[from] TokenizersError),
    #[error("Unsupported tokenizer for {model}: {reason}, please open an issue with the full error message: https://github.com/dottxt-ai/outlines-core/issues")]
    UnsupportedTokenizer { model: String, reason: String },
    #[error("Unable to locate EOS token for {model}")]
    UnableToLocateEosTokenId { model: String },
    #[error("Tokenizer is not supported by token processor")]
    UnsupportedByTokenProcessor,
    #[error("Decoder unpacking failed for token processor")]
    DecoderUnpackingFailed,
    #[error("Token processing failed for byte level processor")]
    ByteProcessorFailed,
    #[error("Token processing failed for byte fallback level processor")]
    ByteFallbackProcessorFailed,
}

#[cfg(feature = "python-bindings")]
impl From<Error> for pyo3::PyErr {
    fn from(e: Error) -> Self {
        use pyo3::{exceptions::PyValueError, PyErr};
        PyErr::new::<PyValueError, _>(e.to_string())
    }
}
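
A small usage sketch (hypothetical, not taken from this diff) of how the new error plumbing is meant to fit together: a tokenizers::Error gets wrapped in TokenizersError, and the #[from] conversion lets the ? operator turn it into the crate-wide Error, so callers only see the Result alias defined above. The helper name and signature are assumptions.

use crate::error::{Result, TokenizersError};

// Illustrative helper, not part of the PR.
fn load_tokenizer(model: &str) -> Result<tokenizers::Tokenizer> {
    // `from_pretrained` is available because the tokenizers crate is built
    // with its `http` feature (see Cargo.toml above).
    let tokenizer = tokenizers::Tokenizer::from_pretrained(model, None)
        .map_err(TokenizersError)?; // TokenizersError -> Error via #[from]
    Ok(tokenizer)
}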
5 changes: 2 additions & 3 deletions src/index.rs
@@ -2,10 +2,9 @@
use crate::prelude::{State, TransitionKey};
use crate::regex::{get_vocabulary_transition_keys, state_scan_tokens};
use crate::vocabulary::Vocabulary;
use crate::{Error, Result};
use std::collections::{HashMap, HashSet};

pub type Result<T, E = crate::Error> = std::result::Result<T, E>;

#[derive(Debug)]
pub struct FSMInfo {
pub(crate) initial: State,
@@ -101,7 +100,7 @@ impl Index {
eos_token_id,
})
} else {
Err(crate::Error::IndexError)
Err(Error::IndexError)
}
}

19 changes: 3 additions & 16 deletions src/lib.rs
@@ -1,25 +1,12 @@
pub mod error;
pub mod index;
pub mod json_schema;
pub mod prelude;
pub mod primitives;
pub mod regex;
pub mod vocabulary;

#[cfg(feature = "python-bindings")]
mod python_bindings;

use thiserror::Error;

#[derive(Error, Debug)]
pub enum Error {
#[error("The vocabulary does not allow us to build a sequence that matches the input")]
IndexError,
}
pub use error::{Error, Result};

#[cfg(feature = "python-bindings")]
impl From<Error> for pyo3::PyErr {
    fn from(e: Error) -> Self {
        use pyo3::{exceptions::PyValueError, PyErr};
        PyErr::new::<PyValueError, _>(e.to_string())
    }
}
mod python_bindings;
6 changes: 0 additions & 6 deletions src/prelude.rs
@@ -2,9 +2,3 @@ pub use super::{
    primitives::{State, Token, TokenId, TransitionKey},
    vocabulary::Vocabulary,
};

pub(crate) use std::{
    collections::{HashMap, HashSet},
    fmt::{self, Display},
    ops::Deref,
};
1 change: 1 addition & 0 deletions src/regex.rs
@@ -1,4 +1,5 @@
use crate::prelude::*;
use std::collections::{HashMap, HashSet};

pub fn walk_fsm(
fsm_transitions: &HashMap<(State, TransitionKey), State>,
133 changes: 0 additions & 133 deletions src/vocabulary.rs

This file was deleted.
