From 190e73f45cce7a50908701e69d147c48c74a8e07 Mon Sep 17 00:00:00 2001 From: erikkaum Date: Thu, 15 Aug 2024 18:30:41 +0200 Subject: [PATCH] basic outlines of repo structure with core in rust and python bindings --- .github/workflows/outlines_core_python_ci.yml | 169 ++++++++++++++ .gitignore | 1 + Cargo.lock | 176 +++++++++++++++ Cargo.toml | 7 + README.md | 15 ++ bindings/python/.gitignore | 72 ++++++ bindings/python/Cargo.toml | 13 ++ bindings/python/pyproject.toml | 15 ++ bindings/python/src/lib.rs | 53 +++++ justfile | 5 + outlines-core/Cargo.toml | 6 + outlines-core/src/lib.rs | 211 ++++++++++++++++++ 12 files changed, 743 insertions(+) create mode 100644 .github/workflows/outlines_core_python_ci.yml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 bindings/python/.gitignore create mode 100644 bindings/python/Cargo.toml create mode 100644 bindings/python/pyproject.toml create mode 100644 bindings/python/src/lib.rs create mode 100644 justfile create mode 100644 outlines-core/Cargo.toml create mode 100644 outlines-core/src/lib.rs diff --git a/.github/workflows/outlines_core_python_ci.yml b/.github/workflows/outlines_core_python_ci.yml new file mode 100644 index 00000000..0e66a341 --- /dev/null +++ b/.github/workflows/outlines_core_python_ci.yml @@ -0,0 +1,169 @@ +# This file is autogenerated by maturin v1.7.0 +# To update, run +# +# maturin generate-ci github +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + - runner: ubuntu-latest + target: s390x + - runner: ubuntu-latest + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: 
actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-12 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: "startsWith(github.ref, 'refs/tags/')" + needs: [linux, musllinux, windows, macos, sdist] + steps: + - uses: actions/download-artifact@v4 + - name: Publish to PyPI + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.gitignore b/.gitignore index 9add6d8c..73a1dcff 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ docs/build *.gguf .venv benchmarks/results +**/target/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..37ad0400 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,176 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "outlines-core" +version = "0.1.0" + +[[package]] +name = "outlines-core-python" +version = "0.1.0" +dependencies = [ + "outlines-core", + "pyo3", +] + +[[package]] +name = "portable-atomic" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..34af0dec --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[workspace] + members = [ + "outlines-core", + "bindings/python", + ] + + resolver = "2" diff --git a/README.md b/README.md index 769c87c1..93f4dc87 100644 --- a/README.md +++ b/README.md @@ -1 +1,16 @@ # Outlines-core + +## structure + +- `outlines-core/` can be consumed as an independent low-level package +- `bindings/` contains the API exposed to other languages, in this case only python + +## developing + +- build only the outlines-core package `cd outlines-core && cargo build` +- dev build of python bindings `cd bindings/python 
&& maturin develop`. If you have the conda `outlines-dev` environment activated, the `outlines_core` module is installed into that environment automatically + +There's also a [justfile](https://github.com/casey/just) to make running these easier: +- `just dev-core` +- `just dev-python` + diff --git a/bindings/python/.gitignore b/bindings/python/.gitignore new file mode 100644 index 00000000..c8f04429 --- /dev/null +++ b/bindings/python/.gitignore @@ -0,0 +1,72 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml new file mode 100644 index 00000000..b85017fe --- /dev/null +++ b/bindings/python/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "outlines-core-python" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "outlines_core" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = "0.22.0" +outlines-core = { path = "../../outlines-core" } diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml new file mode 100644 index 00000000..3e319949 --- /dev/null +++ b/bindings/python/pyproject.toml @@ -0,0 +1,15 @@ +[build-system] +requires = ["maturin>=1.7,<2.0"] +build-backend = "maturin" + +[project] +name = "outlines_core" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dynamic = ["version"] +[tool.maturin] +features = ["pyo3/extension-module"] diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs new file mode 100644 index 00000000..73243893 --- /dev/null +++ b/bindings/python/src/lib.rs @@ -0,0 +1,53 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use ::outlines_core as core_lib; +use pyo3::types::{PyAnyMethods, PyDict, PyModule, PyModuleMethods, PySet}; +use pyo3::{pyfunction, pymodule, wrap_pyfunction, Bound, PyResult}; + +#[pyfunction] +/// Create an FSM index end-to-end. +/// +/// Args: +/// fsm_transitions (Dict[Tuple[int, int], int]): FSM transitions mapping. +/// alphabet_symbol_mapping (Dict[str, int]): Alphabet symbol mapping. +/// alphabet_anything_value (int): Value representing 'anything' in the alphabet. +/// fsm_initial (int): Initial state of the FSM. +/// fsm_finals (Set[int]): Set of final states in the FSM. +/// vocabulary (Dict[str, List[int]]): Vocabulary mapping. +/// +/// Returns: +/// Dict[int, Set[Tuple[int, int]]]: The created FSM index. +/// +/// Raises: +/// ValueError: If the input types are incorrect or conversion fails. 
+fn create_fsm_index_end_to_end( + fsm_transitions: Bound<PyDict>, + alphabet_symbol_mapping: Bound<PyDict>, + alphabet_anything_value: i32, + fsm_initial: i32, + fsm_finals: Bound<PySet>, + vocabulary: Bound<PyDict>, +) -> PyResult<BTreeMap<i32, BTreeSet<(i32, i32)>>> { + let fsm_transitions_map = fsm_transitions.extract::<BTreeMap<(i32, i32), i32>>()?; + let alphabet_symbol_mapping_map = alphabet_symbol_mapping.extract::<BTreeMap<char, i32>>()?; + let fsm_finals_set = fsm_finals.extract::<BTreeSet<i32>>()?; + let vocabulary_map = vocabulary.extract::<BTreeMap<String, Vec<i32>>>()?; + + let res = core_lib::create_fsm_index_end_to_end_rust( + &fsm_transitions_map, + &alphabet_symbol_mapping_map, + alphabet_anything_value, + fsm_initial, + &fsm_finals_set, + &vocabulary_map, + ); + + Ok(res) +} + +/// Outlines is a Generative Model Programming Framework. +#[pymodule] +fn outlines_core(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(create_fsm_index_end_to_end, m)?)?; + Ok(()) +} diff --git a/justfile b/justfile new file mode 100644 index 00000000..d2092db2 --- /dev/null +++ b/justfile @@ -0,0 +1,5 @@ +dev-core: + cd outlines-core && cargo build + +dev-python: + cd bindings/python && maturin develop \ No newline at end of file diff --git a/outlines-core/Cargo.toml b/outlines-core/Cargo.toml new file mode 100644 index 00000000..9043204e --- /dev/null +++ b/outlines-core/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "outlines-core" +version = "0.1.0" +edition = "2021" + +[dependencies] diff --git a/outlines-core/src/lib.rs b/outlines-core/src/lib.rs new file mode 100644 index 00000000..14e3dbb2 --- /dev/null +++ b/outlines-core/src/lib.rs @@ -0,0 +1,211 @@ +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use std::thread; + +pub fn create_fsm_index_end_to_end_rust( + fsm_transitions: &BTreeMap<(i32, i32), i32>, + alphabet_symbol_mapping: &BTreeMap<char, i32>, + alphabet_anything_value: i32, + fsm_initial: i32, + fsm_finals: &BTreeSet<i32>, + vocabulary: &BTreeMap<String, Vec<i32>>, +) -> BTreeMap<i32, BTreeSet<(i32, i32)>> { + let mut states_to_token_subsets: BTreeMap<i32, BTreeMap<i32, i32>> = BTreeMap::new(); + let mut seen: BTreeSet<i32> = BTreeSet::new(); + let mut next_states = vec![fsm_initial]; + + // TODO adjust dynamically + let n_threads = 16; + + let tokens: Vec<String> = vocabulary.keys().cloned().collect(); + let _token_ids: Vec<Vec<i32>> = vocabulary.values().cloned().collect(); + + let n_tokens = tokens.len(); + let _chunk_size = n_tokens / n_threads; + + let _start_time = std::time::Instant::now(); + + while let Some(start_state) = next_states.pop() { + let _start = std::time::Instant::now(); + let token_ids_end_states = state_scan_tokens( + fsm_transitions, + alphabet_symbol_mapping, + alphabet_anything_value, + fsm_initial, + fsm_finals, + vocabulary, + start_state, + ); + for token_id_and_end_state in token_ids_end_states { + let end_state = token_id_and_end_state.1; + if !seen.contains(&end_state) { + next_states.push(end_state); + } + states_to_token_subsets + .entry(start_state) + .or_default() + .insert(token_id_and_end_state.0, token_id_and_end_state.1); + } + // println!("state_scan_tokens: {:?}", start.elapsed()); + seen.insert(start_state); + } + + let mut states_to_token_subsets_btree: BTreeMap<i32, BTreeSet<(i32, i32)>> = BTreeMap::new(); + + for (k, v) in states_to_token_subsets.iter() { + let token_subsets: BTreeSet<(i32, i32)> = v.iter().map(|(&k1, &v1)| (k1, v1)).collect(); + states_to_token_subsets_btree.insert(*k, token_subsets); + } + + states_to_token_subsets_btree +} + +fn walk_fsm( + fsm_transitions: &BTreeMap<(i32, i32), i32>, + alphabet_symbol_mapping: &BTreeMap<char, i32>, + alphabet_anything_value: i32, + _fsm_initial: i32, + fsm_finals: &BTreeSet<i32>, + input_string: &str, +
start_state: i32, + full_match: bool, +) -> Vec<i32> { + let mut state = start_state; + let mut accepted_states = Vec::new(); + let mut is_final_state_reached = false; + + for symbol in input_string.chars() { + let trans_key = alphabet_symbol_mapping + .get(&symbol) + .unwrap_or(&alphabet_anything_value); + if let Some(&new_state) = fsm_transitions.get(&(state, *trans_key)) { + state = new_state; + if fsm_finals.contains(&state) { + is_final_state_reached = true; + } + accepted_states.push(state); + } else { + // Exit early if not full match and a final state was reached before + if !full_match && is_final_state_reached { + break; + } + return Vec::new(); + } + } + + if full_match && !is_final_state_reached { + return Vec::new(); + } + + accepted_states +} + +fn state_scan_tokens( + fsm_transitions_map: &BTreeMap<(i32, i32), i32>, + alphabet_symbol_mapping_map: &BTreeMap<char, i32>, + alphabet_anything_value: i32, + fsm_initial: i32, + fsm_finals_set: &BTreeSet<i32>, + vocabulary_map: &BTreeMap<String, Vec<i32>>, + start_state: i32, +) -> Vec<(i32, i32)> { + let _start_time = std::time::Instant::now(); + + // TODO choose dynamically + let mut n_threads = 16; + + // Split the vocabulary into two parallel vectors: tokens and their token ids + let mut tokens = Vec::new(); + let mut token_ids = Vec::new(); + + for (k, v) in vocabulary_map.iter() { + tokens.push(k.clone()); + token_ids.push(v.clone()); + } + + let n_tokens = tokens.len(); + let _chunk_size = n_tokens / n_threads; + + // Prepare for multithreading + let fsm_transitions_map_arc = Arc::new(fsm_transitions_map); + let alphabet_symbol_mapping_map_arc = Arc::new(alphabet_symbol_mapping_map); + let fsm_finals_set_arc = Arc::new(fsm_finals_set); + let tokens_arc = Arc::new(tokens); + let token_ids_arc = Arc::new(token_ids); + + let mut token_chunks = Vec::new(); + let tokens_per_thread = (n_tokens as f32 / n_threads as f32).ceil() as usize; + + if n_tokens > 1000 { + for i in 0..n_threads { + let start = i * tokens_per_thread; + let mut end = start + tokens_per_thread; + + // Make sure we don't go out of bounds on the last chunk + if end > n_tokens { + end = n_tokens; + } + + // Only add chunks that have data to process + if start < n_tokens { + token_chunks.push((start, end)); + } + } + } else { + n_threads = 1; + token_chunks.push((0, n_tokens)); + } + + let all_outputs: Vec<Vec<(i32, i32)>> = thread::scope(|s| { + (0..n_threads) + .map(|thread_id| { + let _start_time = std::time::Instant::now(); + + let start = token_chunks[thread_id].0; + let end = token_chunks[thread_id].1; + + let token_chunk = tokens_arc[start..end].to_vec(); + let token_ids_chunk = token_ids_arc[start..end].to_vec(); + + let fsm_transitions_map_clone = Arc::clone(&fsm_transitions_map_arc); + let alphabet_symbol_mapping_map_clone = + Arc::clone(&alphabet_symbol_mapping_map_arc); + let fsm_finals_set_clone = Arc::clone(&fsm_finals_set_arc); + let _token_ids_arc_clone = Arc::clone(&token_ids_arc); + s.spawn(move || { + let mut res = Vec::new(); + // zip the token_chunk with the token_ids_chunk + for i in 0..token_chunk.len() { + let token = &token_chunk[i]; + let token_ids = &token_ids_chunk[i]; + let state_seq = walk_fsm( + &fsm_transitions_map_clone, + &alphabet_symbol_mapping_map_clone, + alphabet_anything_value, + fsm_initial, + &fsm_finals_set_clone, + token, + start_state, + false, + ); + if state_seq.len() < token.len() { + continue; + } + + for token_id in token_ids { + res.push((*token_id, state_seq[state_seq.len() - 1])); + } + } + res + }) + }) + .collect::<Vec<_>>() + .into_iter() + // wait for each thread to finish and collect their 
results + .map(|handle| handle.join().expect("Thread failed")) + .collect::<Vec<_>>() + }); + let res = all_outputs.into_iter().flatten().collect(); + res +}
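
Usage sketch (not part of the patch): after `just dev-python` (or `cd bindings/python && maturin develop`), the new `outlines_core` extension module exposes `create_fsm_index_end_to_end` with the argument layout documented in the binding's docstring. The three-state FSM, symbol classes, and token IDs below are invented purely for illustration; the commented output is what this toy input yields under `create_fsm_index_end_to_end_rust`.

```python
# Toy FSM that accepts exactly "ab": state 0 --a--> 1 --b--> 2, with state 2 final.
# Names mirror the binding's signature; all concrete values are made up.
import outlines_core

fsm_transitions = {(0, 0): 1, (1, 1): 2}         # (state, symbol class) -> next state
alphabet_symbol_mapping = {"a": 0, "b": 1}       # single-char symbol -> symbol class
alphabet_anything_value = 2                      # class assigned to unmapped symbols
fsm_initial = 0
fsm_finals = {2}
vocabulary = {"a": [10], "ab": [11], "b": [12]}  # token string -> token ids

index = outlines_core.create_fsm_index_end_to_end(
    fsm_transitions,
    alphabet_symbol_mapping,
    alphabet_anything_value,
    fsm_initial,
    fsm_finals,
    vocabulary,
)

# Maps each scanned FSM state to the (token_id, end_state) pairs reachable from it,
# e.g. {0: {(10, 1), (11, 2)}, 1: {(12, 2)}} for this toy input.
print(index)
```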