Skip to content

Commit

Permalink
some doccomment stubs I wrote along the way
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Jan 14, 2025
1 parent 63d807a commit 7b60b05
Show file tree
Hide file tree
Showing 15 changed files with 96 additions and 0 deletions.
9 changes: 9 additions & 0 deletions divvunspell/src/archive/boxf.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Speller archives stored in box files.
use std::sync::Arc;

use box_format::BoxFileReader;
Expand All @@ -18,28 +19,35 @@ use crate::transducer::{
use crate::vfs::boxf::Filesystem as BoxFilesystem;
use crate::vfs::Filesystem;

/// A [`BoxSpellerArchive`] whose language model and error model are both
/// memory-mapped THFST transducers ([`MemmapThfstTransducer`]) over
/// [`crate::vfs::boxf::File`].
pub type ThfstBoxSpellerArchive = BoxSpellerArchive<
MemmapThfstTransducer<crate::vfs::boxf::File>,
MemmapThfstTransducer<crate::vfs::boxf::File>,
>;

/// An [`HfstSpeller`] over [`crate::vfs::boxf::File`] whose language model and
/// error model are both memory-mapped chunked THFST transducers
/// ([`MemmapThfstChunkedTransducer`]).
///
/// This alias names the speller itself, not an archive containing it.
pub type ThfstChunkedBoxSpeller = HfstSpeller<
crate::vfs::boxf::File,
MemmapThfstChunkedTransducer<crate::vfs::boxf::File>,
MemmapThfstChunkedTransducer<crate::vfs::boxf::File>,
>;

/// An [`HfstSpeller`] over [`crate::vfs::boxf::File`] whose language model and
/// error model are both memory-mapped THFST transducers
/// ([`MemmapThfstTransducer`]).
///
/// This alias names the speller itself, not an archive containing it.
pub type ThfstBoxSpeller = HfstSpeller<
crate::vfs::boxf::File,
MemmapThfstTransducer<crate::vfs::boxf::File>,
MemmapThfstTransducer<crate::vfs::boxf::File>,
>;

/// A [`BoxSpellerArchive`] whose language model and error model are both
/// memory-mapped chunked THFST transducers
/// ([`MemmapThfstChunkedTransducer`]) over [`crate::vfs::boxf::File`].
pub type ThfstChunkedBoxSpellerArchive = BoxSpellerArchive<
MemmapThfstChunkedTransducer<crate::vfs::boxf::File>,
MemmapThfstChunkedTransducer<crate::vfs::boxf::File>,
>;

/// Speller in box archive.
pub struct BoxSpellerArchive<T, U>
where
T: Transducer<crate::vfs::boxf::File>,
Expand All @@ -54,6 +62,7 @@ where
T: Transducer<crate::vfs::boxf::File> + Send + Sync + 'static,
U: Transducer<crate::vfs::boxf::File> + Send + Sync + 'static,
{
/// Returns a new shared handle to the spell-checking component
/// ([`HfstSpeller`]) held by this archive.
///
/// Only the reference-counted pointer is cloned; the speller itself is
/// shared, not copied.
pub fn hfst_speller(&self) -> Arc<HfstSpeller<crate::vfs::boxf::File, T, U>> {
    Arc::clone(&self.speller)
}
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/archive/error.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Archive-related errors.
use std::{ffi::OsString, io::Error};

#[cfg(feature = "gpt2")]
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/archive/meta.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Archive metadata handling
use serde::{Deserialize, Serialize};
use serde_xml_rs::{from_reader, Error, ParserConfig};

Expand Down
10 changes: 10 additions & 0 deletions divvunspell/src/archive/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Handling of archives of spell-checking models.
use memmap2::Mmap;
use std::{ffi::OsString, path::Path, sync::Arc};

Expand Down Expand Up @@ -38,24 +39,33 @@ impl MmapRef {
}
}

/// A speller archive: a file that is read into a spell-checker together with
/// its (optional) metadata.
pub trait SpellerArchive {
/// Reads and parses the speller archive at `path`.
///
/// # Errors
///
/// Returns a [`SpellerArchiveError`] when the archive cannot be opened.
fn open(path: &Path) -> Result<Self, SpellerArchiveError>
where
Self: Sized;

/// Returns a shared handle to the spell-checker of this archive.
fn speller(&self) -> Arc<dyn Speller + Send + Sync>;
/// Returns the metadata of this archive, if any.
fn metadata(&self) -> Option<&SpellerMetadata>;
}

/// A predictor archive: a file that is read into a predictor together with
/// its (optional) metadata.
pub trait PredictorArchive {
/// Reads and parses the predictor archive at `path`.
///
/// `predictor_name` optionally names the predictor to use.
/// NOTE(review): what happens when it is `None` is decided by the
/// implementor and is not visible here; confirm against the implementations.
///
/// # Errors
///
/// Returns a [`PredictorArchiveError`] when the archive cannot be opened.
fn open(path: &Path, predictor_name: Option<&str>) -> Result<Self, PredictorArchiveError>
where
Self: Sized;

/// Returns a shared handle to the predictor of this archive.
fn predictor(&self) -> Arc<dyn Predictor + Send + Sync>;
/// Returns the metadata of this archive, if any.
fn metadata(&self) -> Option<&PredictorMetadata>;
}

/// Reads a speller archive.
pub fn open<P>(path: P) -> Result<Arc<dyn SpellerArchive + Send + Sync>, SpellerArchiveError>
where
P: AsRef<Path>,
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/archive/zip.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Speller archives stored in zip files.
use ::zip::{CompressionMethod, ZipArchive};
use memmap2::MmapOptions;
use std::fs::File;
Expand Down
29 changes: 29 additions & 0 deletions divvunspell/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
/*! Spell-checking and correction with Finite-State Automata.

Implements spell-checking and correction using weighted finite-state
automata. The automata can be compiled using [`HFST`]; this library is
originally based on the C++ code in [`HFST ospell`].

[`HFST`]: https://hfst.github.io
[`HFST ospell`]: https://github.com/hfst/hfst-ospell

# Usage examples

```ignore
// Sketch only: this example is not compiled or run as a doctest yet.
use divvunspell::archive::ZipSpellerArchive;
use std::path::Path;

let path = Path::new("path/to/speller/archive");
let speller = ZipSpellerArchive::open(path);
let cfg = SpellerConfig::default();
let words = vec!["words", "schmords"];
todo!();
```

Further examples of how to use the divvunspell library can be found in
[`divvunspell-bin`] in the same repository.

[`divvunspell-bin`]: https://github.com/divvun/divvunspell
*/

#![warn(missing_docs)]
pub mod archive;
#[cfg(feature = "internal_ffi")]
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/paths.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Handling of system paths containing spell-checkers on different OS.
#[cfg(target_os = "macos")]
use std::path::PathBuf;
#[cfg(target_os = "windows")]
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/predictor/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Autocorrect type spell-checking that predicts next word.
#[cfg(feature = "gpt2")]
pub mod gpt2;

Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/speller/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Speller model for spell-checking and corrections.
use std::f32;
use std::sync::Arc;

Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/speller/suggestion.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Suggestion for a spelling correction.
use crate::types::Weight;
use serde::{Deserialize, Serialize};
use smol_str::SmolStr;
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! The tokenizer splits strings into words and punctuation.
use unic_ucd_common::alphanumeric::is_alphanumeric;
use word::{WordBoundIndices, Words};

Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/transducer/hfst/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Finite-state automaton in HFST format.
pub mod alphabet;
pub mod header;
pub mod index_table;
Expand Down
35 changes: 35 additions & 0 deletions divvunspell/src/transducer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
//! A transducer is a finite-state automaton with two tapes, i.e. two symbols
//! per transition.
//!
//! The transducer in divvunspell is modeled after the C++ transducer in the
//! hfst-ospell library. It may contain some complex optimisations and
//! details specific to the underlying finite-state systems, and a lot of
//! this is pretty hacky.
pub mod hfst;
pub mod thfst;

Expand All @@ -11,17 +18,22 @@ use self::symbol_transition::SymbolTransition;
use crate::types::{SymbolNumber, TransitionTableIndex, Weight};
use crate::vfs::{self, Filesystem};

/// Errors that can occur while reading or processing a transducer.
#[derive(Debug, thiserror::Error)]
pub enum TransducerError {
/// Memory mapping failed; wraps the underlying I/O error.
#[error("Memory mapping error")]
Memmap(#[source] std::io::Error),
/// An input/output operation failed; wraps the underlying I/O error.
#[error("IO error")]
Io(#[source] std::io::Error),
/// Something was wrong with the automaton's alphabet; wraps the boxed
/// source error.
#[error("Alphabet error")]
Alphabet(#[source] Box<dyn std::error::Error + Send + Sync>),
}

impl TransducerError {
/// Wrap into i/o error.
pub fn into_io_error(self) -> std::io::Error {
match self {
TransducerError::Memmap(v) => v,
Expand All @@ -33,53 +45,76 @@ impl TransducerError {
}
}

/// A file-based finite-state transducer.
///
/// `F` is the file type of the virtual file system the transducer is read
/// from (see [`Transducer::from_path`]).
pub trait Transducer<F: vfs::File>: Sized {
/// File extension associated with this transducer type.
const FILE_EXT: &'static str;

/// Reads a transducer from `path` on the file system `fs`.
///
/// # Errors
///
/// Returns a [`TransducerError`] when the transducer cannot be read.
fn from_path<P, FS>(fs: &FS, path: P) -> Result<Self, TransducerError>
where
P: AsRef<std::path::Path>,
FS: Filesystem<File = F>;

/// Returns the transducer's alphabet.
fn alphabet(&self) -> &TransducerAlphabet;
/// Returns the transducer's alphabet as a mutable reference.
fn mut_alphabet(&mut self) -> &mut TransducerAlphabet;

/// Returns the input symbol number of the transition arc at index `i`,
/// if any.
fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber>;
/// Checks whether there are transitions at index `i` for the optional
/// symbol `s`.
/// NOTE(review): the meaning of `s == None` is defined by the implementor
/// and is not visible here.
fn has_transitions(&self, i: TransitionTableIndex, s: Option<SymbolNumber>) -> bool;
/// Returns the index of the next transition from `i` with `symbol`, if any.
fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option<TransitionTableIndex>;
/// Checks whether there are free (epsilon or flag) transitions at index `i`.
fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool;
/// Follows a free (epsilon or flag) transition at index `i`, if any.
fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option<SymbolTransition>;
/// Follows an epsilon transition at index `i`, if any.
fn take_epsilons(&self, i: TransitionTableIndex) -> Option<SymbolTransition>;
/// Follows a transition at index `i` with the given `symbol`, if any.
fn take_non_epsilons(
&self,
i: TransitionTableIndex,
symbol: SymbolNumber,
) -> Option<SymbolTransition>;
/// Checks whether index `i` is an end (final) state.
fn is_final(&self, i: TransitionTableIndex) -> bool;
/// Returns the end-state weight at index `i`, if any.
fn final_weight(&self, i: TransitionTableIndex) -> Option<Weight>;
}

/// Transition table: contains the arcs of the automaton (and its states).
///
/// `F` is the file type of the virtual file system the table is read from
/// (see [`TransitionTable::from_path`]).
pub trait TransitionTable<F: vfs::File>: Sized {
    /// Reads a transition table from `path` on the file system `fs`.
    ///
    /// # Errors
    ///
    /// Returns a [`TransducerError`] when the table cannot be read.
    fn from_path<P, FS>(fs: &FS, path: P) -> Result<Self, TransducerError>
    where
        P: AsRef<std::path::Path>,
        FS: Filesystem<File = F>;
    /// Returns the input symbol of the transition at index `i`, if any.
    fn input_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber>;
    /// Returns the output symbol of the transition at index `i`, if any.
    fn output_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber>;
    /// Returns the target of the transition at index `i`, if any.
    fn target(&self, i: TransitionTableIndex) -> Option<TransitionTableIndex>;
    /// Returns the weight of the transition at index `i`, if any.
    fn weight(&self, i: TransitionTableIndex) -> Option<Weight>;

    /// Checks whether the entry at index `i` marks a final state.
    ///
    /// An entry is final when it has neither an input nor an output symbol
    /// and its target is `Some(1)`.
    #[inline(always)]
    fn is_final(&self, i: TransitionTableIndex) -> bool {
        self.input_symbol(i).is_none()
            && self.output_symbol(i).is_none()
            && self.target(i) == Some(1)
    }

    /// Builds a [`SymbolTransition`] from the target, output symbol and
    /// weight of the transition at index `i`.
    #[inline(always)]
    fn symbol_transition(&self, i: TransitionTableIndex) -> SymbolTransition {
        SymbolTransition::new(self.target(i), self.output_symbol(i), self.weight(i))
    }
}

/// Index table contains something.
pub trait IndexTable<F: vfs::File>: Sized {
fn from_path<P, FS>(fs: &FS, path: P) -> Result<Self, TransducerError>
where
Expand Down
1 change: 1 addition & 0 deletions divvunspell/src/transducer/thfst/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Finite-state automaton in optimised mmapped format.
// We manually ensure alignment of reads in this file.
#![allow(clippy::cast_ptr_alignment)]

Expand Down
3 changes: 3 additions & 0 deletions divvunspell/src/vfs.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! File system abstraction over the operating system's files and box archives.
use fs_extra::dir::CopyOptions;
use memmap2::{Mmap, MmapOptions};
use std::fmt::Debug;
Expand Down Expand Up @@ -56,6 +57,7 @@ impl File for std::fs::File {
}
}

/// Unit type that implements [`Filesystem`] (see the `impl Filesystem for Fs`
/// in this module).
///
/// NOTE(review): presumably this is the host operating system's file system,
/// given the `File` implementation for `std::fs::File` in this module; confirm.
pub struct Fs;

impl Filesystem for Fs {
Expand All @@ -74,6 +76,7 @@ impl Filesystem for Fs {
}
}

/// File system implementation backed by a box archive.
pub mod boxf {
use box_format::{BoxFileReader, BoxPath};
use std::io::{Read, Result};
Expand Down

0 comments on commit 7b60b05

Please sign in to comment.