From 64e1ee3ab44065b1d58ea00a9ef822cdbf59ab96 Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Thu, 29 Feb 2024 14:07:11 +0900
Subject: [PATCH 1/4] allow string literals as segmentation modes

---
 .gitignore                                 |   1 +
 Cargo.lock                                 |   1 +
 python/Cargo.toml                          |   1 +
 python/py_src/sudachipy/sudachipy.pyi      |  23 ++++-
 python/src/dictionary.rs                   |  32 +++++--
 python/src/morpheme.rs                     |   9 +-
 python/src/tokenizer.rs                    | 104 ++++++++++++++-------
 python/tests/test_tokenizer.py             |  31 +++++-
 sudachi/src/analysis/stateful_tokenizer.rs |   5 +
 9 files changed, 153 insertions(+), 54 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2ae048dc..92d7f72f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ README*.html
 python/dist/
 __pycache__/
 .env
+.venv
 *.egg-info
 *.so
 python/py_src/sudachipy/*.pyd
diff --git a/Cargo.lock b/Cargo.lock
index cc202f2b..813c2f27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -932,6 +932,7 @@ name = "sudachipy"
 version = "0.6.9-a1"
 dependencies = [
  "pyo3",
+ "scopeguard",
  "sudachi",
  "thread_local",
 ]
diff --git a/python/Cargo.toml b/python/Cargo.toml
index f3cf24e5..ae321108 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -15,6 +15,7 @@ crate-type = ["cdylib"]
 [dependencies]
 pyo3 = { version = "0.20", features = ["extension-module"] }
 thread_local = "1.1" # Apache 2.0/MIT
+scopeguard = "1" # Apache 2.0/MIT
 
 [dependencies.sudachi]
 path = "../sudachi"
diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
index 08f45bcb..3028bf0e 100644
--- a/python/py_src/sudachipy/sudachipy.pyi
+++ b/python/py_src/sudachipy/sudachipy.pyi
@@ -1,5 +1,5 @@
 from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set
-from sudachipy.config import Config
+from .config import Config
 
 POS = Tuple[str, str, str, str, str, str] # POS element
@@ -32,7 +32,12 @@ class SplitMode:
     B: ClassVar[SplitMode] = ...
     C: ClassVar[SplitMode] = ...
     @classmethod
-    def __init__(cls) -> None: ...
+    def __init__(cls, mode: str = "C") -> None:
+        """
+        Creates a split mode from a string value
+        :param mode: string representation of the split mode
+        """
+        ...
 
 
 class Dictionary:
@@ -65,7 +70,7 @@ class Dictionary:
         ...
 
     def create(self,
-               mode: SplitMode = SplitMode.C,
+               mode: Union[SplitMode, Literal["A"], Literal["B"], Literal["C"]] = SplitMode.C,
                fields: FieldSet = None,
                *,
                projection: str = None) -> Tokenizer:
@@ -191,7 +196,7 @@ class Morpheme:
         """
         ...
 
-    def split(self, mode: SplitMode, out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList:
+    def split(self, mode: Union[SplitMode, Literal["A", "B", "C"]], out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList:
         """
         Returns sub-morphemes in the provided split mode.
@@ -278,7 +283,7 @@ class Tokenizer:
     def __init__(cls) -> None: ...
 
     def tokenize(self, text: str,
-                 mode: SplitMode = ...,
+                 mode: Union[SplitMode, Literal["A", "B", "C"]] = ...,
                  out: Optional[MorphemeList] = None) -> MorphemeList:
         """
         Break text into morphemes.
@@ -295,6 +300,14 @@ class Tokenizer:
         """
         ...
 
+    @property
+    def mode(self) -> SplitMode:
+        """
+        Get the current analysis mode
+        :return: current analysis mode
+        """
+        ...
+
 
 class WordInfo:
     a_unit_split: ClassVar[List[int]] = ...
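Taken together, the stub changes above mean every split-mode parameter now accepts the literal strings "A", "B", "C" in place of a SplitMode value, and SplitMode itself can be constructed from a string. A minimal sketch of the resulting Python API (assumes an installed system dictionary such as sudachidict_core; the sample text is illustrative):

```python
from sudachipy import Dictionary, SplitMode

# SplitMode() now defaults to C and parses string literals, per the stub above
assert SplitMode() == SplitMode.C
assert SplitMode("A") == SplitMode.A

dic = Dictionary()              # loads the installed system dictionary
tok = dic.create(mode="A")      # string literal instead of SplitMode.A
assert tok.mode == SplitMode.A  # new Tokenizer.mode property

for m in tok.tokenize("外国人参政権", mode="C"):  # per-call override, also a string
    print(m.surface(), m.part_of_speech())
```
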
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 814ffc35..2185f471 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -20,7 +20,9 @@ use std::convert::TryFrom;
 use std::fmt::Write;
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
 use std::sync::Arc;
+use sudachi::analysis::Mode;
 
 use crate::errors::{wrap, wrap_ctx, SudachiError as SudachiErr};
 use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
@@ -218,16 +220,20 @@ impl PyDictionary {
     /// :param fields: load only a subset of fields.
     ///                See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html
     #[pyo3(
-        text_signature = "($self, mode: sudachipy.SplitMode = sudachipy.SplitMode.C) -> sudachipy.Tokenizer",
+        text_signature = "($self, mode = 'C') -> sudachipy.Tokenizer",
         signature = (mode = None, fields = None, *, projection = None)
     )]
-    fn create(
-        &self,
-        mode: Option<PySplitMode>,
-        fields: Option<&PySet>,
-        projection: Option<&PyString>,
+    fn create<'py>(
+        &'py self,
+        py: Python<'py>,
+        mode: Option<&'py PyAny>,
+        fields: Option<&'py PySet>,
+        projection: Option<&'py PyString>,
     ) -> PyResult<PyTokenizer> {
-        let mode = mode.unwrap_or(PySplitMode::C).into();
+        let mode = match mode {
+            Some(m) => extract_mode(py, m)?,
+            None => Mode::C,
+        };
         let fields = parse_field_subset(fields)?;
         let mut required_fields = self.config.projection.required_subset();
         let dict = self.dictionary.as_ref().unwrap().clone();
@@ -401,6 +407,18 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> {
     Ok(result)
 }
 
+pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> {
+    if mode.is_instance_of::<PyString>() {
+        let mode = mode.str()?.to_str()?;
+        Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into())
+    } else if mode.is_instance_of::<PySplitMode>() {
+        let mode = mode.extract::<PySplitMode>()?;
+        Ok(Mode::from(mode))
+    } else {
+        Err(SudachiErr::new_err(("unknown mode", mode.into_py(py))))
+    }
+}
+
 fn read_config_from_fs(path: Option<&Path>) -> PyResult<ConfigBuilder> {
     wrap(ConfigBuilder::from_opt_file(path))
 }
diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs
index ab31f36c..ad3929dd 100644
--- a/python/src/morpheme.rs
+++ b/python/src/morpheme.rs
@@ -24,9 +24,8 @@ use pyo3::types::{PyList, PyString, PyTuple, PyType};
 
 use sudachi::prelude::{Morpheme, MorphemeList};
 
-use crate::dictionary::{PyDicData, PyDictionary};
+use crate::dictionary::{extract_mode, PyDicData, PyDictionary};
 use crate::projection::MorphemeProjection;
-use crate::tokenizer::PySplitMode;
 use crate::word_info::PyWordInfo;
 
 pub(crate) type PyMorphemeList = MorphemeList<Arc<PyDicData>>;
@@ -362,12 +361,14 @@ impl PyMorpheme {
     fn split<'py>(
         &'py self,
         py: Python<'py>,
-        mode: PySplitMode,
+        mode: &PyAny,
         out: Option<&'py PyCell<PyMorphemeListWrapper>>,
         add_single: Option<bool>,
     ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> {
         let list = self.list(py);
 
+        let mode = extract_mode(py, mode)?;
+
         let out_cell = match out {
             None => {
                 let list = list.empty_clone(py);
@@ -385,7 +386,7 @@ impl PyMorpheme {
         out_ref.clear();
         let splitted = list
             .internal(py)
-            .split_into(mode.into(), self.index, out_ref)
+            .split_into(mode, self.index, out_ref)
             .map_err(|e| {
                 PyException::new_err(format!("Error while splitting morpheme: {}", e.to_string()))
             })?;
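The `extract_mode` helper above is the single coercion point shared by `Dictionary.create` and `Morpheme.split`: a `str` goes through `Mode::from_str`, a `SplitMode` converts via `From<PySplitMode>`, and any other type raises a SudachiError built from the `("unknown mode", value)` tuple. Sketched from the Python side (error wording is an assumption based on that tuple; dictionary availability assumed as in the previous example):

```python
from sudachipy import Dictionary, SplitMode

tok = Dictionary().create()
ms = tok.tokenize("東京都")

ms[0].split("A")           # str branch: parsed by Mode::from_str
ms[0].split(SplitMode.A)   # SplitMode branch: converted directly

try:
    ms[0].split(42)        # neither str nor SplitMode
except Exception as err:   # SudachiError raised by extract_mode
    print(err)             # mentions "unknown mode"
```
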
diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 7b84b9f2..1150db03 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
+use std::ops::DerefMut;
+use std::str::FromStr;
 use std::sync::Arc;
 
-use pyo3::exceptions::PyException;
 use pyo3::prelude::*;
 
 use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
@@ -24,8 +25,9 @@ use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
 use sudachi::dic::subset::InfoSubset;
 use sudachi::prelude::*;
 
-use crate::dictionary::PyDicData;
+use crate::dictionary::{extract_mode, PyDicData};
 use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
+use crate::errors::{SudachiError as SudachiPyErr};
 
 /// Unit to split text
 ///
@@ -35,25 +37,13 @@ use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
 /// A == short mode
 ///
 /// B == middle mode
 ///
 /// C == long mode
 //
-// This implementation is a workaround. Waiting for the pyo3 enum feature.
-// ref: [PyO3 issue #834](https://github.com/PyO3/pyo3/issues/834).
-#[pyclass(module = "sudachipy.tokenizer", name = "SplitMode")]
-#[derive(Clone, PartialEq, Eq)]
-#[repr(transparent)]
-pub struct PySplitMode {
-    mode: u8,
-}
-
-#[pymethods]
-impl PySplitMode {
-    #[classattr]
-    pub const A: Self = Self { mode: 0 };
-
-    #[classattr]
-    pub const B: Self = Self { mode: 1 };
-
-    #[classattr]
-    pub const C: Self = Self { mode: 2 };
+#[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)]
+#[derive(Clone, PartialEq, Eq, Copy, Debug)]
+#[repr(u8)]
+pub enum PySplitMode {
+    A,
+    B,
+    C,
 }
 
 impl From<PySplitMode> for Mode {
     fn from(mode: PySplitMode) -> Self {
         match mode {
             PySplitMode::A => Mode::A,
             PySplitMode::B => Mode::B,
-            _ => Mode::C,
+            PySplitMode::C => Mode::C,
+        }
+    }
+}
+
+impl From<Mode> for PySplitMode {
+    fn from(value: Mode) -> Self {
+        match value {
+            Mode::A => PySplitMode::A,
+            Mode::B => PySplitMode::B,
+            Mode::C => PySplitMode::C,
+        }
+    }
+}
+
+#[pymethods]
+impl PySplitMode {
+    #[new]
+    fn new(mode: Option<&str>) -> PyResult<Self> {
+        let mode = match mode {
+            Some(m) => m,
+            None => return Ok(PySplitMode::C)
+        };
+
+        match Mode::from_str(mode) {
+            Ok(m) => Ok(m.into()),
+            Err(e) => Err(SudachiPyErr::new_err(e.to_string()))
         }
     }
 }
 
+
 /// Sudachi Tokenizer, Python version
 #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")]
 pub(crate) struct PyTokenizer {
@@ -112,7 +129,7 @@ impl PyTokenizer {
     /// :type mode: sudachipy.SplitMode
     /// :type out: sudachipy.MorphemeList
     #[pyo3(
-        text_signature = "($self, text: str, mode: SplitMode = None, logger = None, out = None) -> sudachipy.MorphemeList",
+        text_signature = "($self, text: str, mode = None, logger = None, out = None) -> sudachipy.MorphemeList",
         signature = (text, mode = None, logger = None, out = None)
     )]
     #[allow(unused_variables)]
@@ -120,21 +137,34 @@ impl PyTokenizer {
     fn tokenize<'py>(
         &'py mut self,
         py: Python<'py>,
         text: &'py str,
-        mode: Option<PySplitMode>,
+        mode: Option<&PyAny>,
         logger: Option<PyObject>,
         out: Option<&'py PyCell<PyMorphemeListWrapper>>,
     ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> {
-        // keep default mode to restore later
+
+        // restore default mode on scope exit
+        let mode = match mode {
+            None => None,
+            Some(m) => Some(extract_mode(py, m)?)
+        };
         let default_mode = mode.map(|m| self.tokenizer.set_mode(m.into()));
+        let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {
+            default_mode.map(|m| t.set_mode(m));
+        });
+
+        // this needs to be in GIL as it references Python memory
+        tokenizer.reset().push_str(text);
+        // analysis can be done without GIL
+        let err = {
+            let tokenizer = tokenizer.deref_mut();
+            py.allow_threads(|| tokenizer.do_tokenize())
+        };
 
-        self.tokenizer.reset().push_str(text);
-        self.tokenizer
-            .do_tokenize()
-            .map_err(|e| PyException::new_err(format!("Tokenization error: {}", e.to_string())))?;
+        err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
 
         let out_list = match out {
             None => {
-                let dict = self.tokenizer.dict_clone();
+                let dict = tokenizer.dict_clone();
                 let morphemes = MorphemeList::empty(dict);
                 let wrapper =
                     PyMorphemeListWrapper::from_components(morphemes, self.projection.clone());
@@ -146,16 +176,18 @@ impl PyTokenizer {
         let mut borrow = out_list.try_borrow_mut();
         let morphemes = match borrow {
             Ok(ref mut ms) => ms.internal_mut(py),
-            Err(e) => return Err(PyException::new_err("out was used twice at the same time")),
+            Err(e) => return Err(SudachiPyErr::new_err("out was used twice at the same time")),
         };
 
         morphemes
-            .collect_results(&mut self.tokenizer)
-            .map_err(|e| PyException::new_err(format!("Tokenization error: {}", e.to_string())))?;
-
-        // restore default mode
-        default_mode.map(|m| self.tokenizer.set_mode(m));
+            .collect_results(tokenizer.deref_mut())
+            .map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
 
         Ok(out_list)
     }
+
+    #[getter]
+    fn mode(&self) -> PySplitMode {
+        self.tokenizer.mode().into()
+    }
 }
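Note how `tokenize` now wraps the tokenizer in a scopeguard: a per-call `mode=` override is undone when the guard drops, even on an error path, and `do_tokenize` runs under `py.allow_threads`, releasing the GIL for the analysis itself. The observable effect of the mode restore, as a sketch (same dictionary assumption as before):

```python
from sudachipy import Dictionary, SplitMode

tok = Dictionary().create()            # default split mode is C
tok.tokenize("外国人参政権", mode="A")   # one-off override for this call only
assert tok.mode == SplitMode.C         # the guard restored the default
```
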
diff --git a/python/tests/test_tokenizer.py b/python/tests/test_tokenizer.py
index d30c82f9..623cb78c 100644
--- a/python/tests/test_tokenizer.py
+++ b/python/tests/test_tokenizer.py
@@ -27,8 +27,25 @@ def setUp(self):
             resource_dir, 'sudachi.json'), resource_dir)
         self.tokenizer_obj = self.dict_.create()
 
-    def test_nothing(self):
-        pass
+    def test_split_mode_default(self):
+        mode_c = SplitMode()
+        self.assertEqual(mode_c, SplitMode.C)
+
+    def test_split_mode_from_string_a(self):
+        mode = SplitMode("A")
+        self.assertEqual(mode, SplitMode.A)
+
+    def test_split_mode_from_string_b(self):
+        mode = SplitMode("B")
+        self.assertEqual(mode, SplitMode.B)
+
+    def test_split_mode_from_string_c(self):
+        mode = SplitMode("C")
+        self.assertEqual(mode, SplitMode.C)
+
+    def test_tokenizer_with_split_mode_str(self):
+        tok_a = self.dict_.create("A")
+        self.assertEqual(tok_a.mode, SplitMode.A)
 
     def test_tokenize_small_katanana_only(self):
         ms = self.tokenizer_obj.tokenize('ァ')
@@ -110,6 +127,16 @@ def test_tokenizer_morpheme_split(self):
         self.assertEqual(ms_a[0].surface(), '東京')
         self.assertEqual(ms_a[1].surface(), '都')
 
+    def test_tokenizer_morpheme_split_strings(self):
+        ms = self.tokenizer_obj.tokenize('東京都', 'C')
+        self.assertEqual(1, ms.size())
+        self.assertEqual(ms[0].surface(), '東京都')
+
+        ms_a = ms[0].split('A')
+        self.assertEqual(2, ms_a.size())
+        self.assertEqual(ms_a[0].surface(), '東京')
+        self.assertEqual(ms_a[1].surface(), '都')
+
     def test_tokenizer_morpheme_list_range(self):
         ms = self.tokenizer_obj.tokenize('東京都', SplitMode.A)
         self.assertEqual(2, ms.size())
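Because the analysis loop no longer holds the GIL, Python threads that each own a tokenizer can overlap CPU-bound work; the Arc-shared dictionary data is read-only. A sketch of that pattern (one tokenizer per task is an assumption of this example — tokenizers themselves stay stateful and are not meant to be shared across threads):

```python
from concurrent.futures import ThreadPoolExecutor
from sudachipy import Dictionary

dic = Dictionary()  # dictionary data is shared and read-only

def surfaces(text):
    tok = dic.create()  # each task uses its own stateful tokenizer
    return [m.surface() for m in tok.tokenize(text)]

with ThreadPoolExecutor(max_workers=4) as pool:
    results = list(pool.map(surfaces, ["東京都", "外国人参政権"] * 50))
```
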
diff --git a/sudachi/src/analysis/stateful_tokenizer.rs b/sudachi/src/analysis/stateful_tokenizer.rs
index c935c71d..ae9199ba 100644
--- a/sudachi/src/analysis/stateful_tokenizer.rs
+++ b/sudachi/src/analysis/stateful_tokenizer.rs
@@ -86,6 +86,11 @@ impl<T: DictionaryAccess> StatefulTokenizer<T> {
         std::mem::replace(&mut self.mode, mode)
     }
 
+    /// Return current analysis mode
+    pub fn mode(&self) -> Mode {
+        return self.mode
+    }
+
     /// Analyzer will read only following [`WordInfo`] field subset
     pub fn set_subset(&mut self, subset: InfoSubset) -> InfoSubset {
         let mode_subset = match self.mode {

From 65bab9dc030dfb300eebcafe1579bf5ce7469bc6 Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Thu, 29 Feb 2024 14:16:21 +0900
Subject: [PATCH 2/4] run cargo fmt

---
 python/src/tokenizer.rs                    | 10 ++++------
 sudachi/src/analysis/stateful_tokenizer.rs |  2 +-
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 1150db03..2600e902 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -26,8 +26,8 @@ use sudachi::dic::subset::InfoSubset;
 use sudachi::prelude::*;
 
 use crate::dictionary::{extract_mode, PyDicData};
+use crate::errors::SudachiError as SudachiPyErr;
 use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
-use crate::errors::{SudachiError as SudachiPyErr};
 
 /// Unit to split text
 ///
@@ -72,17 +72,16 @@ impl PySplitMode {
     fn new(mode: Option<&str>) -> PyResult<Self> {
         let mode = match mode {
             Some(m) => m,
-            None => return Ok(PySplitMode::C)
+            None => return Ok(PySplitMode::C),
         };
 
         match Mode::from_str(mode) {
             Ok(m) => Ok(m.into()),
-            Err(e) => Err(SudachiPyErr::new_err(e.to_string()))
+            Err(e) => Err(SudachiPyErr::new_err(e.to_string())),
         }
     }
 }
 
-
 /// Sudachi Tokenizer, Python version
 #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")]
 pub(crate) struct PyTokenizer {
@@ -141,11 +140,10 @@ impl PyTokenizer {
         logger: Option<PyObject>,
         out: Option<&'py PyCell<PyMorphemeListWrapper>>,
     ) -> PyResult<&'py PyCell<PyMorphemeListWrapper>> {
-
         // restore default mode on scope exit
         let mode = match mode {
             None => None,
-            Some(m) => Some(extract_mode(py, m)?)
+            Some(m) => Some(extract_mode(py, m)?),
         };
         let default_mode = mode.map(|m| self.tokenizer.set_mode(m.into()));
         let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {
diff --git a/sudachi/src/analysis/stateful_tokenizer.rs b/sudachi/src/analysis/stateful_tokenizer.rs
index ae9199ba..fa69402e 100644
--- a/sudachi/src/analysis/stateful_tokenizer.rs
+++ b/sudachi/src/analysis/stateful_tokenizer.rs
@@ -88,7 +88,7 @@ impl<T: DictionaryAccess> StatefulTokenizer<T> {
 
     /// Return current analysis mode
     pub fn mode(&self) -> Mode {
-        return self.mode
+        return self.mode;
     }
 
     /// Analyzer will read only following [`WordInfo`] field subset
From 0283406847218eee8a981ebab8345617d2f0e4d7 Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Tue, 26 Mar 2024 12:58:12 +0900
Subject: [PATCH 3/4] make pretokenizer accept string literals as well

---
 python/py_src/sudachipy/sudachipy.pyi |  4 ++--
 python/src/dictionary.rs              |  7 +++++--
 python/tests/test_pretokenizers.py    | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi
index 3028bf0e..7279870c 100644
--- a/python/py_src/sudachipy/sudachipy.pyi
+++ b/python/py_src/sudachipy/sudachipy.pyi
@@ -70,7 +70,7 @@ class Dictionary:
         ...
 
     def create(self,
-               mode: Union[SplitMode, Literal["A"], Literal["B"], Literal["C"]] = SplitMode.C,
+               mode: Union[SplitMode, Literal["A", "B", "C"]] = SplitMode.C,
                fields: FieldSet = None,
                *,
                projection: str = None) -> Tokenizer:
@@ -101,7 +101,7 @@ class Dictionary:
         ...
 
     def pre_tokenizer(self,
-                      mode: SplitMode = SplitMode.C,
+                      mode: Union[SplitMode, Literal["A", "B", "C"]] = "C",
                       fields: FieldSet = None,
                       handler: Optional[Callable[[int, object, MorphemeList], list]] = None,
                       *,
diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs
index 2185f471..3f70773a 100644
--- a/python/src/dictionary.rs
+++ b/python/src/dictionary.rs
@@ -289,12 +289,15 @@ impl PyDictionary {
     fn pre_tokenizer<'p>(
         &'p self,
         py: Python<'p>,
-        mode: Option<PySplitMode>,
+        mode: Option<&PyAny>,
         fields: Option<&PySet>,
         handler: Option<Py<PyAny>>,
         projection: Option<&PyString>,
     ) -> PyResult<&'p PyAny> {
-        let mode = mode.unwrap_or(PySplitMode::C).into();
+        let mode = match mode {
+            Some(m) => extract_mode(py, m)?,
+            None => Mode::C,
+        };
         let subset = parse_field_subset(fields)?;
         if let Some(h) = handler.as_ref() {
             if !h.as_ref(py).is_callable() {
diff --git a/python/tests/test_pretokenizers.py b/python/tests/test_pretokenizers.py
index 0791beaf..d9dec76e 100644
--- a/python/tests/test_pretokenizers.py
+++ b/python/tests/test_pretokenizers.py
@@ -58,6 +58,21 @@ def test_works_with_different_split_mode(self):
         res = tok.encode("外国人参政権")
         self.assertEqual(res.ids, [1, 5, 2, 3])
 
+    def test_works_with_different_split_mode_str(self):
+        pretok = self.dict.pre_tokenizer(mode='A')
+        vocab = {
+            "[UNK]": 0,
+            "外国": 1,
+            "参政": 2,
+            "権": 3,
+            "人": 5,
+            "外国人参政権": 4
+        }
+        tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
+        tok.pre_tokenizer = pretok
+        res = tok.encode("外国人参政権")
+        self.assertEqual(res.ids, [1, 5, 2, 3])
+
     def test_with_handler(self):
         def _handler(index, sentence: tokenizers.NormalizedString, ml: MorphemeList):
             return [tokenizers.NormalizedString(ml[0].part_of_speech()[0]), tokenizers.NormalizedString(str(len(ml)))]

From e850158d41d098298f503873eea3868c5e14ee4e Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Tue, 26 Mar 2024 13:11:23 +0900
Subject: [PATCH 4/4] slightly cleanup tokenizer.rs logic

---
 python/src/tokenizer.rs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 2600e902..558d02cb 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -150,13 +150,11 @@ impl PyTokenizer {
             default_mode.map(|m| t.set_mode(m));
         });
 
-        // this needs to be in GIL as it references Python memory
-        tokenizer.reset().push_str(text);
         // analysis can be done without GIL
-        let err = {
-            let tokenizer = tokenizer.deref_mut();
-            py.allow_threads(|| tokenizer.do_tokenize())
-        };
+        let err = py.allow_threads(|| {
+            tokenizer.reset().push_str(text);
+            tokenizer.do_tokenize()
+        });
 
         err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
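With patch 3 applied, `Dictionary.pre_tokenizer` accepts the same string literals, which slims the HuggingFace `tokenizers` integration down to a one-liner, mirroring the test added above (requires the `tokenizers` package; the vocabulary is illustrative only):

```python
import tokenizers
from tokenizers.models import WordLevel
from sudachipy import Dictionary

vocab = {"[UNK]": 0, "外国": 1, "参政": 2, "権": 3, "外国人参政権": 4, "人": 5}
tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = Dictionary().pre_tokenizer(mode="A")  # string literal mode

print(tok.encode("外国人参政権").ids)  # -> [1, 5, 2, 3], per the test above
```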