Python compatibility #77

Merged
35 changes: 22 additions & 13 deletions python/src/dictionary.rs
@@ -1,17 +1,17 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::path::PathBuf;
@@ -29,8 +29,9 @@ use crate::tokenizer::{PySplitMode, PyTokenizer};
#[pyclass(module = "sudachi.dictionary", name = "Dictionary")]
#[pyo3(text_signature = "(config_path, resource_dir)")]
#[derive(Clone)]
#[repr(transparent)]
pub struct PyDictionary {
pub(super) dictionary: Arc<JapaneseDictionary>,
pub(super) dictionary: Option<Arc<JapaneseDictionary>>,
}

#[pymethods]
@@ -50,16 +51,24 @@ impl PyDictionary {
))
})?);

Ok(Self { dictionary })
Ok(Self {
dictionary: Some(dictionary),
})
}

/// Creates a sudachi tokenizer
#[pyo3(text_signature = "($self, mode)")]
#[args(mode = "None")]
fn create(&self, mode: Option<PySplitMode>) -> PyTokenizer {
let tokenizer = StatelessTokenizer::new(self.dictionary.clone());
let tokenizer = StatelessTokenizer::new(self.dictionary.as_ref().unwrap().clone());
let mode = mode.unwrap_or(PySplitMode::C).into();

PyTokenizer::new(tokenizer, mode)
}

/// Close this dictionary
#[pyo3(text_signature = "($self)")]
fn close(&mut self) {
self.dictionary = None;
}
}
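
On the Python side, the net effect of this file's changes is that the loaded dictionary becomes droppable: `Dictionary.close()` clears the internal `Option<Arc<JapaneseDictionary>>`, while `create()` still hands each tokenizer its own `Arc` clone. A minimal usage sketch, assuming the bindings are importable as `sudachi` (per the `#[pyclass(module = ...)]` attributes) and that the `(config_path, resource_dir)` constructor arguments are optional:

```python
from sudachi import Dictionary  # top-level import path is an assumption

dictionary = Dictionary()        # assumes a default config when no paths are given
tokenizer = dictionary.create()  # mode defaults to SplitMode.C (see unwrap_or above)

dictionary.close()               # drops the Option<Arc<JapaneseDictionary>>

# A tokenizer created before close() keeps its own Arc clone and stays usable,
# but calling dictionary.create() after close() would hit the unwrap() above.
```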
7 changes: 5 additions & 2 deletions python/src/morpheme.rs
@@ -31,6 +31,7 @@ type PyMorphemeList = MorphemeList<Arc<JapaneseDictionary>>;

/// A list of morphemes
#[pyclass(module = "sudachi.morpheme", name = "MorphemeList")]
#[repr(transparent)]
pub struct PyMorphemeListWrapper {
inner: Arc<PyMorphemeList>,
}
@@ -40,9 +41,11 @@ impl PyMorphemeListWrapper {
/// Returns an empty morpheme list with dictionary
#[classmethod]
#[pyo3(text_signature = "(dict)")]
fn empty(_cls: &PyType, dict: PyDictionary) -> Self {
fn empty(_cls: &PyType, dict: &PyDictionary) -> Self {
Self {
inner: Arc::new(PyMorphemeList::empty(dict.dictionary.clone())),
inner: Arc::new(PyMorphemeList::empty(
dict.dictionary.as_ref().unwrap().clone(),
)),
}
}

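The `empty` classmethod now borrows the dictionary (`&PyDictionary`) instead of taking it by value and unwraps the optional handle, so it must be called on a dictionary that has not been closed. A rough sketch of the corresponding Python call, under the same import-path assumption:

```python
from sudachi import Dictionary, MorphemeList  # assumed to be re-exported at the top level

dictionary = Dictionary()
morphemes = MorphemeList.empty(dictionary)  # empty result list bound to this dictionary
# Calling this after dictionary.close() would panic on the unwrap() above.
```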
49 changes: 21 additions & 28 deletions python/src/tokenizer.rs
@@ -27,10 +27,15 @@ use crate::morpheme::PyMorphemeListWrapper;

/// Unit to split text
///
/// This implementation is a workaround. Waiting for the pyo3 enum feature.
/// ref: [PyO3 issue #834](https://github.com/PyO3/pyo3/issues/834).
/// A == short mode
/// B == middle mode
/// C == long mode
//
// This implementation is a workaround. Waiting for the pyo3 enum feature.
// ref: [PyO3 issue #834](https://github.com/PyO3/pyo3/issues/834).
#[pyclass(module = "sudachi.tokenizer", name = "SplitMode")]
#[derive(Clone, PartialEq, Eq)]
#[repr(transparent)]
pub struct PySplitMode {
mode: u8,
}
@@ -47,16 +52,6 @@ impl PySplitMode {
pub const C: Self = Self { mode: 2 };
}

impl From<Mode> for PySplitMode {
fn from(mode: Mode) -> Self {
match mode {
Mode::A => PySplitMode::A,
Mode::B => PySplitMode::B,
Mode::C => PySplitMode::C,
}
}
}

impl From<PySplitMode> for Mode {
fn from(mode: PySplitMode) -> Self {
match mode {
@@ -67,18 +62,6 @@ impl From<PySplitMode> for Mode {
}
}

impl std::str::FromStr for PySplitMode {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"A" | "a" => Ok(PySplitMode::A),
"B" | "b" => Ok(PySplitMode::B),
"C" | "c" => Ok(PySplitMode::C),
_ => Err("Mode must be one of \"A\", \"B\", or \"C\" (in lower or upper case)."),
}
}
}

#[pyclass(module = "sudachi.tokenizer", name = "Tokenizer")]
pub struct PyTokenizer {
tokenizer: StatelessTokenizer<Arc<JapaneseDictionary>>,
@@ -93,23 +76,33 @@ impl PyTokenizer {

#[pymethods]
impl PyTokenizer {
#[classattr]
#[allow(non_snake_case)]
fn SplitMode() -> PySplitMode {
PySplitMode::C
}

/// Break text into morphemes
#[pyo3(text_signature = "($self, text, /, mode, enable_debug)")]
#[args(text, mode = "None", enable_debug = "None")]
///
/// This ignores the logger provided
#[pyo3(text_signature = "($self, text, /, mode, logger)")]
#[args(text, mode = "None", logger = "None")]
#[allow(unused_variables)]
fn tokenize(
&self,
text: &str,
mode: Option<PySplitMode>,
enable_debug: Option<bool>, // want to take logger instead of debug flag
logger: Option<PyObject>,
) -> PyResult<PyMorphemeListWrapper> {
let mode: Mode = match mode {
Some(m) => m.into(),
None => self.mode,
};
let enable_debug = false;

let morphemes = self
.tokenizer
.tokenize(text, mode, enable_debug.unwrap_or(false))
.tokenize(text, mode, enable_debug)
.map_err(|e| {
PyException::new_err(format!("Error while tokenization: {}", e.to_string()))
})?
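Pulled together, the tokenizer-side changes give the Python API a `SplitMode` with `A`/`B`/`C` constants (the pyo3-enum workaround described in the comment above) and a `tokenize(text, mode, logger)` signature in which `logger` is accepted for SudachiPy compatibility but ignored. A sketch under the same import-path assumption:

```python
from sudachi import Dictionary, SplitMode  # top-level import path is an assumption

tokenizer = Dictionary().create(mode=SplitMode.B)

# The per-call mode overrides the tokenizer's default; logger is accepted but unused.
morphemes = tokenizer.tokenize("外国人参政権", mode=SplitMode.A, logger=None)
```

Note that the new `SplitMode` classattr on `Tokenizer` evaluates to the `C` mode value rather than to the `SplitMode` class itself, matching what `fn SplitMode()` above returns.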
16 changes: 16 additions & 0 deletions python/src/word_info.rs
@@ -1,3 +1,19 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use pyo3::prelude::*;

use sudachi::dic::lexicon::word_infos::WordInfo;