Merge branch 'develop' into refactor/258-py-consistent-error

WorksApplications · Nov 11, 2024 · a86c2ff · a86c2ff
2 parents a4a47e2 + d2b53d8
commit a86c2ff
Show file tree

Hide file tree

Showing 20 changed files with 861 additions and 622 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/python/Cargo.toml b/python/Cargo.toml
@@ -15,9 +15,9 @@ name = "sudachipy"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.20", features = ["extension-module"] }
-thread_local = "1.1" # Apache 2.0/MIT
+pyo3 = { version = "0.21", features = ["extension-module"] }
 scopeguard = "1" # Apache 2.0/MIT
+thread_local = "1.1" # Apache 2.0/MIT
 
 [dependencies.sudachi]
 path = "../sudachi"
diff --git a/python/README.md b/python/README.md
@@ -66,7 +66,7 @@ $ pip install sudachipy
 
 ### Step 2. Get a Dictionary
 
-You can get dictionary as a Python package. It make take a while to download the dictionary file (around 70MB for the `core` edition).
+You can get dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition).
 
 ```bash
 $ pip install sudachidict_core
@@ -209,7 +209,7 @@ There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `fu
 
 SudachiPy uses `sudachidict_core` by default.
 
-Dictionaries are installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`.
+Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`.
 
 * [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/)
 * [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/)
@@ -234,19 +234,19 @@ $ echo "外国人参政権" | sudachipy -s full
 
 ### Dictionary option: Python package
 
-You can specify the dictionary with the `Dicionary()` argument; `config_path` or `dict_type`.
+You can specify the dictionary with the `Dictionary()` arguments `config` or `dict`.
 
 ```python
-class Dictionary(config_path=None, resource_dir=None, dict_type=None)
+class Dictionary(config=None, resource_dir=None, dict=None)
 ```
 
-1. `config_path`
-    * You can specify the file path to the setting file with `config_path` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail).
+1. `config`
+    * You can specify the file path to the setting file with `config` (see [Dictionary in The Setting File](#dictionary-in-the-setting-file) for details).
     * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary.
-2. `dict_type`
-    * You can also specify the dictionary type with `dict_type`.
-    * The available arguments are `small`, `core`, or `full`.
-    * If different dictionaries are specified with `config_path` and `dict_type`, **a dictionary defined `dict_type` overrides** those defined in the config path.
+2. `dict`
+    * You can also specify the dictionary type with `dict`.
+    * The available arguments are `small`, `core`, `full`, or a path to the dictionary file.
+    * If different dictionaries are specified with `config` and `dict`, **the dictionary specified by `dict` overrides** the one defined in the config.
 
 ```python
 from sudachipy import Dictionary
@@ -255,16 +255,16 @@ from sudachipy import Dictionary
 tokenizer_obj = Dictionary().create()
 
 # The dictionary given by the `systemDict` key in the config file (/path/to/sudachi.json) will be used
-tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json").create()
+tokenizer_obj = Dictionary(config="/path/to/sudachi.json").create()
 
-# The dictionary specified by `dict_type` will be set.
-tokenizer_obj = Dictionary(dict_type="core").create()  # sudachidict_core (same as default)
-tokenizer_obj = Dictionary(dict_type="small").create()  # sudachidict_small
-tokenizer_obj = Dictionary(dict_type="full").create()  # sudachidict_full
+# The dictionary specified by `dict` will be used.
+tokenizer_obj = Dictionary(dict="core").create()  # sudachidict_core (same as default)
+tokenizer_obj = Dictionary(dict="small").create()  # sudachidict_small
+tokenizer_obj = Dictionary(dict="full").create()  # sudachidict_full
 
-# The dictionary specified by `dict_type` overrides those defined in the config path.
+# The dictionary specified by `dict` overrides those defined in the config.
 # In the following code, `sudachidict_full` will be used regardless of a dictionary defined in the config file.
-tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json", dict_type="full").create()
+tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create()
 ```
 
 
@@ -303,10 +303,8 @@ Then specify your `sudachi.json` with the `-r` option.
 $ sudachipy -r path/to/sudachi.json
 ```
 
-
 You can build a user dictionary with the subcommand `ubuild`.
 
-
 ```bash
 $ sudachipy ubuild -h
 usage: sudachipy ubuild [-h] [-o file] [-d string] -s file file [file ...]

diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py
@@ -5,6 +5,7 @@
     MorphemeList,
     Morpheme,
     WordInfo,
+    PosMatcher,
 )
 from .config import Config
 from . import errors

diff --git a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 Works Applications Co., Ltd.
+# Copyright (c) 2019-2024 Works Applications Co., Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,6 +24,13 @@
 from . import sudachipy
 
 
+logging.basicConfig(
+    style="{",
+    format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}',
+    datefmt="%m-%d-%Y %H:%M:%S",
+)
+
+
 def _set_default_subparser(self, name, args=None):
     """
     copy and modify code from https://bitbucket.org/ruamel/std.argparse
@@ -51,7 +58,7 @@ def _set_default_subparser(self, name, args=None):
 argparse.ArgumentParser.set_default_subparser = _set_default_subparser
 
 
-def run(tokenizer, input_, output, print_all, morphs, is_stdout):
+def run(tokenizer, input_, output, print_all, pos_list, is_stdout):
     # get an empty MorphemeList for memory reuse
     mlist = tokenizer.tokenize("")
     for line in input_:
@@ -60,7 +67,7 @@ def run(tokenizer, input_, output, print_all, morphs, is_stdout):
         for m in tokenizer.tokenize(line, out=mlist):
             list_info = [
                 m.surface(),
-                morphs[m.part_of_speech_id()],
+                pos_list[m.part_of_speech_id()],
                 m.normalized_form()]
             if print_all:
                 list_info += [
@@ -97,27 +104,27 @@ def _command_tokenize(args, print_usage):
     if args.fpath_out:
         output = open(args.fpath_out, "w", encoding="utf-8")
 
-    stdout_logger = logging.getLogger(__name__)
-    handler = logging.StreamHandler(sys.stdout)
-    handler.setLevel(logging.DEBUG)
-    stdout_logger.addHandler(handler)
-    stdout_logger.setLevel(logging.DEBUG)
-    stdout_logger.propagate = False
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
 
     print_all = args.a
+    debug = args.d
+    if debug:
+        logger.warning("-d option is not implemented in python.")
 
     try:
         dict_ = Dictionary(config_path=args.fpath_setting,
                            dict_type=args.system_dict_type)
         # empty matcher - get all POS tags
-        all_morphs = dict_.pos_matcher([()])
+        all_pos_matcher = dict_.pos_matcher([()])
         # precompute output POS strings
-        morphs = [",".join(ms) for ms in all_morphs]
+        pos_list = [",".join(ms) for ms in all_pos_matcher]
 
         tokenizer_obj = dict_.create(mode=args.mode)
         input_ = fileinput.input(
             args.in_files, openhook=fileinput.hook_encoded("utf-8"))
-        run(tokenizer_obj, input_, output, print_all, morphs, is_stdout=args.fpath_out is None)
+        run(tokenizer_obj, input_, output, print_all,
+            pos_list, is_stdout=args.fpath_out is None)
     finally:
         if args.fpath_out:
             output.close()
@@ -139,7 +146,8 @@ def _command_build(args, print_usage):
 
     out_file = Path(args.out_file)
     if out_file.exists():
-        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
+        print("File", out_file,
+              "already exists, refusing to overwrite it", file=sys.stderr)
         return
 
     description = args.description or ""
@@ -161,7 +169,8 @@ def _command_build(args, print_usage):
 def _command_user_build(args, print_usage):
     system = Path(args.system_dic)
     if not system.exists():
-        print("System dictionary file", system, "does not exist", file=sys.stderr)
+        print("System dictionary file", system,
+              "does not exist", file=sys.stderr)
         return print_usage()
 
     in_files = []
@@ -174,7 +183,8 @@ def _command_user_build(args, print_usage):
 
     out_file = Path(args.out_file)
     if out_file.exists():
-        print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr)
+        print("File", out_file,
+              "already exists, refusing to overwrite it", file=sys.stderr)
         return
 
     description = args.description or ""
@@ -217,7 +227,7 @@ def main():
     parser_tk.add_argument("-a", action="store_true",
                            help="print all of the fields")
     parser_tk.add_argument("-d", action="store_true",
-                           help="print the debug information")
+                           help="print the debug information (not implemented yet)")
     parser_tk.add_argument("-v", "--version", action="store_true",
                            dest="version", help="print sudachipy version")
     parser_tk.add_argument("in_files", metavar="file",

diff --git a/python/py_src/sudachipy/errors.py b/python/py_src/sudachipy/errors.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2023 Works Applications Co., Ltd.
+#   Copyright (c) 2023-2024 Works Applications Co., Ltd.
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use this file except in compliance with the License.
@@ -13,4 +13,6 @@
 #   limitations under the License.
 
 class SudachiError(Exception):
-    pass
+    """Base class for all SudachiPy exceptions.
+    """
+    pass