diff --git a/Cargo.lock b/Cargo.lock index e9ad71bc..73ca27fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,47 +19,48 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", "windows-sys", @@ -92,12 +93,6 @@ version = "0.6.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -106,9 +101,9 @@ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "bumpalo" -version = "3.15.4" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "cast" @@ -200,9 +195,9 @@ checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "criterion" @@ -261,9 +256,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "crunchy" @@ -301,9 +296,9 @@ dependencies = [ [[package]] name = "either" -version = "1.10.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "equivalent" @@ -313,9 +308,9 @@ checksum = 
"5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys", @@ -334,15 +329,15 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "half" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", @@ -350,9 +345,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" @@ -374,13 +369,13 @@ checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] name = "honggfuzz" -version = "0.5.55" +version = "0.5.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "848e9c511092e0daa0a35a63e8e6e475a3e8f870741448b9f6028d69b142f18e" +checksum = "7c76b6234c13c9ea73946d1379d33186151148e0da231506b964b44f3d023505" dependencies = [ "arbitrary", "lazy_static", - "memmap2 0.5.10", + "memmap2", "rustc_version", ] @@ -411,6 +406,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "is_terminal_polyfill" 
+version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.10.5" @@ -422,9 +423,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -466,9 +467,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libloading" @@ -477,14 +478,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.4", + "windows-targets", ] [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" @@ -503,18 +504,9 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "memmap2" -version = "0.5.10" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "memmap2" @@ -552,9 +544,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -573,9 +565,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -583,22 +575,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets", ] [[package]] name = "plotters" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" dependencies = [ "num-traits", "plotters-backend", @@ -609,15 +601,15 @@ dependencies = [ [[package]] name = "plotters-backend" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" [[package]] name = "plotters-svg" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" dependencies = [ "plotters-backend", ] @@ -630,18 +622,18 @@ checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" dependencies = [ "unicode-ident", ] [[package]] name = "pyo3" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" +checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" dependencies = [ "cfg-if", "indoc", @@ -657,9 +649,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" +checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" dependencies = [ "once_cell", "target-lexicon", @@ -667,9 +659,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" +checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" dependencies = [ "libc", 
"pyo3-build-config", @@ -677,9 +669,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" +checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -689,9 +681,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" +checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" dependencies = [ "heck 0.4.1", "proc-macro2", @@ -702,9 +694,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -731,11 +723,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 1.3.2", + "bitflags", ] [[package]] @@ -778,11 +770,11 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -791,9 +783,9 @@ 
dependencies = [ [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "same-file" @@ -812,24 +804,24 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.197" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -838,9 +830,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "itoa", "ryu", @@ -871,18 +863,18 @@ name = "sudachi" version = "0.6.9-a1" dependencies = [ "aho-corasick", - "bitflags 2.5.0", + "bitflags", "claim", "csv", "default_input_text", "fancy-regex", "indexmap", - "itertools 0.12.1", + "itertools 0.13.0", 
"join_katakana_oov", "join_numeric", "lazy_static", "libloading", - "memmap2 0.9.4", + "memmap2", "nom", "regex", "serde", @@ -900,7 +892,7 @@ version = "0.6.9-a1" dependencies = [ "cfg-if", "clap", - "memmap2 0.9.4", + "memmap2", "sudachi", ] @@ -926,9 +918,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.55" +version = "2.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" +checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" dependencies = [ "proc-macro2", "quote", @@ -955,18 +947,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", @@ -1109,159 +1101,87 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "winapi", + "windows-sys", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" -version = 
"0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" [[package]] -name = "windows_i686_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "yada" diff --git a/python/Cargo.toml b/python/Cargo.toml index e1143743..53cd97e5 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -15,9 +15,9 @@ name = "sudachipy" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.20", features = 
["extension-module"] } -thread_local = "1.1" # Apache 2.0/MIT +pyo3 = { version = "0.21", features = ["extension-module"] } scopeguard = "1" # Apache 2.0/MIT +thread_local = "1.1" # Apache 2.0/MIT [dependencies.sudachi] path = "../sudachi" diff --git a/python/README.md b/python/README.md index 4d95d7fb..b1ad3e5e 100644 --- a/python/README.md +++ b/python/README.md @@ -66,7 +66,7 @@ $ pip install sudachipy ### Step 2. Get a Dictionary -You can get dictionary as a Python package. It make take a while to download the dictionary file (around 70MB for the `core` edition). +You can get dictionary as a Python package. It may take a while to download the dictionary file (around 70MB for the `core` edition). ```bash $ pip install sudachidict_core @@ -209,7 +209,7 @@ There are three editions of Sudachi Dictionary, namely, `small`, `core`, and `fu SudachiPy uses `sudachidict_core` by default. -Dictionaries are installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. +Dictionaries can be installed as Python packages `sudachidict_small`, `sudachidict_core`, and `sudachidict_full`. * [SudachiDict-small · PyPI](https://pypi.org/project/SudachiDict-small/) * [SudachiDict-core · PyPI](https://pypi.org/project/SudachiDict-core/) @@ -234,19 +234,19 @@ $ echo "外国人参政権" | sudachipy -s full ### Dictionary option: Python package -You can specify the dictionary with the `Dicionary()` argument; `config_path` or `dict_type`. +You can specify the dictionary with the `Dictionary()` argument; `config` or `dict`. ```python -class Dictionary(config_path=None, resource_dir=None, dict_type=None) +class Dictionary(config=None, resource_dir=None, dict=None) ``` -1. `config_path` - * You can specify the file path to the setting file with `config_path` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). +1. 
`config` + * You can specify the file path to the setting file with `config` (See [Dictionary in The Setting File](#Dictionary in The Setting File) for the detail). * If the dictionary file is specified in the setting file as `systemDict`, SudachiPy will use the dictionary. -2. `dict_type` - * You can also specify the dictionary type with `dict_type`. - * The available arguments are `small`, `core`, or `full`. - * If different dictionaries are specified with `config_path` and `dict_type`, **a dictionary defined `dict_type` overrides** those defined in the config path. +2. `dict` + * You can also specify the dictionary type with `dict`. + * The available arguments are `small`, `core`, `full`, or a path to the dictionary file. + * If different dictionaries are specified with `config` and `dict`, **a dictionary defined `dict` overrides** those defined in the config. ```python from sudachipy import Dictionary @@ -255,16 +255,16 @@ from sudachipy import Dictionary tokenizer_obj = Dictionary().create() # The dictionary given by the `systemDict` key in the config file (/path/to/sudachi.json) will be used -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json").create() -# The dictionary specified by `dict_type` will be set. -tokenizer_obj = Dictionary(dict_type="core").create() # sudachidict_core (same as default) -tokenizer_obj = Dictionary(dict_type="small").create() # sudachidict_small -tokenizer_obj = Dictionary(dict_type="full").create() # sudachidict_full +# The dictionary specified by `dict` will be used. +tokenizer_obj = Dictionary(dict="core").create() # sudachidict_core (same as default) +tokenizer_obj = Dictionary(dict="small").create() # sudachidict_small +tokenizer_obj = Dictionary(dict="full").create() # sudachidict_full -# The dictionary specified by `dict_type` overrides those defined in the config path. +# The dictionary specified by `dict` overrides those defined in the config. 
# In the following code, `sudachidict_full` will be used regardless of a dictionary defined in the config file. -tokenizer_obj = Dictionary(config_path="/path/to/sudachi.json", dict_type="full").create() +tokenizer_obj = Dictionary(config="/path/to/sudachi.json", dict="full").create() ``` @@ -303,10 +303,8 @@ Then specify your `sudachi.json` with the `-r` option. $ sudachipy -r path/to/sudachi.json ``` - You can build a user dictionary with the subcommand `ubuild`. - ```bash $ sudachipy ubuild -h usage: sudachipy ubuild [-h] [-o file] [-d string] -s file file [file ...] diff --git a/python/py_src/sudachipy/__init__.py b/python/py_src/sudachipy/__init__.py index bdf67f40..fb551538 100644 --- a/python/py_src/sudachipy/__init__.py +++ b/python/py_src/sudachipy/__init__.py @@ -5,6 +5,7 @@ MorphemeList, Morpheme, WordInfo, + PosMatcher, ) from .config import Config from . import errors diff --git a/python/py_src/sudachipy/command_line.py b/python/py_src/sudachipy/command_line.py index 07f59c19..e7574bf1 100644 --- a/python/py_src/sudachipy/command_line.py +++ b/python/py_src/sudachipy/command_line.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 Works Applications Co., Ltd. +# Copyright (c) 2019-2024 Works Applications Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,6 +24,13 @@ from . 
import sudachipy +logging.basicConfig( + style="{", + format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', + datefmt="%m-%d-%Y %H:%M:%S", +) + + def _set_default_subparser(self, name, args=None): """ copy and modify code from https://bitbucket.org/ruamel/std.argparse @@ -51,7 +58,7 @@ def _set_default_subparser(self, name, args=None): argparse.ArgumentParser.set_default_subparser = _set_default_subparser -def run(tokenizer, input_, output, print_all, morphs, is_stdout): +def run(tokenizer, input_, output, print_all, pos_list, is_stdout): # get an empty MorphemeList for memory reuse mlist = tokenizer.tokenize("") for line in input_: @@ -60,7 +67,7 @@ def run(tokenizer, input_, output, print_all, morphs, is_stdout): for m in tokenizer.tokenize(line, out=mlist): list_info = [ m.surface(), - morphs[m.part_of_speech_id()], + pos_list[m.part_of_speech_id()], m.normalized_form()] if print_all: list_info += [ @@ -97,27 +104,27 @@ def _command_tokenize(args, print_usage): if args.fpath_out: output = open(args.fpath_out, "w", encoding="utf-8") - stdout_logger = logging.getLogger(__name__) - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(logging.DEBUG) - stdout_logger.addHandler(handler) - stdout_logger.setLevel(logging.DEBUG) - stdout_logger.propagate = False + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) print_all = args.a + debug = args.d + if debug: + logger.warning("-d option is not implemented in python.") try: dict_ = Dictionary(config_path=args.fpath_setting, dict_type=args.system_dict_type) # empty matcher - get all POS tags - all_morphs = dict_.pos_matcher([()]) + all_pos_matcher = dict_.pos_matcher([()]) # precompute output POS strings - morphs = [",".join(ms) for ms in all_morphs] + pos_list = [",".join(ms) for ms in all_pos_matcher] tokenizer_obj = dict_.create(mode=args.mode) input_ = fileinput.input( args.in_files, openhook=fileinput.hook_encoded("utf-8")) - run(tokenizer_obj, input_, output, 
print_all, morphs, is_stdout=args.fpath_out is None) + run(tokenizer_obj, input_, output, print_all, + pos_list, is_stdout=args.fpath_out is None) finally: if args.fpath_out: output.close() @@ -139,7 +146,8 @@ def _command_build(args, print_usage): out_file = Path(args.out_file) if out_file.exists(): - print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) + print("File", out_file, + "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or "" @@ -161,7 +169,8 @@ def _command_build(args, print_usage): def _command_user_build(args, print_usage): system = Path(args.system_dic) if not system.exists(): - print("System dictionary file", system, "does not exist", file=sys.stderr) + print("System dictionary file", system, + "does not exist", file=sys.stderr) return print_usage() in_files = [] @@ -174,7 +183,8 @@ def _command_user_build(args, print_usage): out_file = Path(args.out_file) if out_file.exists(): - print("File", out_file, "already exists, refusing to overwrite it", file=sys.stderr) + print("File", out_file, + "already exists, refusing to overwrite it", file=sys.stderr) return description = args.description or "" @@ -217,7 +227,7 @@ def main(): parser_tk.add_argument("-a", action="store_true", help="print all of the fields") parser_tk.add_argument("-d", action="store_true", - help="print the debug information") + help="print the debug information (not implemented yet)") parser_tk.add_argument("-v", "--version", action="store_true", dest="version", help="print sudachipy version") parser_tk.add_argument("in_files", metavar="file", diff --git a/python/py_src/sudachipy/errors.py b/python/py_src/sudachipy/errors.py index e75e21cd..c11a8205 100644 --- a/python/py_src/sudachipy/errors.py +++ b/python/py_src/sudachipy/errors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Works Applications Co., Ltd. +# Copyright (c) 2023-2024 Works Applications Co., Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,4 +13,6 @@ # limitations under the License. class SudachiError(Exception): - pass \ No newline at end of file + """Base class for all Sudachipy exceptions. + """ + pass diff --git a/python/py_src/sudachipy/sudachipy.pyi b/python/py_src/sudachipy/sudachipy.pyi index 16c416f6..ca39a95c 100644 --- a/python/py_src/sudachipy/sudachipy.pyi +++ b/python/py_src/sudachipy/sudachipy.pyi @@ -1,6 +1,20 @@ +# Copyright (c) 2024 Works Applications Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import ClassVar, Iterator, List, Tuple, Union, Callable, Iterable, Optional, Literal, Set from .config import Config +# Part Of Speech POS = Tuple[str, str, str, str, str, str] # POS element PE = Optional[str] @@ -14,10 +28,20 @@ PartialPOS = Union[ Tuple[()], ] +""" +Fields that can be specified for partial dictionary loading. +See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. +""" FieldSet = Optional[Set[Literal["surface", "pos", "normalized_form", "dictionary_form", "reading_form", "word_structure", "split_a", "split_b", "synonym_group_id"]]] +""" +Strings that can be parsed as SplitMode +""" +SplitModeStr = Literal["A", "a", "B", "b", "C", "c"] + + class SplitMode: """ Unit to split text. @@ -34,10 +58,12 @@ class SplitMode: C: ClassVar[SplitMode] = ... 
@classmethod - def __init__(cls, mode: str = "C") -> None: + def __init__(cls, mode: Optional[SplitModeStr] = "C") -> None: """ - Creates a split mode from a string value - :param mode: string representation of the split mode + Creates a split mode from a string value. + + :param mode: string representation of the split mode. One of [A,B,C] in capital or lower case. + If None, returns SplitMode.C. """ ... @@ -54,14 +80,15 @@ class Dictionary: Creates a sudachi dictionary. If both config.systemDict and dict are not given, `sudachidict_core` is used. - If both config.systemDict and dict are given, dict_type is used. + If both config.systemDict and dict are given, dict is used. + If dict is an absolute path to a file, it is used as a dictionary. - :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.config.Config] object - :param config: alias to config_path, only one of them can be specified at the same time - :param resource_dir: path to the resource directory folder + :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. + :param config: alias to config_path, only one of them can be specified at the same time. + :param resource_dir: path to the resource directory folder. :param dict: type of pre-packaged system dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. Also, can be an _absolute_ path to a compiled dictionary file. - :param dict_type: deprecated alias to dict + :param dict_type: deprecated alias to dict. """ ... @@ -72,16 +99,16 @@ class Dictionary: ... def create(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = SplitMode.C, - fields: FieldSet = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, *, - projection: str = None) -> Tokenizer: + projection: Optional[str] = None) -> Tokenizer: """ - Creates a Sudachi Tokenizer.
+ Creates a sudachi tokenizer. :param mode: sets the analysis mode for this Tokenizer :param fields: load only a subset of fields. - See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... @@ -91,32 +118,32 @@ class Dictionary: Creates a new POS matcher. If target is a function, then it must return whether a POS should match or not. - If target a list, it should contain partially specified POS. - By partially specified it means that it is possible to omit POS fields or - use None as a sentinel value that matches any POS. + If target is a list, it should contain partially specified POS. + By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. For example, ('名詞',) will match any noun and (None, None, None, None, None, '終止形') will match any word in 終止形 conjugation form. - :param target: can be either a function or a list of POS tuples. + :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. """ ... def pre_tokenizer(self, - mode: Union[SplitMode, Literal["A", "B", "C"]] = "C", - fields: FieldSet = None, - handler: Optional[Callable[[int, object, MorphemeList], list]] = None, + mode: Union[SplitMode, SplitModeStr, None] = SplitMode.C, + fields: Optional[FieldSet] = None, + handler: Optional[Callable[[ + int, object, MorphemeList], list]] = None, *, - projection: str = None) -> object: + projection: Optional[str] = None) -> object: """ Creates HuggingFace Tokenizers-compatible PreTokenizer. Requires package `tokenizers` to be installed. :param mode: Use this split mode (C by default) - :param fields: ask Sudachi to load only a subset of fields. 
See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - :param handler: custom callable to transform MorphemeList into list of tokens. See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - First two parameters are the index (int) and HuggingFace NormalizedString. - The handler must return a List[NormalizedString]. By default, just segment the tokens. + :param fields: ask Sudachi to load only a subset of fields. See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. :param projection: Projection override for created Tokenizer. See Config.projection for values. """ ... @@ -126,7 +153,7 @@ class Dictionary: Returns POS with the given id. :param pos_id: POS id - :return: POS tuple with the given id. + :return: POS tuple with the given id or None for non existing id. """ ... @@ -180,6 +207,9 @@ class Morpheme: def get_word_info(self) -> WordInfo: """ Returns the word info. + + .. deprecated:: v0.6.0 + Users should not touch the raw WordInfo. """ ... @@ -197,7 +227,8 @@ class Morpheme: def part_of_speech(self) -> POS: """ - Returns the part of speech. + Returns the part of speech as a six-element tuple. + Tuple elements are four POS levels, conjugation type and conjugation form. """ ... @@ -213,12 +244,15 @@ class Morpheme: """ ...
- def split(self, mode: Union[SplitMode, Literal["A", "B", "C"]], out: Optional[MorphemeList] = None, add_single: bool = True) -> MorphemeList: + def split(self, + mode: Union[SplitMode, SplitModeStr], + out: Optional[MorphemeList] = None, + add_single: bool = True) -> MorphemeList: """ Returns sub-morphemes in the provided split mode. - :param mode: mode of new split - :param out: write results to this MorhpemeList instead of creating new one + :param mode: mode of new split. + :param out: write results to this MorphemeList instead of creating new one. See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for more information on output parameters. Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. @@ -230,6 +264,7 @@ class Morpheme: def surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + See `Config.projection`. """ ... @@ -237,6 +272,7 @@ class Morpheme: def raw_surface(self) -> str: """ Returns the substring of input text corresponding to the morpheme regardless the configured projection. + See `Config.projection`. """ ... @@ -255,13 +291,14 @@ class Morpheme: def __len__(self) -> int: """ - Returns morpheme length in codepoints + Returns morpheme length in codepoints. """ class MorphemeList: """ A list of morphemes. + An object can not be instantiated manually. Use Tokenizer.tokenize("") to create an empty morpheme list. """ @@ -269,9 +306,12 @@ class MorphemeList: def __init__(self) -> None: ... @classmethod - def empty(cls, dict) -> MorphemeList: + def empty(cls, dict: Dictionary) -> MorphemeList: """ Returns an empty morpheme list with dictionary. + + .. deprecated:: + Use Tokenizer.tokenize("") if you need. """ ... @@ -287,29 +327,35 @@ class MorphemeList: """ ... - def __getitem__(self, index) -> Morpheme: ... + def __getitem__(self, index: int) -> Morpheme: ... def __iter__(self) -> Iterator[Morpheme]: ...
def __len__(self) -> int: ... class Tokenizer: + """ + A sudachi tokenizer + + Create using Dictionary.create method. + """ SplitMode: ClassVar[SplitMode] = ... + @classmethod def __init__(cls) -> None: ... - def tokenize(self, text: str, - mode: Union[SplitMode, Literal["A", "B", "C"]] = ..., + def tokenize(self, + text: str, + mode: Union[SplitMode, SplitModeStr, None] = None, out: Optional[MorphemeList] = None) -> MorphemeList: """ Break text into morphemes. - SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - - :param text: text to analyze + :param text: text to analyze. :param mode: analysis mode. This parameter is deprecated. Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + :param logger: Arg for v0.5.* compatibility. Ignored. :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. """ @@ -336,47 +382,51 @@ class WordInfo: surface: ClassVar[str] = ... synonym_group_ids: ClassVar[List[int]] = ... word_structure: ClassVar[List[int]] = ... + @classmethod def __init__(self) -> None: ... def length(self) -> int: ... class PosMatcher: + """ + A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. + + Create using Dictionary.pos_matcher method. + """ + def __iter__(self) -> Iterator[POS]: ... def __len__(self) -> int: ... - def __call__(self, m: Morpheme) -> bool: + def __call__(self, /, m: Morpheme) -> bool: """ - Checks whether a morpheme has matching POS - :param m: morpheme - :return: if morpheme has matching POS + Checks whether a morpheme has matching POS. + + :param m: a morpheme to check. + :return: if morpheme has matching POS. """ ... 
def __or__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if any of two matchers would match it - :return: PosMatcher + Returns a POS matcher which matches a POS if any of two matchers would match it. """ ... def __and__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if both matchers would match it at the same time - :return: PosMatcher + Returns a POS matcher which matches a POS if both matchers would match it at the same time. """ ... def __sub__(self, other: PosMatcher) -> PosMatcher: """ - Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS - :return: PosMatcher + Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. """ ... def __invert__(self) -> PosMatcher: """ - Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher - :return: PosMatcher + Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. """ ... diff --git a/python/src/build.rs b/python/src/build.rs index 6b3bd0ca..8fee1ff8 100644 --- a/python/src/build.rs +++ b/python/src/build.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,17 +26,17 @@ use sudachi::config::Config; use sudachi::dic::build::{DataSource, DictBuilder}; use sudachi::dic::dictionary::JapaneseDictionary; -pub fn register_functions(m: &PyModule) -> PyResult<()> { +pub fn register_functions(m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(build_system_dic, m)?)?; m.add_function(wrap_pyfunction!(build_user_dic, m)?)?; Ok(()) } -fn to_stats(py: Python, builder: DictBuilder) -> PyResult<&PyList> { - let stats = PyList::empty(py); +fn to_stats(py: Python, builder: DictBuilder) -> PyResult> { + let stats = PyList::empty_bound(py); for p in builder.report() { - let t = PyTuple::new( + let t = PyTuple::new_bound( py, [ p.part().into_py(py), @@ -58,25 +58,40 @@ fn create_file(p: &Path) -> std::io::Result { OpenOptions::new().create_new(true).write(true).open(p) } +/// Build system dictionary from matrix and lexicons. +/// +/// :param matrix: Path to the matrix file. +/// :param lex: List of paths to lexicon files. +/// :param output: Path to output built dictionary. +/// :param description: A description text to embed in the dictionary. +/// :return: A build report, list of (part, size, time).
+/// +/// :type matrix: pathlib.Path | str | bytes +/// :type lex: list[pathlib.Path | str | bytes] +/// :type output: pathlib.Path | str +/// :type description: str #[pyfunction] -#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list")] -fn build_system_dic<'p>( - py: Python<'p>, - matrix: &'p PyAny, - lex: &'p PyList, - output: &'p PyAny, +#[pyo3(text_signature = "(matrix, lex, output, description=None) -> list[tuple[str, int, float]]")] +fn build_system_dic<'py>( + py: Python<'py>, + matrix: &Bound<'py, PyAny>, + lex: &Bound<'py, PyList>, + output: &Bound<'py, PyAny>, description: Option<&str>, -) -> PyResult<&'p PyList> { +) -> PyResult> { let mut builder = DictBuilder::new_system(); description.map(|d| builder.set_description(d)); - let matrix_src = as_data_source(py, matrix)?; + let matrix_path = resolve_as_pypathstr(py, matrix)?; + let matrix_src = as_data_source(matrix_path.as_ref(), matrix)?; errors::wrap_ctx(builder.read_conn(matrix_src), matrix)?; for f in lex.iter() { - let lex_src = as_data_source(py, &f)?; + let lex_path = resolve_as_pypathstr(py, &f)?; + let lex_src = as_data_source(lex_path.as_ref(), &f)?; errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?; } - let out_file = match as_data_source(py, output)? { + let out_path = resolve_as_pypathstr(py, output)?; + let out_file = match as_data_source(out_path.as_ref(), output)? { DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?, DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")), }; @@ -87,16 +102,29 @@ fn build_system_dic<'p>( to_stats(py, builder) } +/// Build user dictionary from lexicons based on the given system dictionary. +/// +/// :param system: Path to the system dictionary. +/// :param lex: List of paths to lexicon files. +/// :param output: Path to output built dictionary. +/// :param description: A description text to embed in the dictionary. +/// :return: A build report, list of (part, size, time).
+/// +/// :type system: pathlib.Path | str +/// :type lex: list[pathlib.Path | str | bytes] +/// :type output: pathlib.Path | str +/// :type description: str #[pyfunction] -#[pyo3(text_signature = "(system, lex, output, description=None) -> list")] -fn build_user_dic<'p>( - py: Python<'p>, - system: &'p PyAny, - lex: &'p PyList, - output: &'p PyAny, +#[pyo3(text_signature = "(system, lex, output, description=None) -> list[tuple[str, int, float]]")] +fn build_user_dic<'py>( + py: Python<'py>, + system: &Bound<'py, PyAny>, + lex: &Bound<'py, PyList>, + output: &Bound<'py, PyAny>, description: Option<&str>, -) -> PyResult<&'p PyList> { - let system_dic = match as_data_source(py, system)? { +) -> PyResult> { + let system_path = resolve_as_pypathstr(py, system)?; + let system_dic = match as_data_source(system_path.as_ref(), system)? { DataSource::File(f) => { let resource_path = get_default_resource_dir(py)?; let cfg = Config::minimal_at(resource_path).with_system_dic(f); @@ -113,10 +141,12 @@ fn build_user_dic<'p>( description.map(|d| builder.set_description(d)); for f in lex.iter() { - let lex_src = as_data_source(py, &f)?; + let lex_path = resolve_as_pypathstr(py, &f)?; + let lex_src = as_data_source(lex_path.as_ref(), &f)?; errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?; } - let out_file = match as_data_source(py, output)? { + let out_path = resolve_as_pypathstr(py, output)?; + let out_file = match as_data_source(out_path.as_ref(), output)? { DataSource::File(p) => errors::wrap_ctx(create_file(p), p)?, DataSource::Data(_) => return errors::wrap(Err("can't use bytes for output")), }; @@ -127,25 +157,39 @@ fn build_user_dic<'p>( to_stats(py, builder) } -fn as_data_source<'p>(py: Python<'p>, data: &'p PyAny) -> PyResult> { - let path = py - .import("pathlib")? - .getattr("Path")? 
- .downcast::()?; +fn resolve_as_pypathstr<'py>( + py: Python<'py>, + data: &Bound<'py, PyAny>, +) -> PyResult>> { + let binding = py.import_bound("pathlib")?.getattr("Path")?; + let path = binding.downcast::()?; if data.is_instance(path)? { - let pypath = data.call_method0("resolve")?.str()?; - Ok(DataSource::File(Path::new(pypath.to_str()?))) + Ok(Some(data.call_method0("resolve")?.str()?)) } else if data.is_instance_of::() { - let pypath = data.str()?; - Ok(DataSource::File(Path::new(pypath.to_str()?))) - } else if data.is_instance_of::() { - let data = data.downcast::()?; - Ok(DataSource::Data(data.as_bytes())) + Ok(Some(data.str()?)) } else { - errors::wrap(Err(format!( - "data source should be Path, bytes or str, was {}: {}", - data, - data.get_type() - ))) + Ok(None) + } +} + +fn as_data_source<'py>( + resolved_path: Option<&'py Bound<'py, PyString>>, + original_obj: &'py Bound<'py, PyAny>, +) -> PyResult> { + match resolved_path { + Some(pystr) => Ok(DataSource::File(Path::new(pystr.to_str()?))), + None => { + if original_obj.is_instance_of::() { + Ok(DataSource::Data( + original_obj.downcast::()?.as_bytes(), + )) + } else { + errors::wrap(Err(format!( + "data source should be only Path, bytes or str, was {}: {}", + original_obj, + original_obj.get_type() + ))) + } + } } } diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 802e23c2..30fbeda6 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,24 @@ impl PyDicData { } } -/// A sudachi dictionary +/// A sudachi dictionary. +/// +/// If both config.systemDict and dict are not given, `sudachidict_core` is used. +/// If both config.systemDict and dict are given, dict is used. 
+/// If dict is an absolute path to a file, it is used as a dictionary. +/// +/// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. +/// :param config: alias to config_path, only one of them can be specified at the same time. +/// :param resource_dir: path to the resource directory folder. +/// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. +/// Also, can be an _absolute_ path to a compiled dictionary file. +/// :param dict_type: deprecated alias to dict. +/// +/// :type config_path: Config | pathlib.Path | str | None +/// :type config: Config | pathlib.Path | str | None +/// :type resource_dir: pathlib.Path | str | None +/// :type dict: pathlib.Path | str | None +/// :type dict_type: pathlib.Path | str | None #[pyclass(module = "sudachipy.dictionary", name = "Dictionary")] #[derive(Clone)] pub struct PyDictionary { @@ -90,24 +107,34 @@ pub struct PyDictionary { impl PyDictionary { /// Creates a sudachi dictionary. /// - /// If both config.systemDict and dict_type are not given, `sudachidict_core` is used. - /// If both config.systemDict and dict_type are given, dict_type is used. - /// If dict is an absolute path to a file, it is used as a dictionary + /// If both config.systemDict and dict are not given, `sudachidict_core` is used. + /// If both config.systemDict and dict are given, dict is used. + /// If dict is an absolute path to a file, it is used as a dictionary. /// - /// :param config_path: path to the configuration JSON file - /// :param resource_dir: path to the resource directory folder - /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. + /// :param config_path: path to the configuration JSON file, config json as a string, or a [sudachipy.Config] object. 
+ /// :param config: alias to config_path, only one of them can be specified at the same time. + /// :param resource_dir: path to the resource directory folder. + /// :param dict: type of pre-packaged dictionary, referring to sudachidict_ packages on PyPI: https://pypi.org/search/?q=sudachidict. /// Also, can be an _absolute_ path to a compiled dictionary file. - /// :param dict_type: deprecated alias to dict + /// :param dict_type: deprecated alias to dict. + /// + /// :type config_path: Config | pathlib.Path | str | None + /// :type config: Config | pathlib.Path | str | None + /// :type resource_dir: pathlib.Path | str | None + /// :type dict: pathlib.Path | str | None + /// :type dict_type: pathlib.Path | str | None #[new] - #[pyo3(signature=(config_path = None, resource_dir = None, dict = None, dict_type = None, *, config = None))] + #[pyo3( + text_signature="(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) -> Dictionary", + signature=(config_path=None, resource_dir=None, dict=None, dict_type=None, *, config=None) + )] fn new( py: Python, - config_path: Option<&PyAny>, + config_path: Option<&Bound>, resource_dir: Option, dict: Option<&str>, dict_type: Option<&str>, - config: Option<&PyAny>, + config: Option<&Bound>, ) -> PyResult { if config.is_some() && config_path.is_some() { return errors::wrap(Err("Both config and config_path options were specified at the same time, use one of them")); @@ -184,7 +211,7 @@ impl PyDictionary { .pos_list .iter() .map(|pos| { - let tuple: Py = PyTuple::new(py, pos).into_py(py); + let tuple: Py = PyTuple::new_bound(py, pos).into_py(py); tuple }) .collect(); @@ -211,22 +238,26 @@ impl PyDictionary { /// Creates a sudachi tokenizer. /// - /// :param mode: tokenizer's default split mode (C by default). + /// :param mode: sets the analysis mode for this Tokenizer /// :param fields: load only a subset of fields. 
- /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. + /// + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type projection: str | None #[pyo3( - text_signature = "($self, mode = 'C') -> sudachipy.Tokenizer", - signature = (mode = None, fields = None, *, projection = None) + text_signature="(self, /, mode=SplitMode.C, fields=None, *, projection=None) -> Tokenizer", + signature=(mode=None, fields=None, *, projection=None) )] fn create<'py>( &'py self, - py: Python<'py>, - mode: Option<&'py PyAny>, - fields: Option<&'py PySet>, - projection: Option<&'py PyString>, + mode: Option<&Bound<'py, PyAny>>, + fields: Option<&Bound<'py, PySet>>, + projection: Option<&Bound<'py, PyString>>, ) -> PyResult { let mode = match mode { - Some(m) => extract_mode(py, m)?, + Some(m) => extract_mode(m)?, None => Mode::C, }; let fields = parse_field_subset(fields)?; @@ -249,16 +280,20 @@ impl PyDictionary { /// Creates a POS matcher object /// /// If target is a function, then it must return whether a POS should match or not. - /// If target a list, it should contain partially specified POS. - /// By partially specified it means that it is possible to omit POS fields or - /// use None as a sentinel value that matches any POS. + /// If target is a list, it should contain partially specified POS. + /// By partially specified it means that it is possible to omit POS fields or use None as a sentinel value that matches any POS. /// /// For example, ('名詞',) will match any noun and /// (None, None, None, None, None, '終止形‐一般') will match any word in 終止形‐一般 conjugation form. 
/// - /// :param target: can be either a callable or list of POS partial tuples - #[pyo3(text_signature = "($self, target)")] - fn pos_matcher<'py>(&'py self, py: Python<'py>, target: &PyAny) -> PyResult { + /// :param target: can be either a list of POS partial tuples or a callable which maps POS to bool. + /// + /// :type target: Iterable[PartialPOS] | Callable[[POS], bool] + fn pos_matcher<'py>( + &'py self, + py: Python<'py>, + target: &Bound<'py, PyAny>, + ) -> PyResult { PyPosMatcher::create(py, self.dictionary.as_ref().unwrap(), target) } @@ -267,35 +302,35 @@ impl PyDictionary { /// /// :param mode: Use this split mode (C by default) /// :param fields: ask Sudachi to load only a subset of fields. - /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html - /// :param handler: a custom callable to transform MorphemeList into list of tokens. - /// It should be should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. - /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py - /// If nothing was passed, simply use surface as token representations. - /// :param projection: projection mode for a created PreTokenizer. - /// See :class:`sudachipy.config.Config` object documentation for supported projections. + /// See https://worksapplications.github.io/sudachi.rs/python/topics/subsetting.html. + /// :param handler: a custom callable to transform MorphemeList into list of tokens. If None, simply use surface as token representations. + /// It should be a `function(index: int, original: NormalizedString, morphemes: MorphemeList) -> List[NormalizedString]`. + /// See https://github.com/huggingface/tokenizers/blob/master/bindings/python/examples/custom_components.py. + /// :param projection: Projection override for created Tokenizer. See Config.projection for values. 
/// - /// :type mode: sudachipy.SplitMode - /// :type fields: Set[str] + /// :type mode: SplitMode | str | None + /// :type fields: set[str] | None + /// :type handler: Callable[[int, NormalizedString, MorphemeList], list[NormalizedString]] | None + /// :type projection: str | None #[pyo3( - text_signature = "($self, mode, fields, handler) -> tokenizers.PreTokenizer", - signature = (mode = None, fields = None, handler = None, *, projection = None) + text_signature="(self, /, mode=None, fields=None, handler=None, *, projection=None) -> tokenizers.PreTokenizer", + signature=(mode=None, fields=None, handler=None, *, projection=None) )] - fn pre_tokenizer<'p>( - &'p self, - py: Python<'p>, - mode: Option<&PyAny>, - fields: Option<&PySet>, + fn pre_tokenizer<'py>( + &'py self, + py: Python<'py>, + mode: Option<&Bound<'py, PyAny>>, + fields: Option<&Bound<'py, PySet>>, handler: Option>, - projection: Option<&PyString>, - ) -> PyResult<&'p PyAny> { + projection: Option<&Bound<'py, PyString>>, + ) -> PyResult> { let mode = match mode { - Some(m) => extract_mode(py, m)?, + Some(m) => extract_mode(m)?, None => Mode::C, }; let subset = parse_field_subset(fields)?; if let Some(h) = handler.as_ref() { - if !h.as_ref(py).is_callable() { + if !h.bind(py).is_callable() { return errors::wrap(Err("handler must be callable")); } } @@ -314,12 +349,12 @@ impl PyDictionary { let projector = resolve_projection(passed, &dict.projection); let internal = PyPretokenizer::new(dict, mode, required_fields, handler, projector); - let internal_cell = PyCell::new(py, internal)?; - let module = py.import("tokenizers.pre_tokenizers")?; + let internal_cell = Bound::new(py, internal)?; + let module = py.import_bound("tokenizers.pre_tokenizers")?; module .getattr("PreTokenizer")? .getattr("custom")? - .call1(PyTuple::new(py, [internal_cell])) + .call1(PyTuple::new_bound(py, [internal_cell])) } /// Look up morphemes in the binary dictionary without performing the analysis. 
@@ -332,21 +367,22 @@ impl PyDictionary { /// :param surface: find all morphemes with the given surface /// :param out: if passed, reuse the given morpheme list instead of creating a new one. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. + /// /// :type surface: str - /// :type out: sudachipy.MorphemeList - #[pyo3(text_signature = "($self, surface, out = None) -> sudachipy.MorphemeList")] - fn lookup<'p>( - &'p self, - py: Python<'p>, - surface: &'p str, - out: Option<&'p PyCell>, - ) -> PyResult<&'p PyCell> { + /// :type out: MorphemeList | None + #[pyo3(text_signature = "(self, /, surface, out=None) -> MorphemeList")] + fn lookup<'py>( + &'py self, + py: Python<'py>, + surface: &'py str, + out: Option>, + ) -> PyResult> { let l = match out { Some(l) => l, - None => PyCell::new( - py, - PyMorphemeListWrapper::new(self.dictionary.clone().unwrap()), - )?, + None => { + let list = PyMorphemeListWrapper::new(self.dictionary.clone().unwrap()); + Bound::new(py, list)? + } }; // this needs to be a variable @@ -361,17 +397,22 @@ impl PyDictionary { Ok(l) } - /// Close this dictionary - #[pyo3(text_signature = "($self)")] + /// Close this dictionary. + #[pyo3(text_signature = "(self, /) -> ()")] fn close(&mut self) { self.dictionary = None; } - /// Get POS Tuple by its id - #[pyo3(text_signature = "($self, pos_id: int)")] - fn pos_of<'p>(&'p self, py: Python<'p>, pos_id: usize) -> Option<&'p PyTuple> { + /// Returns POS with the given id. + /// + /// :param pos_id: POS id + /// :return: POS tuple with the given id or None for non existing id. 
+ /// + /// :type pos_id: int + #[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")] + fn pos_of<'py>(&'py self, py: Python<'py>, pos_id: usize) -> Option<&Bound<'py, PyTuple>> { let dic = self.dictionary.as_ref().unwrap(); - dic.pos.get(pos_id).map(|x| x.as_ref(py)) + dic.pos.get(pos_id).map(|x| x.bind(py)) } fn __repr__(&self) -> PyResult { @@ -406,7 +447,7 @@ fn config_repr(cfg: &Config) -> Result { Ok(result) } -pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult { +pub(crate) fn extract_mode<'py>(mode: &Bound<'py, PyAny>) -> PyResult { if mode.is_instance_of::() { errors::wrap(Mode::from_str(mode.str()?.to_str()?)) } else if mode.is_instance_of::() { @@ -425,9 +466,10 @@ fn read_config_from_fs(path: Option<&Path>) -> PyResult { errors::wrap(ConfigBuilder::from_opt_file(path)) } -fn read_config(config_opt: &PyAny) -> PyResult { +fn read_config(config_opt: &Bound) -> PyResult { if config_opt.is_instance_of::() { - let config_str = config_opt.str()?.to_str()?.trim(); + let config_pystr = config_opt.str()?; + let config_str = config_pystr.to_str()?.trim(); // looks like json if config_str.starts_with("{") && config_str.ends_with("}") { let result = ConfigBuilder::from_bytes(config_str.as_bytes()); @@ -443,10 +485,10 @@ fn read_config(config_opt: &PyAny) -> PyResult { ))); } let py = config_opt.py(); - let cfg_type = py.import("sudachipy.config")?.getattr("Config")?; - if config_opt.is_instance(cfg_type)? { + let cfg_type = py.import_bound("sudachipy.config")?.getattr("Config")?; + if config_opt.is_instance(&cfg_type)? 
{ let cfg_as_str = config_opt.call_method0("as_jsons")?; - return read_config(cfg_as_str); + return read_config(&cfg_as_str); } errors::wrap(Err(format!( "config should be sudachipy.Config or str which represents a file path or json obj, was {}: {}", @@ -456,24 +498,26 @@ fn read_config(config_opt: &PyAny) -> PyResult { } pub(crate) fn read_default_config(py: Python) -> PyResult { - let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?; + let path = py + .import_bound("sudachipy")? + .getattr("_DEFAULT_SETTINGFILE")?; let path = path.downcast::()?.to_str()?; let path = PathBuf::from(path); errors::wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path) } pub(crate) fn get_default_resource_dir(py: Python) -> PyResult { - let path = PyModule::import(py, "sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?; + let path = py + .import_bound("sudachipy")? + .getattr("_DEFAULT_RESOURCEDIR")?; let path = path.downcast::()?.to_str()?; Ok(PathBuf::from(path)) } fn find_dict_path(py: Python, dict_type: &str) -> PyResult { - let pyfunc = PyModule::import(py, "sudachipy")?.getattr("_find_dict_path")?; - let path = pyfunc - .call1((dict_type,))? - .downcast::()? - .to_str()?; + let pyfunc = py.import_bound("sudachipy")?.getattr("_find_dict_path")?; + let path = pyfunc.call1((dict_type,))?; + let path = path.downcast::()?.to_str()?; Ok(PathBuf::from(path)) } @@ -487,15 +531,14 @@ fn locate_system_dict(py: Python, path: &Path) -> PyResult { } } -fn parse_field_subset(data: Option<&PySet>) -> PyResult { +fn parse_field_subset(data: Option<&Bound>) -> PyResult { if data.is_none() { return Ok(InfoSubset::all()); } let mut subset = InfoSubset::empty(); - for el in data.unwrap().iter() { - let s = el.str()?.to_str()?; - subset |= match s { + for elem in data.unwrap().iter() { + subset |= match elem.str()?.to_str()? 
{ "surface" => InfoSubset::SURFACE, "pos" | "pos_id" => InfoSubset::POS_ID, "normalized_form" => InfoSubset::NORMALIZED_FORM, diff --git a/python/src/errors.rs b/python/src/errors.rs index da72601a..6fdd0e97 100644 --- a/python/src/errors.rs +++ b/python/src/errors.rs @@ -37,5 +37,5 @@ pub fn wrap_ctx(v: Result, ctx: &C) -> P } pub fn warn_deprecation(py: Python<'_>, msg: &str) -> PyResult<()> { - PyErr::warn(py, &py.get_type::(), msg, 1) + PyErr::warn_bound(py, &py.get_type_bound::(), msg, 1) } diff --git a/python/src/lib.rs b/python/src/lib.rs index 68a9c91d..3a5f8f84 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,15 +26,18 @@ mod projection; mod tokenizer; mod word_info; -/// module root +/// SudachiPy raw module root. +/// +/// Users should not use this directly. #[pymodule] -fn sudachipy(_py: Python, m: &PyModule) -> PyResult<()> { +fn sudachipy(_py: Python, m: &Bound) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; build::register_functions(m)?; Ok(()) } diff --git a/python/src/morpheme.rs b/python/src/morpheme.rs index fd097336..3cf1bc5d 100644 --- a/python/src/morpheme.rs +++ b/python/src/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,10 @@ use crate::word_info::PyWordInfo; pub(crate) type PyMorphemeList = MorphemeList>; pub(crate) type PyProjector = Option>; -/// A list of morphemes +/// A list of morphemes. 
+/// +/// An object can not be instantiated manually. +/// Use Tokenizer.tokenize("") to create an empty morpheme list. #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeList")] pub struct PyMorphemeListWrapper { /// use `internal()` function instead @@ -87,12 +90,16 @@ impl PyMorphemeListWrapper { } } } + #[pymethods] impl PyMorphemeListWrapper { - /// Returns an empty morpheme list with dictionary + /// Returns an empty morpheme list with dictionary. + /// + /// .. deprecated:: 0.6.0 + /// Use Tokenizer.tokenize("") if you need. #[classmethod] - #[pyo3(text_signature = "(dict: sudachipy.Dictionary) -> sudachipy.MorphemeList")] - fn empty(_cls: &PyType, py: Python, dict: &PyDictionary) -> PyResult { + #[pyo3(text_signature = "(dict: Dictionary) -> MorphemeList")] + fn empty(_cls: &Bound, py: Python, dict: &PyDictionary) -> PyResult { errors::warn_deprecation( py, "Use Tokenizer.tokenize(\"\") if you need an empty MorphemeList.", @@ -106,14 +113,14 @@ impl PyMorphemeListWrapper { }) } - /// Returns the total cost of the path - #[pyo3(text_signature = "($self) -> int")] + /// Returns the total cost of the path. + #[pyo3(text_signature = "(self, /) -> int")] fn get_internal_cost(&self, py: Python) -> i32 { self.internal(py).get_internal_cost() } /// Returns the number of morpheme in this list. 
- #[pyo3(text_signature = "($self) -> int")] + #[pyo3(text_signature = "(self, /) -> int")] fn size(&self, py: Python) -> usize { self.internal(py).len() } @@ -122,7 +129,7 @@ impl PyMorphemeListWrapper { self.size(py) } - fn __getitem__(slf: &PyCell, mut idx: isize) -> PyResult { + fn __getitem__(slf: Bound, mut idx: isize) -> PyResult { let list = slf.borrow(); let py = slf.py(); let len = list.size(py) as isize; @@ -148,7 +155,7 @@ impl PyMorphemeListWrapper { }) } - fn __str__<'py>(&'py self, py: Python<'py>) -> &PyString { + fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { // do a simple tokenization __str__ let list = self.internal(py); let mut result = String::with_capacity(list.surface().len() * 2); @@ -159,10 +166,10 @@ impl PyMorphemeListWrapper { result.push_str(" "); } } - PyString::new(py, result.as_str()) + PyString::new_bound(py, result.as_str()) } - fn __repr__(slf: Py, py: Python) -> PyResult<&PyString> { + fn __repr__(slf: Py, py: Python) -> PyResult> { let self_ref = slf.borrow(py); let list = self_ref.internal(py); let mut result = String::with_capacity(list.surface().len() * 10); @@ -178,7 +185,7 @@ impl PyMorphemeListWrapper { result.push_str(",\n"); } result.push_str("]>"); - Ok(PyString::new(py, result.as_str())) + Ok(PyString::new_bound(py, result.as_str())) } fn __iter__(slf: Py) -> PyMorphemeIter { @@ -193,7 +200,7 @@ impl PyMorphemeListWrapper { } } -/// A morpheme (basic semantic unit of language). +/// An iterator over the MorphemeList. #[pyclass(module = "sudachipy.morphemelist", name = "MorphemeIter")] pub struct PyMorphemeIter { list: Py, @@ -237,6 +244,7 @@ impl<'py> Deref for MorphemeRef<'py> { } } +/// A morpheme (basic semantic unit of language). 
#[pyclass(module = "sudachipy.morpheme", name = "Morpheme", frozen)] pub struct PyMorpheme { list: Py, @@ -272,40 +280,44 @@ impl PyMorpheme { #[pymethods] impl PyMorpheme { - /// Returns the begin index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + /// Returns the begin index of this in the input text. + #[pyo3(text_signature = "(self, /) -> int")] fn begin(&self, py: Python) -> usize { // call codepoint version self.morph(py).begin_c() } - /// Returns the end index of this in the input text - #[pyo3(text_signature = "($self) -> int")] + /// Returns the end index of this in the input text. + #[pyo3(text_signature = "(self, /) -> int")] fn end(&self, py: Python) -> usize { // call codepoint version self.morph(py).end_c() } - /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured - #[pyo3(text_signature = "($self) -> str")] - fn surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { + /// Returns the substring of input text corresponding to the morpheme, or a projection if one is configured. + /// + /// See `Config.projection`. + #[pyo3(text_signature = "(self, /) -> str")] + fn surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { let list = self.list(py); let morph = self.morph(py); match list.projection() { - None => PyString::new(py, morph.surface().deref()), + None => PyString::new_bound(py, morph.surface().deref()), Some(proj) => proj.project(morph.deref(), py), } } - /// Returns the substring of input text corresponding to the morpheme regardless the configured projection - #[pyo3(text_signature = "($self) -> str")] - fn raw_surface<'py>(&'py self, py: Python<'py>) -> &'py PyString { - PyString::new(py, self.morph(py).surface().deref()) + /// Returns the substring of input text corresponding to the morpheme regardless the configured projection. + /// + /// See `Config.projection`. 
+ #[pyo3(text_signature = "(self, /) -> str")] + fn raw_surface<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, self.morph(py).surface().deref()) } /// Returns the part of speech as a six-element tuple. - /// Tuple elements are four POS levels, conjugation type and conjugation form. - #[pyo3(text_signature = "($self)")] + /// Tuple elements are four POS levels, conjugation type and conjugation form. + #[pyo3(text_signature = "(self, /) -> tuple[str, str, str, str, str, str]")] fn part_of_speech<'py>(&'py self, py: Python<'py>) -> Py { let pos_id = self.part_of_speech_id(py); self.list(py) @@ -315,60 +327,59 @@ impl PyMorpheme { .clone_ref(py) } - /// Returns the id of the part of speech in the dictionary - #[pyo3(text_signature = "($self) -> int")] + /// Returns the id of the part of speech in the dictionary. + #[pyo3(text_signature = "(self, /) -> int")] pub fn part_of_speech_id(&self, py: Python) -> u16 { self.morph(py).part_of_speech_id() } - /// Returns the dictionary form - #[pyo3(text_signature = "($self) -> str")] + /// Returns the dictionary form. + #[pyo3(text_signature = "(self, /) -> str")] fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().dictionary_form().into_py(py) } - /// Returns the normalized form - #[pyo3(text_signature = "($self) -> str")] + /// Returns the normalized form. + #[pyo3(text_signature = "(self, /) -> str")] fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().normalized_form().into_py(py) } - /// Returns the reading form - #[pyo3(text_signature = "($self) -> str")] + /// Returns the reading form. + #[pyo3(text_signature = "(self, /) -> str")] fn reading_form<'py>(&'py self, py: Python<'py>) -> PyObject { self.morph(py).get_word_info().reading_form().into_py(py) } /// Returns sub-morphemes in the provided split mode. 
/// - /// :param mode: mode of new split - /// :param out: write results to this MorhpemeList instead of creating new one + /// :param mode: mode of new split. + /// :param out: write results to this MorphemeList instead of creating new one. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for /// more information on output parameters. /// Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter. /// :param add_single: return lists with the current morpheme if the split hasn't produced any elements. /// When False is passed, empty lists are returned instead. - /// :type mode: sudachipy.SplitMode - /// :type out: Optional[sudachipy.MorphemeList] + /// + /// :type mode: SplitMode | None + /// :type out: MorphemeList | None /// :type add_single: bool - #[pyo3( - text_signature = "($self, mode, out = None, add_single = False) -> sudachipy.MorphemeList" - )] + #[pyo3(text_signature = "(self, /, mode, out=None, add_single=False) -> MorphemeList")] fn split<'py>( &'py self, py: Python<'py>, - mode: &PyAny, - out: Option<&'py PyCell>, + mode: &Bound<'py, PyAny>, + out: Option>, add_single: Option, - ) -> PyResult<&'py PyCell> { + ) -> PyResult> { let list = self.list(py); - let mode = extract_mode(py, mode)?; + let mode = extract_mode(mode)?; let out_cell = match out { None => { let list = list.empty_clone(py); - PyCell::new(py, list)? + Bound::new(py, list)? } Some(r) => r, }; @@ -393,20 +404,20 @@ impl PyMorpheme { Ok(out_cell) } - /// Returns whether if this is out of vocabulary word - #[pyo3(text_signature = "($self) -> bool")] + /// Returns whether this is an out-of-vocabulary word. + #[pyo3(text_signature = "(self, /) -> bool")] fn is_oov(&self, py: Python) -> bool { self.morph(py).is_oov() } - /// Returns word id of this word in the dictionary - #[pyo3(text_signature = "($self) -> int")] + /// Returns word id of this word in the dictionary.
+ #[pyo3(text_signature = "(self, /) -> int")] fn word_id(&self, py: Python) -> u32 { self.morph(py).word_id().as_raw() } - /// Returns the dictionary id which this word belongs - #[pyo3(text_signature = "($self) -> int")] + /// Returns the dictionary id which this word belongs. + #[pyo3(text_signature = "(self, /) -> int")] fn dictionary_id(&self, py: Python) -> i32 { let word_id = self.morph(py).word_id(); if word_id.is_oov() { @@ -416,28 +427,31 @@ impl PyMorpheme { } } - /// Returns the list of synonym group ids - #[pyo3(text_signature = "($self) -> List[int]")] - fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> &'py PyList { + /// Returns the list of synonym group ids. + #[pyo3(text_signature = "(self, /) -> List[int]")] + fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> Bound { let mref = self.morph(py); let ids = mref.get_word_info().synonym_group_ids(); - PyList::new(py, ids) + PyList::new_bound(py, ids) } - /// Returns the word info - #[pyo3(text_signature = "($self) -> sudachipy.WordInfo")] + /// Returns the word info. + /// + /// .. deprecated:: 0.6.0 + /// Users should not touch the raw WordInfo. + #[pyo3(text_signature = "(self, /) -> WordInfo")] fn get_word_info(&self, py: Python) -> PyResult { errors::warn_deprecation(py, "Users should not touch the raw WordInfo.")?; Ok(self.morph(py).get_word_info().clone().into()) } - /// Returns morpheme length in codepoints + /// Returns morpheme length in codepoints. pub fn __len__(&self, py: Python) -> usize { let m = self.morph(py); m.end_c() - m.begin_c() } - pub fn __str__<'py>(&'py self, py: Python<'py>) -> &'py PyString { + pub fn __str__<'py>(&'py self, py: Python<'py>) -> Bound<'py, PyString> { self.surface(py) } diff --git a/python/src/pos_matcher.rs b/python/src/pos_matcher.rs index f0a53b64..352d21c1 100644 --- a/python/src/pos_matcher.rs +++ b/python/src/pos_matcher.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,12 @@ use crate::dictionary::PyDicData; use crate::errors; use crate::morpheme::PyMorpheme; -#[pyclass(name = "PosMatcher", module = "sudachipy")] +/// A part-of-speech matcher which checks if a morpheme belongs to a set of part of speech. +/// +/// Create using Dictionary.pos_matcher method. +/// +/// Use `__call__(m: Morpheme) -> bool` to check whether a morpheme has matching POS. +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcher")] pub struct PyPosMatcher { matcher: PosMatcher, dic: Arc, @@ -36,20 +41,20 @@ impl PyPosMatcher { pub(crate) fn create<'py>( py: Python<'py>, dic: &'py Arc, - data: &'py PyAny, + data: &Bound<'py, PyAny>, ) -> PyResult { if data.is_callable() { Self::create_from_fn(dic, data, py) } else { let iter = data.iter()?; - Self::create_from_iter(dic, iter) + Self::create_from_iter(dic, &iter) } } - fn create_from_fn(dic: &Arc, func: &PyAny, py: Python) -> PyResult { + fn create_from_fn(dic: &Arc, func: &Bound, py: Python) -> PyResult { let mut data = Vec::new(); for (pos_id, pos) in dic.pos.iter().enumerate() { - let args = PyTuple::new(py, &[pos]); + let args = PyTuple::new_bound(py, &[pos]); if func.call1(args)?.downcast::()?.is_true() { data.push(pos_id as u16); } @@ -60,10 +65,11 @@ impl PyPosMatcher { }) } - fn create_from_iter(dic: &Arc, data: &PyIterator) -> PyResult { + fn create_from_iter(dic: &Arc, data: &Bound) -> PyResult { let mut result = Vec::new(); for item in data { - let item = item?.downcast::()?; + let item = item?; + let item = item.downcast::()?; Self::match_pos_elements(&mut result, dic.as_ref(), item)?; } Ok(Self { @@ -72,7 +78,11 @@ impl PyPosMatcher { }) } - fn match_pos_elements(data: &mut Vec, dic: &PyDicData, elem: &PyTuple) -> PyResult<()> { + fn match_pos_elements( + data: &mut Vec, + dic: 
&PyDicData, + elem: &Bound, + ) -> PyResult<()> { let start_len = data.len(); let elen = elem.len(); @@ -118,6 +128,12 @@ impl PyPosMatcher { #[pymethods] impl PyPosMatcher { + /// Checks whether a morpheme has matching POS. + /// + /// :param m: a morpheme to check. + /// :return: if morpheme has matching POS. + /// + /// :type m: Morpheme pub fn __call__<'py>(&'py self, py: Python<'py>, m: &'py PyMorpheme) -> bool { let pos_id = m.part_of_speech_id(py); self.matcher.matches_id(pos_id) @@ -135,6 +151,7 @@ impl PyPosMatcher { self.matcher.num_entries() } + /// Returns a POS matcher which matches a POS if any of two matchers would match it. pub fn __or__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -148,6 +165,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if both matchers would match it at the same time. pub fn __and__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -161,6 +179,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches a POS if self would match the POS and other would not match the POS. pub fn __sub__(&self, other: &Self) -> Self { assert_eq!( Arc::as_ptr(&self.dic), @@ -174,6 +193,7 @@ impl PyPosMatcher { } } + /// Returns a POS matcher which matches all POS tags except ones defined in the current POS matcher. 
pub fn __invert__(&self) -> Self { let max_id = self.dic.pos.len(); // map -> filter chain is needed to handle exactly u16::MAX POS entries @@ -189,7 +209,8 @@ impl PyPosMatcher { } } -#[pyclass(name = "PosMatcherIterator", module = "sudachipy")] +/// An iterator over POS tuples in the PosMatcher. +#[pyclass(module = "sudachipy.pos_matcher", name = "PosMatcherIterator")] pub struct PyPosIter { data: Vec, dic: Arc, @@ -210,11 +231,11 @@ impl PyPosIter { #[pymethods] impl PyPosIter { - fn __iter__(slf: &PyCell) -> &PyCell { + fn __iter__(slf: Bound) -> Bound { slf } - fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&'py PyTuple> { + fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&Bound<'py, PyTuple>> { let idx = self.index; self.index += 1; if idx >= self.data.len() { @@ -222,6 +243,6 @@ impl PyPosIter { } let pos_id = self.data[idx]; let pos = &self.dic.pos[pos_id as usize]; - Some(pos.as_ref(py)) + Some(pos.bind(py)) } } diff --git a/python/src/pretokenizer.rs b/python/src/pretokenizer.rs index 49cf1a29..3d72daab 100644 --- a/python/src/pretokenizer.rs +++ b/python/src/pretokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,9 +76,10 @@ impl PerThreadPreTokenizer { } } -/// Binding for the Tokenizer, which handles threading for tokenization +/// Binding for the Tokenizer, which handles threading for tokenization. /// -/// We use ThreadLocal for storing actual tokenizers +/// Create using Dictionary.pre_tokenizer method. +/// We use ThreadLocal for storing actual tokenizers.
#[pyclass(module = "sudachipy.pretokenizer", name = "SudachiPreTokenizer")] pub struct PyPretokenizer { dict: Arc, @@ -126,13 +127,14 @@ impl PyPretokenizer { /// /// Implementation uses Sudachi to perform the analysis, then uses slice method /// of the passed parameter to create output data - pub fn __call__<'p>( - &'p self, - py: Python<'p>, - index: &'p PyAny, - string: &'p PyAny, - ) -> PyResult<&'p PyAny> { - let input_data = string.str()?.to_str()?; + pub fn __call__<'py>( + &'py self, + py: Python<'py>, + index: &Bound<'py, PyAny>, + string: &Bound<'py, PyAny>, + ) -> PyResult> { + let pystr = string.str()?; + let input_data = pystr.to_str()?; // tokenization itself should work without GIL, we have thread-local tokenizers here py.allow_threads(|| self.tokenizer_cell().borrow_mut().tokenize(input_data))?; // then prepare results with GIL @@ -144,38 +146,38 @@ impl PyPretokenizer { let py_ref = morphs.borrow(py); let morphs = py_ref.internal(py); match self.projection.as_deref() { - None => make_result_for_surface(py, morphs, string), - Some(p) => make_result_for_projection(py, morphs, p), + None => make_result_for_surface(py, morphs, string).map(|bl| bl.into_any()), + Some(p) => make_result_for_projection(py, morphs, p).map(|bl| bl.into_any()), } } Some(h) => { - let mrp: &PyAny = morphs.as_ref(py); - let args = PyTuple::new(py, &[index, string, mrp]); - h.as_ref(py).call1(args) + let mrp: &Bound = morphs.bind(py); + let args = PyTuple::new_bound(py, &[index, string, mrp]); + h.bind(py).call1(args) } } } /// Entry function for tokenization - pub fn pre_tokenize<'p>( - self_: &'p PyCell, - py: Python<'p>, - data: &'p PyAny, - ) -> PyResult<&'p PyAny> { - data.call_method1("split", PyTuple::new(py, [self_])) + pub fn pre_tokenize<'py>( + self_: Bound<'py, Self>, + py: Python<'py>, + data: &Bound<'py, PyAny>, + ) -> PyResult> { + data.call_method1(intern!(py, "split"), PyTuple::new_bound(py, [self_])) } } fn make_result_for_surface<'py>( py: Python<'py>, morphs: 
&PyMorphemeList, - string: &'py PyAny, -) -> PyResult<&'py PyAny> { - let result = PyList::empty(py); + string: &Bound<'py, PyAny>, +) -> PyResult> { + let result = PyList::empty_bound(py); for idx in 0..morphs.len() { let node = morphs.get(idx); - let slice = PySlice::new(py, node.begin_c() as isize, node.end_c() as isize, 1); - let args = PyTuple::new(py, [slice]); + let slice = PySlice::new_bound(py, node.begin_c() as isize, node.end_c() as isize, 1); + let args = PyTuple::new_bound(py, [slice]); let substring = string.call_method1(intern!(py, "slice"), args)?; result.append(substring)?; } @@ -186,20 +188,20 @@ fn make_result_for_projection<'py>( py: Python<'py>, morphs: &PyMorphemeList, proj: &dyn MorphemeProjection, -) -> PyResult<&'py PyAny> { - let result = PyList::empty(py); +) -> PyResult> { + let result = PyList::empty_bound(py); let nstring = { - static NORMALIZED_STRING: GILOnceCell> = pyo3::sync::GILOnceCell::new(); + static NORMALIZED_STRING: GILOnceCell> = GILOnceCell::new(); NORMALIZED_STRING.get_or_try_init(py, || -> PyResult> { - let ns = py.import("tokenizers")?.getattr("NormalizedString")?; - let tpe = ns.downcast::(); - tpe.map(|x| x.into_py(py)).map_err(|e| e.into()) + let ns = py.import_bound("tokenizers")?.getattr("NormalizedString")?; + let tpe = ns.downcast::()?; + Ok(tpe.clone().unbind()) })? }; for idx in 0..morphs.len() { let node = morphs.get(idx); let value = proj.project(&node, py); - let args = PyTuple::new(py, [value]); + let args = PyTuple::new_bound(py, [value]); let substring = nstring.call1(py, args)?; result.append(substring)?; } diff --git a/python/src/projection.rs b/python/src/projection.rs index 9140e747..97d2c557 100644 --- a/python/src/projection.rs +++ b/python/src/projection.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Works Applications Co., Ltd. + * Copyright (c) 2023-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ use crate::dictionary::PyDicData; use crate::errors; use crate::morpheme::PyProjector; +use pyo3::prelude::*; use pyo3::types::PyString; use pyo3::{PyResult, Python}; use std::convert::TryFrom; @@ -28,14 +29,14 @@ use sudachi::pos::PosMatcher; use sudachi::prelude::Morpheme; pub(crate) trait MorphemeProjection { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py PyString; + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString>; } struct Surface {} impl MorphemeProjection for Surface { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py PyString { - PyString::new(py, m.surface().deref()) + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, m.surface().deref()) } } @@ -44,8 +45,8 @@ struct Mapped Fn(&'a Morpheme<'a, Arc>) -> &'a str> { } impl Fn(&'a Morpheme<'a, Arc>) -> &'a str> MorphemeProjection for Mapped { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py PyString { - PyString::new(py, (self.func)(m)) + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString> { + PyString::new_bound(py, (self.func)(m)) } } @@ -61,11 +62,11 @@ impl DictionaryAndSurface { } impl MorphemeProjection for DictionaryAndSurface { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py PyString { + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } else { - PyString::new(py, m.dictionary_form()) + PyString::new_bound(py, m.dictionary_form()) } } } @@ -82,11 +83,11 @@ impl NormalizedAndSurface { } impl MorphemeProjection for NormalizedAndSurface { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py 
PyString { + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } else { - PyString::new(py, m.normalized_form()) + PyString::new_bound(py, m.normalized_form()) } } } @@ -103,11 +104,11 @@ impl NormalizedNouns { } impl MorphemeProjection for NormalizedNouns { - fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> &'py PyString { + fn project<'py>(&self, m: &Morpheme>, py: Python<'py>) -> Bound<'py, PyString> { if self.matcher.matches_id(m.part_of_speech_id()) { - PyString::new(py, m.normalized_form()) + PyString::new_bound(py, m.normalized_form()) } else { - PyString::new(py, m.surface().deref()) + PyString::new_bound(py, m.surface().deref()) } } } @@ -165,7 +166,7 @@ pub(crate) fn resolve_projection(base: PyProjector, fallback: &PyProjector) -> P } pub(crate) fn parse_projection( - value: &PyString, + value: &Bound, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { value.to_str().and_then(|s| parse_projection_raw(s, dict)) @@ -188,7 +189,7 @@ pub(crate) fn parse_projection_raw( } pub(crate) fn parse_projection_opt( - value: Option<&PyString>, + value: Option<&Bound>, dict: &D, ) -> PyResult<(PyProjector, SurfaceProjection)> { match value { diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs index 18ec0a63..a5026b9b 100644 --- a/python/src/tokenizer.rs +++ b/python/src/tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,14 +29,18 @@ use crate::dictionary::{extract_mode, PyDicData}; use crate::errors; use crate::morpheme::{PyMorphemeListWrapper, PyProjector}; -/// Unit to split text +/// Unit to split text. 
/// /// A == short mode /// /// B == middle mode /// /// C == long mode -// +/// +/// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. +/// If None, returns SplitMode.C. +/// +/// :type mode: str | None #[pyclass(module = "sudachipy.tokenizer", name = "SplitMode", frozen)] #[derive(Clone, PartialEq, Eq, Copy, Debug)] #[repr(u8)] @@ -68,7 +72,17 @@ impl From for PySplitMode { #[pymethods] impl PySplitMode { + /// Creates a split mode from a string value. + /// + /// :param mode: string representation of the split mode. One of [A,B,C] in captital or lower case. + /// If None, returns SplitMode.C. + /// + /// :type mode: str | None #[new] + #[pyo3( + text_signature="(mode=None) -> SplitMode", + signature=(mode=None) + )] fn new(mode: Option<&str>) -> PyResult { let mode = match mode { Some(m) => m, @@ -78,7 +92,9 @@ impl PySplitMode { } } -/// Sudachi Tokenizer, Python version +/// A sudachi tokenizer +/// +/// Create using Dictionary.create method. #[pyclass(module = "sudachipy.tokenizer", name = "Tokenizer")] pub(crate) struct PyTokenizer { tokenizer: StatefulTokenizer>, @@ -111,35 +127,35 @@ impl PyTokenizer { /// Break text into morphemes. /// - /// SudachiPy 0.5.* had logger parameter, it is accepted, but ignored. - /// - /// :param text: text to analyze + /// :param text: text to analyze. /// :param mode: analysis mode. /// This parameter is deprecated. /// Pass the analysis mode at the Tokenizer creation time and create different tokenizers for different modes. /// If you need multi-level splitting, prefer using :py:meth:`Morpheme.split` method instead. + /// :param logger: Arg for v0.5.* compatibility. Ignored. /// :param out: tokenization results will be written into this MorphemeList, a new one will be created instead. /// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details. 
+ /// /// :type text: str - /// :type mode: sudachipy.SplitMode - /// :type out: sudachipy.MorphemeList + /// :type mode: SplitMode | str | None + /// :type out: MorphemeList #[pyo3( - text_signature = "($self, text: str, mode = None, logger = None, out = None) -> sudachipy.MorphemeList", - signature = (text, mode = None, logger = None, out = None) + text_signature="(self, /, text: str, mode=None, logger=None, out=None) -> MorphemeList", + signature=(text, mode=None, logger=None, out=None) )] #[allow(unused_variables)] fn tokenize<'py>( &'py mut self, py: Python<'py>, text: &'py str, - mode: Option<&PyAny>, + mode: Option<&Bound<'py, PyAny>>, logger: Option, - out: Option<&'py PyCell>, - ) -> PyResult<&'py PyCell> { + out: Option>, + ) -> PyResult> { // restore default mode on scope exit let mode = match mode { None => None, - Some(m) => Some(extract_mode(py, m)?), + Some(m) => Some(extract_mode(m)?), }; let default_mode = mode.map(|m| self.tokenizer.set_mode(m.into())); let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| { @@ -161,7 +177,7 @@ impl PyTokenizer { let morphemes = MorphemeList::empty(dict); let wrapper = PyMorphemeListWrapper::from_components(morphemes, self.projection.clone()); - PyCell::new(py, wrapper)? + Bound::new(py, wrapper)? } Some(list) => list, }; @@ -180,6 +196,7 @@ impl PyTokenizer { Ok(out_list) } + /// SplitMode of the tokenizer. 
#[getter] fn mode(&self) -> PySplitMode { self.tokenizer.mode().into() diff --git a/python/src/word_info.rs b/python/src/word_info.rs index 4f74d0f1..eb51a28d 100644 --- a/python/src/word_info.rs +++ b/python/src/word_info.rs @@ -18,29 +18,18 @@ use pyo3::prelude::*; use sudachi::dic::lexicon::word_infos::{WordInfo, WordInfoData}; -#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo")] +#[pyclass(module = "sudachipy.wordinfo", name = "WordInfo", get_all)] pub struct PyWordInfo { - #[pyo3(get)] surface: String, - #[pyo3(get)] head_word_length: u16, - #[pyo3(get)] pos_id: u16, - #[pyo3(get)] normalized_form: String, - #[pyo3(get)] dictionary_form_word_id: i32, - #[pyo3(get)] dictionary_form: String, - #[pyo3(get)] reading_form: String, - #[pyo3(get)] a_unit_split: Vec, - #[pyo3(get)] b_unit_split: Vec, - #[pyo3(get)] word_structure: Vec, - #[pyo3(get)] synonym_group_ids: Vec, } diff --git a/sudachi-cli/Cargo.toml b/sudachi-cli/Cargo.toml index c5070424..14aeebb5 100644 --- a/sudachi-cli/Cargo.toml +++ b/sudachi-cli/Cargo.toml @@ -14,8 +14,8 @@ license.workspace = true sudachi = { path = "../sudachi" } cfg-if = "1.0.0" # MIT/Apache 2.0 -memmap2 = "0.9" # MIT/Apache 2.0 clap = { version = "4.5", features = ["derive"] } # MIT/Apache 2.0 +memmap2 = "0.9" # MIT/Apache 2.0 [[bin]] name = "sudachi" diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eea11ecf..dbb03444 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -27,6 +27,8 @@ use sudachi::dic::build::report::DictPartReport; use sudachi::dic::build::DictBuilder; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::dic::grammar::Grammar; +use sudachi::dic::header::HeaderVersion; +use sudachi::dic::lexicon::word_infos::WordInfo; use sudachi::dic::lexicon_set::LexiconSet; use sudachi::dic::word_id::WordId; use sudachi::dic::DictionaryLoader; @@ -76,9 +78,17 @@ pub(crate) enum BuildCli { #[command(name = "dump")] Dump { - dict: PathBuf, + /// target dictionary to 
dump + dictionary: PathBuf, + /// dump target (matrix, pos, winfo) part: String, + /// output file output: PathBuf, + + /// reference system dictionary. + /// required to dump winfo of an user dictionary + #[arg(short = 's', long = "system")] + system: Option, }, } @@ -101,7 +111,12 @@ pub fn build_main(subcommand: BuildCli) { match subcommand { BuildCli::System { common, matrix } => build_system(common, matrix), BuildCli::User { common, dictionary } => build_user(common, dictionary), - BuildCli::Dump { dict, part, output } => dump_part(dict, part, output), + BuildCli::Dump { + dictionary, + part, + output, + system, + } => dump_part(dictionary, system, part, output), } } @@ -172,31 +187,36 @@ fn output_file(p: &Path) -> File { OpenOptions::new() .write(true) .create_new(true) - .open(&p) + .open(p) .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } -fn dump_part(dict: PathBuf, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open failed"); - let data = unsafe { Mmap::map(&file) }.expect("mmap failed"); +fn dump_part(dict: PathBuf, system: Option, part: String, output: PathBuf) { + let file = File::open(dict).expect("open dict failed"); + let data = unsafe { Mmap::map(&file) }.expect("mmap dict failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); - let dict = loader.to_loaded().expect("should contain grammar"); let outf = output_file(&output); let mut writer = BufWriter::new(outf); match part.as_str() { - "pos" => dump_pos(dict.grammar(), &mut writer), - "matrix" => dump_matrix(dict.grammar(), &mut writer), - "winfo" => dump_word_info(dict.lexicon(), &mut writer).unwrap(), + "pos" => dump_pos(loader, &mut writer), + "matrix" => dump_matrix(loader, &mut writer), + "winfo" => dump_word_info(loader, system, &mut writer).unwrap(), _ => unimplemented!(), } writer.flush().unwrap(); } -fn dump_pos(grammar: &Grammar, w: &mut W) { - for p in 
grammar.pos_list.iter() { +fn dump_pos(dict: DictionaryLoader, w: &mut W) { + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); + + for (id, p) in grammar.pos_list.iter().enumerate() { + write!(w, "{},", id).unwrap(); for (i, e) in p.iter().enumerate() { w.write_all(e.as_bytes()).unwrap(); if (i + 1) == p.len() { @@ -208,35 +228,86 @@ fn dump_pos(grammar: &Grammar, w: &mut W) { } } -fn dump_matrix(grammar: &Grammar, w: &mut W) { +fn dump_matrix(dict: DictionaryLoader, w: &mut W) { + if let HeaderVersion::UserDict(_) = dict.header.version { + panic!("user dictionary does not have connection matrix.") + } + + let dict = dict + .to_loaded() + .expect("target dict should contain grammar"); + let grammar = dict.grammar(); let conn = grammar.conn_matrix(); - write!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); + writeln!(w, "{} {}", conn.num_left(), conn.num_right()).unwrap(); for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); - write!(w, "{} {} {}\n", left, right, cost).unwrap(); + writeln!(w, "{} {} {}", left, right, cost).unwrap(); } } } -fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { - let size = lex.size(); +fn dump_word_info( + dict: DictionaryLoader, + system: Option, + w: &mut W, +) -> SudachiResult<()> { + let is_user = match dict.header.version { + HeaderVersion::UserDict(_) => true, + HeaderVersion::SystemDict(_) => false, + }; + let did = if is_user { 1 } else { 0 }; + let size = dict.lexicon.size(); + + let data = system.map(|system_path| { + let file = File::open(system_path).expect("open system failed"); + unsafe { Mmap::map(&file) }.expect("mmap system failed") + }); + let system = data.as_ref().map(|data| { + let loader = DictionaryLoader::read_system_dictionary(data) + .expect("failed to load system dictionary"); + loader + .to_loaded() + .expect("failed to load system dictionary") + }); + + let 
(base, user) = if is_user { + ( + system.expect("system dictionary is required to dump user dictionary lexicon"), + Some(dict), + ) + } else { + (dict.to_loaded().expect("failed to load dictionary"), None) + }; + + let mut lex = base.lexicon_set; + let mut grammar = base.grammar; + if let Some(udic) = user { + lex.append(udic.lexicon, grammar.pos_list.len())?; + if let Some(g) = udic.grammar { + grammar.merge(g) + } + } + for i in 0..size { - let wid = WordId::checked(0, i)?; + let wid = WordId::checked(did, i)?; let (left, right, cost) = lex.get_word_param(wid); let winfo = lex.get_word_info(wid)?; + write!(w, "{},", unicode_escape(winfo.surface()))?; write!(w, "{},{},{},", left, right, cost)?; - write!(w, "{},", winfo.surface())?; - write!(w, "{},", winfo.head_word_length())?; - write!(w, "{},", winfo.normalized_form())?; - write!(w, "{},", winfo.dictionary_form_word_id())?; - write!(w, "{},", winfo.reading_form())?; - dump_wids(w, winfo.a_unit_split())?; + write!(w, "{},", unicode_escape(winfo.surface()))?; // writing + write!(w, "{},", pos_string(&grammar, winfo.pos_id()))?; + write!(w, "{},", unicode_escape(winfo.reading_form()))?; + write!(w, "{},", unicode_escape(winfo.normalized_form()))?; + let dict_form = dictionary_form_string(&grammar, &lex, winfo.dictionary_form_word_id()); + write!(w, "{},", dict_form)?; + write!(w, "{},", split_mode(&winfo))?; + dump_wids(w, &grammar, &lex, winfo.a_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.b_unit_split())?; + dump_wids(w, &grammar, &lex, winfo.b_unit_split())?; w.write_all(b",")?; - dump_wids(w, winfo.word_structure())?; + dump_wids(w, &grammar, &lex, winfo.word_structure())?; w.write_all(b",")?; dump_gids(w, winfo.synonym_group_ids())?; w.write_all(b"\n")?; @@ -244,23 +315,76 @@ fn dump_word_info(lex: &LexiconSet, w: &mut W) -> SudachiResult<()> { Ok(()) } -fn dump_wids(w: &mut W, data: &[WordId]) -> SudachiResult<()> { +fn unicode_escape(raw: &str) -> String { + // replace '"' and ',' + 
raw.to_string() + .replace('"', "\\u0022") + .replace(',', "\\u002c") +} + +fn split_mode(winfo: &WordInfo) -> &str { + let asplits = winfo.a_unit_split(); + if asplits.is_empty() { + return "A"; + } + let bsplits = winfo.b_unit_split(); + if bsplits.is_empty() { + return "B"; + } + "C" +} + +fn pos_string(grammar: &Grammar, posid: u16) -> String { + let pos_parts = grammar.pos_components(posid); + pos_parts.join(",") +} + +fn dictionary_form_string(grammar: &Grammar, lex: &LexiconSet, wid: i32) -> String { + if wid < 0 { + return "*".to_string(); + } + let wid_with_dic = WordId::checked(0, wid as u32).expect("invalid wordid"); + format!("\"{}\"", wordref_string(grammar, lex, &wid_with_dic)) +} + +fn wordref_string(grammar: &Grammar, lex: &LexiconSet, wid: &WordId) -> String { + let winfo = lex.get_word_info(*wid).expect("failed to get wordinfo"); + format!( + "{},{},{}", + unicode_escape(winfo.surface()), + pos_string(grammar, winfo.pos_id()), + unicode_escape(winfo.reading_form()), + ) +} + +fn dump_wids( + w: &mut W, + grammar: &Grammar, + lex: &LexiconSet, + data: &[WordId], +) -> SudachiResult<()> { + if data.is_empty() { + write!(w, "*")?; + return Ok(()); + } + w.write_all(b"\"")?; for (i, e) in data.iter().enumerate() { - let prefix = match e.dic() { - 0 => "", - _ => "U", - }; - write!(w, "{}{}", prefix, e.word())?; + write!(w, "{}", wordref_string(grammar, lex, e))?; if i + 1 != data.len() { w.write_all(b"/")?; } } + w.write_all(b"\"")?; Ok(()) } fn dump_gids(w: &mut W, data: &[u32]) -> SudachiResult<()> { + if data.is_empty() { + write!(w, "*")?; + return Ok(()); + } for (i, e) in data.iter().enumerate() { - write!(w, "{}", e)?; + write!(w, "{:06}", e)?; if i + 1 != data.len() { w.write_all(b"/")?; } diff --git a/sudachi/Cargo.toml b/sudachi/Cargo.toml index 76b4cfe4..76e5f72c 100644 --- a/sudachi/Cargo.toml +++ b/sudachi/Cargo.toml @@ -12,15 +12,15 @@ license.workspace = true [dependencies] # this should be sorted aho-corasick = "1" # MIT/Apache 2.0 
-bitflags = "2.0" # MIT/Apache 2.0 -csv = "1.1" # Unilicense/MIT +bitflags = "2.5" # MIT/Apache 2.0 +csv = "1.3" # Unilicense/MIT fancy-regex = "0.13" # MIT -indexmap = "2.0" # MIT/Apache 2.0 -itertools = "0.12" # MIT/Apachie 2.0 +indexmap = "2.2" # MIT/Apache 2.0 +itertools = "0.13" # MIT/Apachie 2.0 lazy_static = "1.4" # MIT/Apache 2.0 libloading = "0.8" # ISC (MIT-compatible) -nom = "7" # MIT memmap2 = "0.9" # MIT/Apache 2.0 +nom = "7" # MIT regex = "1" # MIT/Apache 2.0 serde = { version = "1.0", features = ["derive"] } # MIT/Apache 2.0 serde_json = "1.0" # MIT/Apache 2.0