From 96d317408a5fad342639d7c5b105b8c002a48310 Mon Sep 17 00:00:00 2001 From: Masakazu Nagaya Date: Wed, 14 Mar 2018 13:45:32 +0900 Subject: [PATCH 1/4] Add licenses to pom.xml --- pom.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pom.xml b/pom.xml index 3a131ff..8869082 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,13 @@ kuromoji-linguistics http://maven.apache.org + + + The Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + UTF-8 From 453aa96284746469fdde5f9d5eb365e55a8ba4ab Mon Sep 17 00:00:00 2001 From: Masakazu Nagaya Date: Wed, 14 Mar 2018 13:48:50 +0900 Subject: [PATCH 2/4] Update url in pom.xml --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8869082..947816f 100644 --- a/pom.xml +++ b/pom.xml @@ -14,7 +14,7 @@ container-plugin kuromoji-linguistics - http://maven.apache.org + https://github.com/yahoojapan/vespa-kuromoji-linguistics From ae7e4951192fc62b934a80d14139bb1e0edfb029 Mon Sep 17 00:00:00 2001 From: Masakazu Nagaya Date: Wed, 14 Mar 2018 13:57:53 +0900 Subject: [PATCH 3/4] Remove unused dependency --- pom.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pom.xml b/pom.xml index 947816f..1d08df8 100644 --- a/pom.xml +++ b/pom.xml @@ -62,12 +62,6 @@ ${vespa.version} provided - - com.yahoo.vespa - container-di - ${vespa.version} - provided - com.yahoo.vespa linguistics From 3551cf122dfbdc20b43275b4beb63c202369d2f2 Mon Sep 17 00:00:00 2001 From: Masakazu Nagaya Date: Wed, 14 Mar 2018 13:58:50 +0900 Subject: [PATCH 4/4] Update description --- README.md | 10 +++++----- .../language/lib/kuromoji/KuromojiLinguistics.java | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index ea76a13..7b64e60 100644 --- a/README.md +++ b/README.md @@ -54,11 +54,11 @@ You can configure package by <config name="language.lib.kuromoji.kuromoji"> |parameter|type|default|description| |:--------|:---|:------|:----------| 
|mode|string|search|mode of Kuromoji (normal OR search OR extended)| -|kanji.length_threshold|int|2|TODO| -|kanji.penalty|int|3000|TODO| -|other.length_threshold|int|7|TODO| -|other.penalty|int|1700|TODO| -|nakaguro_split|bool|false|TODO| +|kanji.length_threshold|int|2|length threshold above which kanji tokens are penalized while running the Viterbi search (expert feature).| +|kanji.penalty|int|3000|additional cost for kanji tokens that are longer than the pre-defined length threshold (expert feature).| +|other.length_threshold|int|7|length threshold above which non-kanji tokens are penalized while running the Viterbi search (expert feature).| +|other.penalty|int|1700|additional cost for non-kanji tokens that are longer than the pre-defined length threshold (expert feature).| +|nakaguro_split|bool|false|whether to split unknown words on the middle dot character (U+30FB KATAKANA MIDDLE DOT)| |user_dict|string|-|path of user dictionary| |tokenlist_name|string|default|target specialtokens name| |all_language|bool|false|apply kuromoji tokenizer to all language or only Japanese| diff --git a/src/main/java/jp/co/yahoo/vespa/language/lib/kuromoji/KuromojiLinguistics.java b/src/main/java/jp/co/yahoo/vespa/language/lib/kuromoji/KuromojiLinguistics.java index 5e5080f..eb5a8fe 100644 --- a/src/main/java/jp/co/yahoo/vespa/language/lib/kuromoji/KuromojiLinguistics.java +++ b/src/main/java/jp/co/yahoo/vespa/language/lib/kuromoji/KuromojiLinguistics.java @@ -37,11 +37,11 @@ * * * - * - * - * - * - * + * + * + * + * + * * * *
parameterdefaultdescription
modesearchmode of Kuromoji (normal|search|extended)
kanji.length_threshold2TODO
kanji.penalty3000TODO
other.length_threshold7TODO
other.penalty1700TODO
nakaguro_splitfalseTODO
kanji.length_threshold2length threshold above which kanji tokens are penalized while running the Viterbi search (expert feature).
kanji.penalty3000additional cost for kanji tokens that are longer than the pre-defined length threshold (expert feature).
other.length_threshold7length threshold above which non-kanji tokens are penalized while running the Viterbi search (expert feature).
other.penalty1700additional cost for non-kanji tokens that are longer than the pre-defined length threshold (expert feature).
nakaguro_splitfalsewhether to split unknown words on the middle dot character (U+30FB KATAKANA MIDDLE DOT)
user_dict-path of user dictionary
tokenlist_namedefaulttarget specialtokens name
all_languagefalseapply kuromoji tokenizer to all language