From ce2a1efa5255adc3e9dafde0ff24d529d21620d7 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 3 Nov 2024 02:19:37 +0900 Subject: [PATCH 1/4] Improve saisiot accuracy --- src/PathEvaluator.hpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index a912ca0f..d89ffe36 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -872,7 +872,6 @@ namespace kiwi for (auto& curMorph : cands) { if (splitComplex && curMorph->getCombined()->complex) continue; - if (splitSaisiot && curMorph->getCombined()->saisiot) continue; if (blocklist && blocklist->count(curMorph->getCombined())) continue; // 덧붙은 받침(zCoda)을 위한 지름길 @@ -1007,7 +1006,8 @@ namespace kiwi const Vector& ownFormList, float typoCostWeight, const Morpheme* morphFirst, - size_t langVocabSize) + size_t langVocabSize, + bool splitSaisiot) { Vector*> steps; for (auto s = result->parent; s->parent; s = s->parent) @@ -1029,13 +1029,32 @@ namespace kiwi float scoreDiff = cur->accScore - prev->accScore; float typoCostDiff = cur->accTypoCost - prev->accTypoCost; auto morpheme = cur->morpheme; - const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size(); + const size_t numNewTokens = (splitSaisiot && morpheme->saisiot) || !(morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) + ? morpheme->chunks.size() : 1; auto& gNode = graph[csearcher(cur)]; scoreDiff += typoCostDiff * typoCostWeight; scoreDiff /= numNewTokens; typoCostDiff /= numNewTokens; - if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) + if (splitSaisiot && morpheme->saisiot) + { + for (size_t ch = 0; ch < numNewTokens; ++ch) + { + auto& p = morpheme->chunks.getSecond(ch); + ret.emplace_back( + unifyMorpheme(morpheme->chunks[ch]), + KString{}, + gNode.startPos + p.first, + gNode.startPos + p.second, + scoreDiff, + typoCostDiff, + typoCostDiff ? gNode.typoFormId : 0, + &gNode - graph + ); + } + ret.back().end = gNode.endPos; + } + else if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) { ret.emplace_back( unifyMorpheme(morpheme), @@ -1274,7 +1293,7 @@ namespace kiwi { auto tokens = generateTokenList( &cand[i], csearcher, graph, ownFormList, kw->typoCostWeight, - kw->morphemes.data(), langVocabSize + kw->morphemes.data(), langVocabSize, splitSaisiot ); ret.emplace_back(move(tokens), cand[i].accScore, uniqStates[cand[i].rootId], cand[i].spState); } From da237f468f61897fbf0b439bbbea13abae145ce3 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 3 Nov 2024 02:20:10 +0900 Subject: [PATCH 2/4] Fix #200 --- src/KTrie.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KTrie.cpp b/src/KTrie.cpp index 3f93f3a9..99bd7762 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -1090,11 +1090,11 @@ size_t kiwi::splitByTrie( if (!!(matchOptions & Match::zCoda) && zCodaFollowable && isHangulCoda(c) && (n + 1 >= str.size() || !isHangulSyllable(str[n + 1]))) { - candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1); + candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier); } else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1])) { - candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1); + candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier); } zCodaFollowable = false; zSiotFollowable = false; From bcae95c99d4f598d007cb71ed8bba3f65c218d96 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 3 Nov 2024 02:20:55 +0900 Subject: [PATCH 3/4] Fix segfault with pretokenized spans on gcc linux --- src/Kiwi.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 61f7fb16..7543038d 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -905,6 +905,7 @@ namespace kiwi morph.vowel = CondVowel::none; morph.polar = CondPolarity::none; morph.complex = 0; + morph.saisiot = 0; morph.lmMorphemeId = getDefaultMorphemeId(s.tokenization[0].tag); form.candidate[0] = &morph; } @@ -921,6 +922,7 @@ namespace kiwi morph.vowel = CondVowel::none; morph.polar = CondPolarity::none; morph.complex = 0; + morph.saisiot = 0; morph.chunks = FixedPairVector>{ s.tokenization.size() }; for (size_t i = 0; i < s.tokenization.size(); ++i) { @@ -949,6 +951,7 @@ namespace kiwi cmorph.vowel = CondVowel::none; cmorph.polar = CondPolarity::none; cmorph.complex = 0; + cmorph.saisiot = 0; cmorph.tag = t.tag; cmorph.lmMorphemeId = getDefaultMorphemeId(t.tag); foundMorph = &cmorph; From 5a5cb41d2fd94bfe3bbfed95e6145388bca25a80 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 3 Nov 2024 02:22:03 +0900 Subject: [PATCH 4/4] Add missing constants for C API --- include/kiwi/TypoTransformer.h | 1 + include/kiwi/capi.h | 4 +++- src/TypoTransformer.cpp | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/kiwi/TypoTransformer.h b/include/kiwi/TypoTransformer.h index 6c216a10..dc599c18 100644 --- a/include/kiwi/TypoTransformer.h +++ b/include/kiwi/TypoTransformer.h @@ -393,6 +393,7 @@ namespace kiwi continualTypoSet, basicTypoSetWithContinual, lengtheningTypoSet, + basicTypoSetWithContinualAndLengthening, }; /** diff --git a/include/kiwi/capi.h b/include/kiwi/capi.h index b1f53b28..5bfa6e11 100644 --- a/include/kiwi/capi.h +++ b/include/kiwi/capi.h @@ -124,6 +124,7 @@ enum KIWI_MATCH_EMAIL = 2, KIWI_MATCH_HASHTAG = 4, KIWI_MATCH_MENTION = 8, + KIWI_MATCH_SERIAL = 16, KIWI_MATCH_NORMALIZE_CODA = 1 << 16, KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17, @@ -139,7 +140,7 @@ enum KIWI_MATCH_SPLIT_SAISIOT = 1 << 25, KIWI_MATCH_MERGE_SAISIOT = 1 << 26, - KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_Z_CODA, + KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA, KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA, }; @@ -361,6 +362,7 @@ enum KIWI_TYPO_CONTINUAL_TYPO_SET = 2, KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3, KIWI_TYPO_LENGTHENING_TYPO_SET = 4, + KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5, }; /** diff --git a/src/TypoTransformer.cpp b/src/TypoTransformer.cpp index a6386c66..863328e7 100644 --- a/src/TypoTransformer.cpp +++ b/src/TypoTransformer.cpp @@ -662,6 +662,8 @@ namespace kiwi static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.25f); + static const TypoTransformer basicTypoSetWithContinualAndLengthening = basicTypoSetWithContinual | lengtheningTypoSet; + switch (set) { case kiwi::DefaultTypoSet::withoutTypo: @@ -674,6 +676,8 @@ namespace kiwi return basicTypoSetWithContinual; case kiwi::DefaultTypoSet::lengtheningTypoSet: return lengtheningTypoSet; + case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening: + return basicTypoSetWithContinualAndLengthening; default: throw invalid_argument{ "Invalid `DefaultTypoSet`" }; }