Skip to content

Commit

Permalink
Merge pull request #201 from bab2min/dev/minor_fix
Browse files: browse the repository at this point in the history
Minor fix
  • Loading branch information
bab2min authored Nov 3, 2024
2 parents d88f0e9 + 5a5cb41 commit f07989e
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 8 deletions.
1 change: 1 addition & 0 deletions include/kiwi/TypoTransformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ namespace kiwi
continualTypoSet,
basicTypoSetWithContinual,
lengtheningTypoSet,
basicTypoSetWithContinualAndLengthening,
};

/**
Expand Down
4 changes: 3 additions & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ enum
KIWI_MATCH_EMAIL = 2,
KIWI_MATCH_HASHTAG = 4,
KIWI_MATCH_MENTION = 8,
KIWI_MATCH_SERIAL = 16,

KIWI_MATCH_NORMALIZE_CODA = 1 << 16,
KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17,
Expand All @@ -139,7 +140,7 @@ enum
KIWI_MATCH_SPLIT_SAISIOT = 1 << 25,
KIWI_MATCH_MERGE_SAISIOT = 1 << 26,

KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA,
};

Expand Down Expand Up @@ -361,6 +362,7 @@ enum
KIWI_TYPO_CONTINUAL_TYPO_SET = 2,
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3,
KIWI_TYPO_LENGTHENING_TYPO_SET = 4,
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5,
};

/**
Expand Down
4 changes: 2 additions & 2 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,11 +1090,11 @@ size_t kiwi::splitByTrie(

if (!!(matchOptions & Match::zCoda) && zCodaFollowable && isHangulCoda(c) && (n + 1 >= str.size() || !isHangulSyllable(str[n + 1])))
{
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
}
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
{
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
}
zCodaFollowable = false;
zSiotFollowable = false;
Expand Down
3 changes: 3 additions & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,7 @@ namespace kiwi
morph.vowel = CondVowel::none;
morph.polar = CondPolarity::none;
morph.complex = 0;
morph.saisiot = 0;
morph.lmMorphemeId = getDefaultMorphemeId(s.tokenization[0].tag);
form.candidate[0] = &morph;
}
Expand All @@ -921,6 +922,7 @@ namespace kiwi
morph.vowel = CondVowel::none;
morph.polar = CondPolarity::none;
morph.complex = 0;
morph.saisiot = 0;
morph.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ s.tokenization.size() };
for (size_t i = 0; i < s.tokenization.size(); ++i)
{
Expand Down Expand Up @@ -949,6 +951,7 @@ namespace kiwi
cmorph.vowel = CondVowel::none;
cmorph.polar = CondPolarity::none;
cmorph.complex = 0;
cmorph.saisiot = 0;
cmorph.tag = t.tag;
cmorph.lmMorphemeId = getDefaultMorphemeId(t.tag);
foundMorph = &cmorph;
Expand Down
29 changes: 24 additions & 5 deletions src/PathEvaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,6 @@ namespace kiwi
for (auto& curMorph : cands)
{
if (splitComplex && curMorph->getCombined()->complex) continue;
if (splitSaisiot && curMorph->getCombined()->saisiot) continue;
if (blocklist && blocklist->count(curMorph->getCombined())) continue;

// Shortcut for an appended final consonant (zCoda)
Expand Down Expand Up @@ -1007,7 +1006,8 @@ namespace kiwi
const Vector<U16StringView>& ownFormList,
float typoCostWeight,
const Morpheme* morphFirst,
size_t langVocabSize)
size_t langVocabSize,
bool splitSaisiot)
{
Vector<const WordLL<LmState>*> steps;
for (auto s = result->parent; s->parent; s = s->parent)
Expand All @@ -1029,13 +1029,32 @@ namespace kiwi
float scoreDiff = cur->accScore - prev->accScore;
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
auto morpheme = cur->morpheme;
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
const size_t numNewTokens = (splitSaisiot && morpheme->saisiot) || !(morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
? morpheme->chunks.size() : 1;
auto& gNode = graph[csearcher(cur)];
scoreDiff += typoCostDiff * typoCostWeight;
scoreDiff /= numNewTokens;
typoCostDiff /= numNewTokens;

if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
if (splitSaisiot && morpheme->saisiot)
{
for (size_t ch = 0; ch < numNewTokens; ++ch)
{
auto& p = morpheme->chunks.getSecond(ch);
ret.emplace_back(
unifyMorpheme(morpheme->chunks[ch]),
KString{},
gNode.startPos + p.first,
gNode.startPos + p.second,
scoreDiff,
typoCostDiff,
typoCostDiff ? gNode.typoFormId : 0,
&gNode - graph
);
}
ret.back().end = gNode.endPos;
}
else if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
{
ret.emplace_back(
unifyMorpheme(morpheme),
Expand Down Expand Up @@ -1274,7 +1293,7 @@ namespace kiwi
{
auto tokens = generateTokenList(
&cand[i], csearcher, graph, ownFormList, kw->typoCostWeight,
kw->morphemes.data(), langVocabSize
kw->morphemes.data(), langVocabSize, splitSaisiot
);
ret.emplace_back(move(tokens), cand[i].accScore, uniqStates[cand[i].rootId], cand[i].spState);
}
Expand Down
4 changes: 4 additions & 0 deletions src/TypoTransformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,8 @@ namespace kiwi

static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.25f);

static const TypoTransformer basicTypoSetWithContinualAndLengthening = basicTypoSetWithContinual | lengtheningTypoSet;

switch (set)
{
case kiwi::DefaultTypoSet::withoutTypo:
Expand All @@ -674,6 +676,8 @@ namespace kiwi
return basicTypoSetWithContinual;
case kiwi::DefaultTypoSet::lengtheningTypoSet:
return lengtheningTypoSet;
case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening:
return basicTypoSetWithContinualAndLengthening;
default:
throw invalid_argument{ "Invalid `DefaultTypoSet`" };
}
Expand Down

0 comments on commit f07989e

Please sign in to comment.