Skip to content

Commit

Permalink
separated Morpheme::origMorphemeId from Morpheme::lmMorphemeId
Browse files (browse the repository at this point in the history)
  • Loading branch information
bab2min committed Feb 12, 2024
1 parent 233606d commit e2aec74
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 13 deletions.
3 changes: 2 additions & 1 deletion include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ namespace kiwi
* 이형태 형태소의 경우 원형 형태소의 인덱스 값을 가진다.
*/
uint32_t lmMorphemeId = 0;
uint32_t origMorphemeId = 0;

/**
* @brief 형태소의 그룹 인덱스.
Expand Down Expand Up @@ -147,7 +148,7 @@ namespace kiwi
int32_t combined = 0;
FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>> chunks;
float userScore = 0;
uint32_t lmMorphemeId = 0;
uint32_t lmMorphemeId = 0, origMorphemeId = 0;

Morpheme();
~Morpheme();
Expand Down
6 changes: 3 additions & 3 deletions include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,8 @@ namespace kiwi

size_t findMorpheme(U16StringView form, POSTag tag) const;

std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView form, POSTag tag = POSTag::nnp, float score = 0);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, U16StringView origForm);

Expand Down Expand Up @@ -747,7 +747,7 @@ namespace kiwi
std::u16string output = repl(input);
if (input == output) continue;
size_t morphemeId = m->lmMorphemeId ? m->lmMorphemeId : (size_t)(m - morphemes.data());
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId);
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId, 0);
if (added.second)
{
ret.emplace_back(added.first, output);
Expand Down
5 changes: 3 additions & 2 deletions src/Form.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include <cassert>
#include <cassert>
#include <algorithm>
#include <kiwi/Utils.h>
#include <kiwi/Form.h>
Expand Down Expand Up @@ -33,7 +33,7 @@ namespace kiwi
setComplex(_complex);
}

DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, groupId);
DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, /*origMorphemeId,*/ groupId);

Morpheme::Morpheme() = default;

Expand Down Expand Up @@ -117,6 +117,7 @@ namespace kiwi
ret.combined = o.combined;
ret.userScore = o.userScore;
ret.lmMorphemeId = o.lmMorphemeId;
ret.origMorphemeId = o.origMorphemeId;
ret.senseId = o.senseId;
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
for (size_t i = 0; i < o.chunks.size(); ++i)
Expand Down
15 changes: 8 additions & 7 deletions src/KiwiBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ size_t KiwiBuilder::addForm(Vector<FormRaw>& newForms, UnorderedMap<KString, siz
return ret.first->second;
}

pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId)
pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId)
{
if (newForm.empty()) return make_pair((uint32_t)0, false);

Expand All @@ -1064,7 +1064,7 @@ pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView newForm, POSTag tag, flo
for (auto p : f.candidate)
{
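// if `form` already has a candidate with the same `tag` and matching `lmMorphemeId`, do not add a new morpheme: overwrite its `userScore` and return its index with `false`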
if (morphemes[p].tag == tag && morphemes[p].lmMorphemeId == origMorphemeId)
if (morphemes[p].tag == tag && morphemes[p].lmMorphemeId == lmMorphemeId)
{
morphemes[p].userScore = score;
return make_pair((uint32_t)p, false);
Expand All @@ -1078,13 +1078,14 @@ pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView newForm, POSTag tag, flo
auto& newMorph = morphemes.back();
newMorph.kform = &f - &forms[0];
newMorph.userScore = score;
newMorph.lmMorphemeId = origMorphemeId;
newMorph.lmMorphemeId = origMorphemeId ? morphemes[origMorphemeId].lmMorphemeId : lmMorphemeId;
newMorph.origMorphemeId = origMorphemeId;
return make_pair((uint32_t)newMorphId, true);
}

pair<uint32_t, bool> KiwiBuilder::addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId)
pair<uint32_t, bool> KiwiBuilder::addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId)
{
return addWord(nonstd::to_string_view(newForm), tag, score, origMorphemeId);
return addWord(nonstd::to_string_view(newForm), tag, score, origMorphemeId, lmMorphemeId);
}

void KiwiBuilder::addCombinedMorpheme(
Expand Down Expand Up @@ -1381,7 +1382,7 @@ void KiwiBuilder::buildCombinedMorphemes(

pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView form, POSTag tag, float score)
{
return addWord(form, tag, score, getDefaultMorphemeId(tag));
return addWord(form, tag, score, 0, getDefaultMorphemeId(tag));
}

pair<uint32_t, bool> KiwiBuilder::addWord(const u16string& form, POSTag tag, float score)
Expand Down Expand Up @@ -1419,7 +1420,7 @@ pair<uint32_t, bool> KiwiBuilder::addWord(U16StringView newForm, POSTag tag, flo
throw UnknownMorphemeException{ "cannot find the original morpheme " + utf16To8(origForm) + "/" + tagToString(tag) };
}

return addWord(newForm, tag, score, origMorphemeId);
return addWord(newForm, tag, score, origMorphemeId, 0);
}

pair<uint32_t, bool> KiwiBuilder::addWord(const u16string& newForm, POSTag tag, float score, const u16string& origForm)
Expand Down

0 comments on commit e2aec74

Please sign in to comment.