Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

공백이 포함된 형태소 등록 기능 구현 #155

Merged
merged 13 commits into from
Feb 12, 2024
274,599 changes: 274,599 additions & 0 deletions ModelGenerator/multi.dict

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ namespace kiwi
* 이형태 형태소의 경우 원형 형태소의 인덱스 값을 가진다.
*/
uint32_t lmMorphemeId = 0;
uint32_t origMorphemeId = 0;

/**
* @brief 형태소의 그룹 인덱스.
Expand Down Expand Up @@ -147,7 +148,7 @@ namespace kiwi
int32_t combined = 0;
FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>> chunks;
float userScore = 0;
uint32_t lmMorphemeId = 0;
uint32_t lmMorphemeId = 0, origMorphemeId = 0;

Morpheme();
~Morpheme();
Expand Down Expand Up @@ -191,15 +192,16 @@ namespace kiwi
};

/**
* @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿
* @brief 형태에 관한 모든 정보를 담는 구조체
*
* @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신
* @note 이 구조체는 변경 불가능한 상태로 사용된다. 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신
* 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다.
*/
struct Form
{
KString form;
FixedVector<const Morpheme*> candidate;
uint32_t numSpaces = 0;
CondVowel vowel = CondVowel::none;
CondPolarity polar = CondPolarity::none;
uint8_t formHash = 0;
Expand All @@ -212,10 +214,10 @@ namespace kiwi
Form& operator=(const Form&);
Form& operator=(Form&&);

bool operator<(const Form& o) const
{
return form < o.form;
}
// Form을 정렬하는 데에 사용. Form::form에서 공백 문자를 제거한 뒤 사전식으로 정렬.
bool operator<(const Form& o) const;

size_t sizeWithoutSpace() const { return form.size() - numSpaces; }
};

struct TypoForm
Expand Down
6 changes: 3 additions & 3 deletions include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,8 @@ namespace kiwi

size_t findMorpheme(U16StringView form, POSTag tag) const;

std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView form, POSTag tag = POSTag::nnp, float score = 0);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, U16StringView origForm);

Expand Down Expand Up @@ -747,7 +747,7 @@ namespace kiwi
std::u16string output = repl(input);
if (input == output) continue;
size_t morphemeId = m->lmMorphemeId ? m->lmMorphemeId : (size_t)(m - morphemes.data());
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId);
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId, 0);
if (added.second)
{
ret.emplace_back(added.first, output);
Expand Down
29 changes: 23 additions & 6 deletions include/kiwi/Trie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,22 +402,39 @@ namespace kiwi

/**
 * Cache slot handed to buildWithCaching.
 *
 * Remembers the container that was inserted last (`cont`) together with a list of
 * node offsets (`ptrs`) recorded while that container was inserted.
 * This primary template stores its own copy of the container.
 */
template<class Cont>
struct CacheStore
{
    Cont cont;
    std::vector<size_t> ptrs;

    /** True once a non-empty container has been stored. */
    operator bool() const
    {
        const bool isEmpty = cont.empty();
        return !isEmpty;
    }

    /** Read-only access to the stored container. */
    const Cont& operator*() const
    {
        return cont;
    }

    /** Stores a copy of `source` as the cached container. */
    void set(const Cont& source)
    {
        cont = source;
    }
};

/**
 * Pointer flavour of CacheStore: instead of copying, it only remembers the address
 * of the container passed to set(). The referenced container therefore has to stay
 * alive for as long as the cache is read.
 */
template<class Cont>
struct CacheStore<Cont*>
{
    Cont* cont = nullptr;
    std::vector<size_t> ptrs;

    /** True once a container address has been stored. */
    operator bool() const
    {
        return cont != nullptr;
    }

    /** Read-only access to the referenced container; only valid after set(). */
    const Cont& operator*() const
    {
        return *cont;
    }

    /** Remembers the address of `source` (no copy is made). */
    void set(const Cont& source)
    {
        cont = &source;
    }
};

template<class Cont, class Value>
Node* buildWithCaching(Cont& cont, Value&& val, CacheStore<Cont>& cache)
template<class Cont, class Value, class CacheCont>
Node* buildWithCaching(Cont&& cont, Value&& val, CacheStore<CacheCont>& cache)
{
static_assert(std::is_pointer<CacheCont>::value ? std::is_reference<Cont>::value && !std::is_rvalue_reference<Cont>::value : true,
"Cont should reference type if using pointer type CacheStore.");
auto allocNode = [&]() { return newNode(); };
//reserveMore(cont.size());

size_t commonPrefix = 0;
if (cache.cont)
if (!!cache)
{
while (commonPrefix < std::min(cache.cont->size(), cont.size())
&& cont[commonPrefix] == (*cache.cont)[commonPrefix]
while (commonPrefix < std::min((*cache).size(), cont.size())
&& cont[commonPrefix] == (*cache)[commonPrefix]
) ++commonPrefix;
}

Expand All @@ -430,7 +447,7 @@ namespace kiwi
cache.ptrs[i] = node - nodes.data();
}
if (!node->val) node->val = val;
cache.cont = &cont;
cache.set(cont);
return node;
}

Expand Down
4 changes: 3 additions & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,9 @@ namespace kiwi

loadTypoDict = 1 << 2, /**< 오타 사전(typo.dict)의 로딩 여부를 설정한다.*/

default_ = integrateAllomorph | loadDefaultDict | loadTypoDict,
loadMultiDict = 1 << 3, /**< 복합명사 사전(multi.dict)의 로딩 여부를 설정한다. 복합명사 사전은 복합명사의 구성 형태소를 저장하고 있다. */

default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict,
};

struct Morpheme;
Expand Down
8 changes: 8 additions & 0 deletions include/kiwi/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ namespace kiwi
return within(chr, 0x302E, 0x3030);
}

// Comparison helpers for KString values that take a `space` character (u' ' by default)
// to be disregarded during comparison.
// Form::operator< delegates to less() so that forms are ordered as if the space
// characters had been removed.
// NOTE(review): the definitions are not visible in this header; the exact handling of
// `space` should be confirmed against the implementation.
struct ComparatorIgnoringSpace
{
// Ordering predicate: presumably true when `a` sorts before `b` with `space` ignored.
static bool less(const KString& a, const KString& b, const kchar_t space = u' ');
// Equality predicate: presumably true when `a` and `b` match with `space` ignored.
static bool equal(const KString& a, const KString& b, const kchar_t space = u' ');
};

// NOTE(review): presumably returns a copy of `str` with every occurrence of `space`
// (u' ' by default) removed; the definition is not visible here - confirm.
KString removeSpace(const KString& str, const kchar_t space = u' ');

inline std::ostream& operator <<(std::ostream& os, const KString& str)
{
return os << utf16To8({ str.begin(), str.end() });
Expand Down
3 changes: 2 additions & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ enum
KIWI_BUILD_INTEGRATE_ALLOMORPH = 1,
KIWI_BUILD_LOAD_DEFAULT_DICT = 2,
KIWI_BUILD_LOAD_TYPO_DICT = 4,
KIWI_BUILD_DEFAULT = 7,
KIWI_BUILD_LOAD_MULTI_DICT = 8,
KIWI_BUILD_DEFAULT = 15,
KIWI_BUILD_MODEL_TYPE_KNLM = 0x0000,
KIWI_BUILD_MODEL_TYPE_SBG = 0x0100,
};
Expand Down
12 changes: 10 additions & 2 deletions src/Form.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <cassert>
#include <cassert>
#include <algorithm>
#include <kiwi/Utils.h>
#include <kiwi/Form.h>
#include "serializer.hpp"
Expand Down Expand Up @@ -32,7 +33,7 @@ namespace kiwi
setComplex(_complex);
}

DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, groupId);
DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, /*origMorphemeId,*/ groupId);

Morpheme::Morpheme() = default;

Expand Down Expand Up @@ -81,9 +82,15 @@ namespace kiwi

Form& Form::operator=(Form&&) = default;

// Strict ordering between two forms, delegated to ComparatorIgnoringSpace::less
// applied to the `form` strings of both operands.
bool Form::operator<(const Form& o) const
{
    const KString& lhs = form;
    const KString& rhs = o.form;
    return ComparatorIgnoringSpace::less(lhs, rhs);
}

Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands)
{
Form ret;
ret.numSpaces = count(o.form.begin(), o.form.end(), u' ');
ret.form = o.form;
ret.candidate = FixedVector<const Morpheme*>{ o.candidate.size() + additionalCands.size()};
for (size_t i = 0; i < o.candidate.size(); ++i)
Expand All @@ -110,6 +117,7 @@ namespace kiwi
ret.combined = o.combined;
ret.userScore = o.userScore;
ret.lmMorphemeId = o.lmMorphemeId;
ret.origMorphemeId = o.origMorphemeId;
ret.senseId = o.senseId;
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
for (size_t i = 0; i < o.chunks.size(); ++i)
Expand Down
61 changes: 49 additions & 12 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,35 @@ namespace kiwi

}

// Computes "number of characters + number of whitespace blocks" from a run of
// non-space character indices (the `nonSpaces` index data).
//
// [first, last) is a range of indices. Every element counts as one character, and
// every place where an index is not exactly (previous index + 1) counts as one extra
// unit, so a gap adds 1 regardless of how wide it is.
//
// Returns 0 for an empty range; the range is never dereferenced in that case.
template<class It>
inline size_t countChrWithNormalizedSpace(It first, It last)
{
    // Guard: reading *first on an empty range would be undefined behaviour.
    if (first == last) return 0;

    size_t n = std::distance(first, last);
    auto prevIdx = *first;
    for (++first; first != last; ++first)
    {
        // A jump in the index sequence means something was skipped between the two characters.
        if (*first != prevIdx + 1) ++n;
        prevIdx = *first;
    }
    return n;
}

// 공백 문자의 위치가 형태소의 공백 위치와 불일치하는 개수를 센다.
inline size_t countSpaceErrors(const KString& form, const uint32_t* spaceIdxFirst, const uint32_t* spaceIdxLast)
{
size_t n = 0;
size_t spaceOffset = 0;
const size_t size = std::distance(spaceIdxFirst, spaceIdxLast);
for (size_t i = 1; i < size; ++i)
{
const bool hasSpace = spaceIdxFirst[i] - spaceIdxFirst[i - 1] > 1;
if (hasSpace && form[i + spaceOffset] != u' ') ++n;
spaceOffset += form[i + spaceOffset] == u' ' ? 1 : 0;
}
return n;
}

template<ArchType arch, bool typoTolerant>
size_t kiwi::splitByTrie(
Vector<KGraphNode>& ret,
Expand Down Expand Up @@ -229,10 +258,10 @@ size_t kiwi::splitByTrie(
bool alreadySpecialChrProcessed = false;
for (auto& cand : candidates)
{
size_t nBegin = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].start : (nonSpaces.size() - cand->form.size());
bool longestMatched = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
const size_t nBegin = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].start : (nonSpaces.size() - cand->sizeWithoutSpace());
const bool longestMatched = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
{
return nBegin == g.endPos && lastSpecialEndPos == g.endPos - (g.uform.empty() ? g.form->form.size() : g.uform.size());
return nBegin == g.endPos && lastSpecialEndPos == g.endPos - (g.uform.empty() ? g.form->sizeWithoutSpace() : g.uform.size());
});

// insert unknown form
Expand All @@ -252,7 +281,7 @@ size_t kiwi::splitByTrie(
}
}

size_t newNodeLength = nBegin - lastSpecialEndPos;
const size_t newNodeLength = nBegin - lastSpecialEndPos;
if (maxUnkFormSize && newNodeLength <= maxUnkFormSize)
{
appendNewNode(out, endPosMap, lastSpecialEndPos, str.substr(nonSpaces[lastSpecialEndPos], nonSpaces[nBegin] - nonSpaces[lastSpecialEndPos]), (uint16_t)nBegin);
Expand All @@ -275,13 +304,21 @@ size_t kiwi::splitByTrie(
}
else
{
size_t lengthWithSpaces = nonSpaces.back() + 1 - nonSpaces[nBegin];
if (lengthWithSpaces <= cand->form.size() + spaceTolerance)
// TO DO: 아래의 spaceErrors 계산방식은 오타 교정 모드에서는 부정확한 값을 낼 수 있음. 더 정교한 방식으로 개선 필요
const size_t lengthWithSpaces = countChrWithNormalizedSpace(nonSpaces.begin() + nBegin, nonSpaces.end());
size_t spaceErrors = 0;
if (lengthWithSpaces <= cand->form.size() + spaceTolerance
&& (!cand->numSpaces || (spaceErrors = countSpaceErrors(cand->form, nonSpaces.data() + nBegin, nonSpaces.data() + nonSpaces.size())) <= spaceTolerance))
{
float typoCost = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].cost : 0.f;
if (appendNewNode(out, endPosMap, nBegin, cand, (uint16_t)nonSpaces.size(), typoCost) && typoTolerant)
if (!cand->numSpaces && lengthWithSpaces > cand->form.size()) spaceErrors = lengthWithSpaces - cand->form.size();
const float typoCost = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].cost : 0.f;
if (appendNewNode(out, endPosMap, nBegin, cand, (uint16_t)nonSpaces.size(), typoCost))
{
out.back().typoFormId = candTypoCostStarts[&cand - candidates.data()].typoId;
out.back().spaceErrors = spaceErrors;
if (typoTolerant)
{
out.back().typoFormId = candTypoCostStarts[&cand - candidates.data()].typoId;
}
}
}
}
Expand All @@ -300,7 +337,7 @@ size_t kiwi::splitByTrie(

bool duplicated = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
{
size_t startPos = g.endPos - (g.uform.empty() ? g.form->form.size() : g.uform.size());
size_t startPos = g.endPos - (g.uform.empty() ? g.form->sizeWithoutSpace() : g.uform.size());
return startPos == lastSpecialEndPos && g.endPos == unkFormEndPos;
});
if (unkFormEndPos > lastSpecialEndPos && !duplicated)
Expand Down Expand Up @@ -478,8 +515,8 @@ size_t kiwi::splitByTrie(
}
}

// spaceTolerance > 0이면 공백문자를 무시하고 분할 진행
if (spaceTolerance > 0 && chrType == POSTag::unknown)
// 공백문자를 무시하고 분할 진행
if (chrType == POSTag::unknown)
{
branchOut(nonSpaces.size(), n);
lastSpecialEndPos = nonSpaces.size();
Expand Down
1 change: 1 addition & 0 deletions src/KTrie.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ namespace kiwi
uint32_t startPos = 0, endPos = 0;
float typoCost = 0;
uint32_t typoFormId = 0;
uint32_t spaceErrors = 0;

KGraphNode(const Form* _form = nullptr, uint16_t _endPos = 0, float _typoCost = 0) : form(_form), endPos(_endPos), typoCost(_typoCost) {}
KGraphNode(U16StringView _uform, uint16_t _endPos, float _typoCost = 0) : uform(_uform), endPos(_endPos), typoCost(_typoCost) {}
Expand Down
Loading
Loading