Skip to content

Commit

Permalink
Merge pull request #155 from bab2min/dev_words_with_spaces
Browse files Browse the repository at this point in the history
공백이 포함된 형태소 등록 기능 구현
  • Loading branch information
bab2min authored Feb 12, 2024
2 parents 1e055bc + e2aec74 commit 992bed6
Show file tree
Hide file tree
Showing 15 changed files with 275,052 additions and 71 deletions.
274,599 changes: 274,599 additions & 0 deletions ModelGenerator/multi.dict

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ namespace kiwi
* 이형태 형태소의 경우 원형 형태소의 인덱스 값을 가진다.
*/
uint32_t lmMorphemeId = 0;
uint32_t origMorphemeId = 0;

/**
* @brief 형태소의 그룹 인덱스.
Expand Down Expand Up @@ -147,7 +148,7 @@ namespace kiwi
int32_t combined = 0;
FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>> chunks;
float userScore = 0;
uint32_t lmMorphemeId = 0;
uint32_t lmMorphemeId = 0, origMorphemeId = 0;

Morpheme();
~Morpheme();
Expand Down Expand Up @@ -191,15 +192,16 @@ namespace kiwi
};

/**
* @brief 형태에 관한 모든 정보를 담는 구조체의 템플릿
* @brief 형태에 관한 모든 정보를 담는 구조체
*
* @note 변경 불가능한 상태로 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신
* @note 이 구조체는 변경 불가능한 상태로 사용된다. 인덱스는 모두 포인터로, std::vector는 FixedVector로 변경되어 수정이 불가능한 대신
* 각 값에 효율적으로 빠르게 접근 가능하다. `kiwi::Kiwi` 내 실제 형태소 분석 단계에 쓰인다.
*/
struct Form
{
KString form;
FixedVector<const Morpheme*> candidate;
uint32_t numSpaces = 0;
CondVowel vowel = CondVowel::none;
CondPolarity polar = CondPolarity::none;
uint8_t formHash = 0;
Expand All @@ -212,10 +214,10 @@ namespace kiwi
Form& operator=(const Form&);
Form& operator=(Form&&);

bool operator<(const Form& o) const
{
return form < o.form;
}
// Form을 정렬하는 데에 사용. Form::form에서 공백 문자를 제거한 뒤 사전식으로 정렬.
bool operator<(const Form& o) const;

size_t sizeWithoutSpace() const { return form.size() - numSpaces; }
};

struct TypoForm
Expand Down
6 changes: 3 additions & 3 deletions include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,8 @@ namespace kiwi

size_t findMorpheme(U16StringView form, POSTag tag) const;

std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(const std::u16string& newForm, POSTag tag, float score, size_t origMorphemeId, size_t lmMorphemeId);
std::pair<uint32_t, bool> addWord(U16StringView form, POSTag tag = POSTag::nnp, float score = 0);
std::pair<uint32_t, bool> addWord(U16StringView newForm, POSTag tag, float score, U16StringView origForm);

Expand Down Expand Up @@ -747,7 +747,7 @@ namespace kiwi
std::u16string output = repl(input);
if (input == output) continue;
size_t morphemeId = m->lmMorphemeId ? m->lmMorphemeId : (size_t)(m - morphemes.data());
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId);
auto added = addWord(output, tag, score + (m->lmMorphemeId ? m->userScore : 0), morphemeId, 0);
if (added.second)
{
ret.emplace_back(added.first, output);
Expand Down
29 changes: 23 additions & 6 deletions include/kiwi/Trie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,22 +402,39 @@ namespace kiwi

/**
 * Cache handed to buildWithCaching(): remembers the previously inserted key
 * container together with the node offsets recorded for it, so the next
 * insertion can start from the prefix it shares with the previous key.
 *
 * Primary template: keeps its own copy of the key container.
 *
 * @tparam Cont container type of the cached key
 */
template<class Cont>
struct CacheStore
{
	Cont cont;                  // copy of the last key passed to set()
	std::vector<size_t> ptrs;   // node offsets filled in by buildWithCaching()

	// True when a non-empty key has been cached. An empty cached key is
	// indistinguishable from "nothing cached".
	// Kept implicit (not `explicit`) so existing callers keep compiling.
	operator bool() const { return !cont.empty(); }

	// Read-only access to the cached key.
	const Cont& operator*() const { return cont; }

	// Replaces the cached key with a copy of _cont.
	void set(const Cont& _cont) { cont = _cont; }
};

/**
 * Pointer flavour: refers to the previously inserted key instead of copying it.
 * The referenced container has to stay alive for as long as the cache is used.
 *
 * The pointer is stored as `const Cont*`: set() receives a `const Cont&`, so a
 * plain `Cont*` member could not be assigned from it unless Cont itself was
 * already const-qualified. Access through operator* is read-only either way.
 *
 * @tparam Cont pointee container type (may or may not be const-qualified)
 */
template<class Cont>
struct CacheStore<Cont*>
{
	const Cont* cont = nullptr; // last key passed to set(); nullptr when nothing is cached
	std::vector<size_t> ptrs;   // node offsets filled in by buildWithCaching()

	// True once a key has been cached.
	operator bool() const { return cont != nullptr; }

	// Read-only access to the cached key. Must not be called while nothing is cached.
	const Cont& operator*() const { return *cont; }

	// Remembers the address of _cont; no copy is made.
	void set(const Cont& _cont) { cont = &_cont; }
};

template<class Cont, class Value>
Node* buildWithCaching(Cont& cont, Value&& val, CacheStore<Cont>& cache)
template<class Cont, class Value, class CacheCont>
Node* buildWithCaching(Cont&& cont, Value&& val, CacheStore<CacheCont>& cache)
{
static_assert(std::is_pointer<CacheCont>::value ? std::is_reference<Cont>::value && !std::is_rvalue_reference<Cont>::value : true,
"Cont should reference type if using pointer type CacheStore.");
auto allocNode = [&]() { return newNode(); };
//reserveMore(cont.size());

size_t commonPrefix = 0;
if (cache.cont)
if (!!cache)
{
while (commonPrefix < std::min(cache.cont->size(), cont.size())
&& cont[commonPrefix] == (*cache.cont)[commonPrefix]
while (commonPrefix < std::min((*cache).size(), cont.size())
&& cont[commonPrefix] == (*cache)[commonPrefix]
) ++commonPrefix;
}

Expand All @@ -430,7 +447,7 @@ namespace kiwi
cache.ptrs[i] = node - nodes.data();
}
if (!node->val) node->val = val;
cache.cont = &cont;
cache.set(cont);
return node;
}

Expand Down
4 changes: 3 additions & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,9 @@ namespace kiwi

loadTypoDict = 1 << 2, /**< 오타 사전(typo.dict)의 로딩 여부를 설정한다.*/

default_ = integrateAllomorph | loadDefaultDict | loadTypoDict,
loadMultiDict = 1 << 3, /**< 복합명사 사전(multi.dict)의 로딩 여부를 설정한다. 복합명사 사전은 복합명사의 구성 형태소를 저장하고 있다. */

default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict,
};

struct Morpheme;
Expand Down
8 changes: 8 additions & 0 deletions include/kiwi/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ namespace kiwi
return within(chr, 0x302E, 0x3030);
}

// Groups two static string comparisons that take a designated `space` character
// (U+0020 by default).
// NOTE(review): only the declarations are visible here. Judging by the name and by
// Form::operator<, which sorts forms through less(), both functions presumably compare
// a and b as if every `space` character had been removed — confirm against the definitions.
struct ComparatorIgnoringSpace
{
// Ordering predicate: presumably true when a sorts before b once `space` is ignored.
static bool less(const KString& a, const KString& b, const kchar_t space = u' ');
// Equality predicate: presumably true when a and b match once `space` is ignored.
static bool equal(const KString& a, const KString& b, const kchar_t space = u' ');
};

// Returns a KString derived from str; `space` defaults to U+0020.
// NOTE(review): presumably a copy of str with every `space` character dropped —
// the definition is not visible here, confirm before relying on it.
KString removeSpace(const KString& str, const kchar_t space = u' ');

inline std::ostream& operator <<(std::ostream& os, const KString& str)
{
return os << utf16To8({ str.begin(), str.end() });
Expand Down
3 changes: 2 additions & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ enum
KIWI_BUILD_INTEGRATE_ALLOMORPH = 1,
KIWI_BUILD_LOAD_DEFAULT_DICT = 2,
KIWI_BUILD_LOAD_TYPO_DICT = 4,
KIWI_BUILD_DEFAULT = 7,
KIWI_BUILD_LOAD_MULTI_DICT = 8,
KIWI_BUILD_DEFAULT = 15,
KIWI_BUILD_MODEL_TYPE_KNLM = 0x0000,
KIWI_BUILD_MODEL_TYPE_SBG = 0x0100,
};
Expand Down
12 changes: 10 additions & 2 deletions src/Form.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <cassert>
#include <cassert>
#include <algorithm>
#include <kiwi/Utils.h>
#include <kiwi/Form.h>
#include "serializer.hpp"
Expand Down Expand Up @@ -32,7 +33,7 @@ namespace kiwi
setComplex(_complex);
}

DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, groupId);
DEFINE_SERIALIZER_OUTSIDE(MorphemeRaw, kform, tag, vpPack, senseId, combineSocket, combined, userScore, chunks, chunkPositions, lmMorphemeId, /*origMorphemeId,*/ groupId);

Morpheme::Morpheme() = default;

Expand Down Expand Up @@ -81,9 +82,15 @@ namespace kiwi

Form& Form::operator=(Form&&) = default;

// Sort order for Form objects: delegates to ComparatorIgnoringSpace::less on the
// two surface strings, so the ordering is whatever that comparator defines.
bool Form::operator<(const Form& o) const
{
	const KString& lhs = form;
	const KString& rhs = o.form;
	return ComparatorIgnoringSpace::less(lhs, rhs);
}

Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands)
{
Form ret;
ret.numSpaces = count(o.form.begin(), o.form.end(), u' ');
ret.form = o.form;
ret.candidate = FixedVector<const Morpheme*>{ o.candidate.size() + additionalCands.size()};
for (size_t i = 0; i < o.candidate.size(); ++i)
Expand All @@ -110,6 +117,7 @@ namespace kiwi
ret.combined = o.combined;
ret.userScore = o.userScore;
ret.lmMorphemeId = o.lmMorphemeId;
ret.origMorphemeId = o.origMorphemeId;
ret.senseId = o.senseId;
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
for (size_t i = 0; i < o.chunks.size(); ++i)
Expand Down
61 changes: 49 additions & 12 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,35 @@ namespace kiwi

}

// Computes "number of characters + number of whitespace blocks" from a range of
// non-space character indices.
//
// [first, last) holds the source positions of consecutive non-space characters.
// Each element counts as one character, and every place where two neighbouring
// positions are not adjacent (cur != prev + 1) counts as one more unit, no matter
// how wide the gap is.
//
// Returns 0 for an empty range. The guard is needed because the first element is
// read unconditionally to seed the comparison.
template<class It>
inline size_t countChrWithNormalizedSpace(It first, It last)
{
	if (first == last) return 0;

	size_t n = std::distance(first, last);
	auto prevIdx = *first++;
	for (; first != last; ++first)
	{
		// A jump in the index sequence means at least one space was skipped here.
		if (*first != prevIdx + 1) ++n;
		prevIdx = *first;
	}
	return n;
}

// Counts the places where the input has a whitespace gap but the morpheme form
// has no space at the corresponding position.
//
// form                        : form string, which may itself contain u' '
// [spaceIdxFirst, spaceIdxLast): source positions of the input's non-space characters
//
// The walk starts at the second index. A gap is detected when two neighbouring
// positions differ by more than 1. Each space met in `form` shifts the later
// look-ups one character further into the form.
inline size_t countSpaceErrors(const KString& form, const uint32_t* spaceIdxFirst, const uint32_t* spaceIdxLast)
{
	const size_t numIdx = std::distance(spaceIdxFirst, spaceIdxLast);
	size_t mismatches = 0;
	size_t skippedSpaces = 0;
	for (size_t pos = 1; pos < numIdx; ++pos)
	{
		const bool formHasSpace = form[pos + skippedSpaces] == u' ';
		const bool inputHasGap = spaceIdxFirst[pos] - spaceIdxFirst[pos - 1] > 1;
		if (inputHasGap && !formHasSpace) ++mismatches;
		if (formHasSpace) ++skippedSpaces;
	}
	return mismatches;
}

template<ArchType arch, bool typoTolerant>
size_t kiwi::splitByTrie(
Vector<KGraphNode>& ret,
Expand Down Expand Up @@ -229,10 +258,10 @@ size_t kiwi::splitByTrie(
bool alreadySpecialChrProcessed = false;
for (auto& cand : candidates)
{
size_t nBegin = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].start : (nonSpaces.size() - cand->form.size());
bool longestMatched = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
const size_t nBegin = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].start : (nonSpaces.size() - cand->sizeWithoutSpace());
const bool longestMatched = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
{
return nBegin == g.endPos && lastSpecialEndPos == g.endPos - (g.uform.empty() ? g.form->form.size() : g.uform.size());
return nBegin == g.endPos && lastSpecialEndPos == g.endPos - (g.uform.empty() ? g.form->sizeWithoutSpace() : g.uform.size());
});

// insert unknown form
Expand All @@ -252,7 +281,7 @@ size_t kiwi::splitByTrie(
}
}

size_t newNodeLength = nBegin - lastSpecialEndPos;
const size_t newNodeLength = nBegin - lastSpecialEndPos;
if (maxUnkFormSize && newNodeLength <= maxUnkFormSize)
{
appendNewNode(out, endPosMap, lastSpecialEndPos, str.substr(nonSpaces[lastSpecialEndPos], nonSpaces[nBegin] - nonSpaces[lastSpecialEndPos]), (uint16_t)nBegin);
Expand All @@ -275,13 +304,21 @@ size_t kiwi::splitByTrie(
}
else
{
size_t lengthWithSpaces = nonSpaces.back() + 1 - nonSpaces[nBegin];
if (lengthWithSpaces <= cand->form.size() + spaceTolerance)
// TO DO: 아래의 spaceErrors 계산방식은 오타 교정 모드에서는 부정확한 값을 낼 수 있음. 더 정교한 방식으로 개선 필요
const size_t lengthWithSpaces = countChrWithNormalizedSpace(nonSpaces.begin() + nBegin, nonSpaces.end());
size_t spaceErrors = 0;
if (lengthWithSpaces <= cand->form.size() + spaceTolerance
&& (!cand->numSpaces || (spaceErrors = countSpaceErrors(cand->form, nonSpaces.data() + nBegin, nonSpaces.data() + nonSpaces.size())) <= spaceTolerance))
{
float typoCost = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].cost : 0.f;
if (appendNewNode(out, endPosMap, nBegin, cand, (uint16_t)nonSpaces.size(), typoCost) && typoTolerant)
if (!cand->numSpaces && lengthWithSpaces > cand->form.size()) spaceErrors = lengthWithSpaces - cand->form.size();
const float typoCost = typoTolerant ? candTypoCostStarts[&cand - candidates.data()].cost : 0.f;
if (appendNewNode(out, endPosMap, nBegin, cand, (uint16_t)nonSpaces.size(), typoCost))
{
out.back().typoFormId = candTypoCostStarts[&cand - candidates.data()].typoId;
out.back().spaceErrors = spaceErrors;
if (typoTolerant)
{
out.back().typoFormId = candTypoCostStarts[&cand - candidates.data()].typoId;
}
}
}
}
Expand All @@ -300,7 +337,7 @@ size_t kiwi::splitByTrie(

bool duplicated = any_of(out.begin() + 1, out.end(), [&](const KGraphNode& g)
{
size_t startPos = g.endPos - (g.uform.empty() ? g.form->form.size() : g.uform.size());
size_t startPos = g.endPos - (g.uform.empty() ? g.form->sizeWithoutSpace() : g.uform.size());
return startPos == lastSpecialEndPos && g.endPos == unkFormEndPos;
});
if (unkFormEndPos > lastSpecialEndPos && !duplicated)
Expand Down Expand Up @@ -478,8 +515,8 @@ size_t kiwi::splitByTrie(
}
}

// spaceTolerance > 0이면 공백문자를 무시하고 분할 진행
if (spaceTolerance > 0 && chrType == POSTag::unknown)
// 공백문자를 무시하고 분할 진행
if (chrType == POSTag::unknown)
{
branchOut(nonSpaces.size(), n);
lastSpecialEndPos = nonSpaces.size();
Expand Down
1 change: 1 addition & 0 deletions src/KTrie.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ namespace kiwi
uint32_t startPos = 0, endPos = 0;
float typoCost = 0;
uint32_t typoFormId = 0;
uint32_t spaceErrors = 0;

KGraphNode(const Form* _form = nullptr, uint16_t _endPos = 0, float _typoCost = 0) : form(_form), endPos(_endPos), typoCost(_typoCost) {}
KGraphNode(U16StringView _uform, uint16_t _endPos, float _typoCost = 0) : uform(_uform), endPos(_endPos), typoCost(_typoCost) {}
Expand Down
Loading

0 comments on commit 992bed6

Please sign in to comment.