Skip to content

Commit

Permalink
Merge pull request #206 from bab2min/dev/fix_205
Browse files Browse the repository at this point in the history
오타 교정 사용 시 복합 명사가 인식되지 않는 버그 수정
  • Loading branch information
bab2min authored Dec 17, 2024
2 parents 89f4015 + 8eb13c6 commit d00519d
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 12 deletions.
9 changes: 5 additions & 4 deletions include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,17 +228,18 @@ namespace kiwi
uint32_t formId = 0;
float scoreHash = 0;
uint32_t typoId = 0;
uint16_t numSpaces = 0;
CondVowel leftCond = CondVowel::none;

TypoForm() = default;

TypoForm(const std::tuple<uint32_t, float, CondVowel>& p)
: formId{ std::get<0>(p) }, scoreHash{ std::get<1>(p) }, leftCond{ std::get<2>(p) }
TypoForm(const std::tuple<uint32_t, float, uint16_t, CondVowel>& p)
: formId{ std::get<0>(p) }, scoreHash{ std::get<1>(p) }, numSpaces{ std::get<2>(p)}, leftCond{std::get<3>(p)}
{
}

TypoForm(uint32_t _formId, float _score = 0, bool _hash = 0, uint32_t _typoId = 0, CondVowel _leftCond = CondVowel::none)
: formId{ _formId }, scoreHash{ _hash ? -_score : _score }, typoId{ _typoId }, leftCond{ _leftCond }
TypoForm(uint32_t _formId, float _score = 0, bool _hash = 0, uint32_t _typoId = 0, uint16_t _numSpaces = 0, CondVowel _leftCond = CondVowel::none)
: formId{ _formId }, scoreHash{ _hash ? -_score : _score }, typoId{ _typoId }, numSpaces{ _numSpaces }, leftCond{ _leftCond }
{
}

Expand Down
13 changes: 9 additions & 4 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,21 @@ namespace kiwi
uint32_t start = 0;
uint32_t typoId = 0;
uint32_t end = 0; // only used in continual typo tolerant mode
uint32_t numSpaces = 0;

FormCandidate(const Form* _form = nullptr,
float _cost = 0,
uint32_t _start = 0,
uint32_t _typoId = 0,
uint32_t _end = 0,
uint32_t _numSpaces = 0,
uint32_t = 0)
: form{ _form },
cost{ _cost },
start{ _start },
typoId{ _typoId },
end{ _end }
end{ _end },
numSpaces{ _numSpaces }
{}

size_t getStartPos(size_t ) const
Expand All @@ -86,7 +89,7 @@ namespace kiwi

size_t getFormSizeWithTypos(const size_t* typoPtrs) const
{
return typoPtrs[typoId + 1] - typoPtrs[typoId];
return typoPtrs[typoId + 1] - typoPtrs[typoId] + numSpaces;
}

bool operator==(const Form* f) const
Expand All @@ -100,7 +103,7 @@ namespace kiwi
{
const Form* form = nullptr;

FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0)
FormCandidate(const Form* _form = nullptr, float = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0, uint32_t = 0)
: form{ _form }
{}

Expand Down Expand Up @@ -146,8 +149,9 @@ namespace kiwi
uint32_t _start = 0,
uint32_t _typoId = 0,
uint32_t _end = 0,
uint32_t _numSpaces = 0,
uint32_t _lengthenedSize = 0)
: FormCandidate<typoTolerant, continualTypoTolerant, false>{ _form, _cost, _start, _typoId, _end, _lengthenedSize },
: FormCandidate<typoTolerant, continualTypoTolerant, false>{ _form, _cost, _start, _typoId, _end, _numSpaces, _lengthenedSize },
lengthenedSize{ _lengthenedSize }
{}

Expand Down Expand Up @@ -203,6 +207,7 @@ namespace kiwi
startPosition ? startPosition : ((nonSpaces.size() - typoFormSize) * posMultiplier),
tCand->typoId,
endPosition,
tCand->numSpaces,
lengthenedSize);
}
if (tCand[0].hash() != tCand[1].hash()) break;
Expand Down
8 changes: 4 additions & 4 deletions src/KiwiBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2028,7 +2028,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
// 오타 교정이 있는 경우 가능한 모든 오타에 대해 Trie 생성
else
{
using TypoInfo = tuple<uint32_t, float, CondVowel>;
using TypoInfo = tuple<uint32_t, float, uint16_t, CondVowel>;
UnorderedMap<KString, Vector<TypoInfo>> typoGroup;
auto ptypos = typos.prepare();
ret.continualTypoCost = ptypos.getContinualTypoCost();
Expand All @@ -2043,12 +2043,12 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
for (auto t : ptypos._generate(f->form, typoCostThreshold))
{
if (t.leftCond != CondVowel::none && f->vowel != CondVowel::none && t.leftCond != f->vowel) continue;
typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, t.leftCond);
typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, f->numSpaces, t.leftCond);
}
}
else
{
typoGroup[removeSpace(f->form)].emplace_back(f - ret.forms.data(), 0, CondVowel::none);
typoGroup[removeSpace(f->form)].emplace_back(f - ret.forms.data(), 0, f->numSpaces, CondVowel::none);
}
}

Expand Down Expand Up @@ -2107,7 +2107,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
estimatedNodeSize += f->first.size() - commonPrefix;
prevForm = &f->first;
}
ret.typoForms.emplace_back(0, 0, hash);
ret.typoForms.emplace_back(0, 0, 0, hash);
ret.typoPtrs.emplace_back(ret.typoPool.size());
formTrie.reserveMore(estimatedNodeSize);

Expand Down
15 changes: 15 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1657,3 +1657,18 @@ TEST(KiwiCpp, IssueP189)
EXPECT_EQ(res[3].str, u"");
EXPECT_EQ(res[4].str, u"무료");
}

// Regression test for issue #205: a user-added word that contains a space
// must come back as a single token both from a plain build and from a build
// with typo correction enabled.
TEST(KiwiCpp, Issue205)
{
	KiwiBuilder wordBuilder{ MODEL_PATH, 0, BuildOption::default_ };
	wordBuilder.addWord(u"함박 스테이크");

	// Baseline: analyzer built without any typo transformer.
	auto plainKiwi = wordBuilder.build();
	const auto plainAnalysis = plainKiwi.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing);
	const auto& plainTokens = plainAnalysis.first;

	EXPECT_EQ(plainTokens[0].str, u"함박 스테이크");

	// Same builder, this time built with the basic typo set including continual typos.
	auto typoKiwi = wordBuilder.build(DefaultTypoSet::basicTypoSetWithContinual);
	const auto typoAnalysis = typoKiwi.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing);
	const auto& typoTokens = typoAnalysis.first;

	EXPECT_EQ(typoTokens[0].str, u"함박 스테이크");
}

0 comments on commit d00519d

Please sign in to comment.