Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SB 태그 오류 수정 #147

Merged
merged 2 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,18 @@ namespace kiwi
q.accScore -= 2;
}
}

// discount for SB in form "[가-하]."
if (curMorphSbType == 5)
{
q.accScore -= 5;
}

if (curMorphSbType && isEClass(q.lastMorpheme->tag) && q.lastMorpheme->tag != POSTag::ef)
{
q.accScore -= 10;
}

if (curMorphSbType && q.spState.bulletHash == hashSbTypeOrder(curMorphSbType, curMorphSbOrder))
{
q.accScore += 3;
Expand Down Expand Up @@ -1245,7 +1257,7 @@ namespace kiwi
{
lastNgram[j - 1] = it->morpheme - kw->morphemes.data();
}
lastNgram[3] |= (uint8_t)c.spState;
lastNgram[3] ^= (uint8_t)c.spState;
auto insertResult = bestPathes.emplace(lastNgram, make_pair(&c, c.accScore));
if (!insertResult.second)
{
Expand Down Expand Up @@ -1848,6 +1860,11 @@ namespace kiwi
fillSentLineInfo(r.first, newlines);
}

sort(ret.begin(), ret.end(), [](const TokenResult& a, const TokenResult& b)
{
return a.second > b.second;
});

if (ret.empty()) ret.emplace_back();
return ret;
}
Expand Down
3 changes: 2 additions & 1 deletion src/TagUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ namespace kiwi

TagSequenceScorer::TagSequenceScorer(float _weight) : weight{ _weight }
{
for (auto t : { POSTag::nnp, POSTag::np, POSTag::ic, POSTag::sb})
for (auto t : { POSTag::nnp, POSTag::np, POSTag::ic })
{
leftBoundaryScores[0][(size_t)t] = -1;
}
leftBoundaryScores[0][(size_t)POSTag::sb] = -3;

for (size_t r = 0; r < (size_t)POSTag::max; ++r)
{
Expand Down
24 changes: 24 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,30 @@ TEST(KiwiCpp, SentenceBoundaryWithOrderedBullet)
}
}


// Regression test: for each sample sentence below, the first analysis that
// Kiwi returns must not contain any token tagged POSTag::sb.
TEST(KiwiCpp, FalsePositiveSB)
{
	Kiwi& kiwi = reuseKiwiInstance();

	// Inputs whose analysis is expected to contain no POSTag::sb token.
	const char16_t* const samples[] = {
		u"하다. 가운데 비닐을 요렇게 벗겨주고요!",
		u"자, 이것이 `열쇠`다.`` 암상인 앞의 캡슐이 열리며 그곳에서 새로운 파워업 아이템 `쿠나이`를 얻을 수 있다.",
		u"기계는 명령만 듣는다.라는 생각이 이제 사람들에게 완전히 정착이 되었다는 상황인데, 그럴싸하죠",
		u"후반 빨간 모아이들의 공격은 엄청나게 거세다.상하로 점프하며 이온링을 발사하는 중보스 모아이상들.",
		u"또 전화세, 전기세, 보험료등 월 정기지출도 지출통장으로 바꾼 다. 셋째, 물건을 살땐 무조건 카드로 긁는다.",
		u"에티하드항공이 최고의 이코노미 클래스 상을 두 번째로 받은 해는 2020년이다. 이전에는 2012년과 2013년에 최고의 이코노미 클래스 상을 수상한 적이 있어요.",
	};

	// Predicate selecting tokens that carry the SB tag.
	const auto isSbToken = [](const TokenInfo& token)
	{
		return token.tag == POSTag::sb;
	};

	for (const char16_t* text : samples)
	{
		// NOTE(review): the second argument (10) is presumably the number of
		// candidate analyses requested — confirm against Kiwi::analyze.
		// Only the first returned analysis is inspected.
		const auto analyses = kiwi.analyze(text, 10, Match::allWithNormalizing);
		const auto& firstTokens = analyses[0].first;

		const auto sbCount = std::count_if(firstTokens.begin(), firstTokens.end(), isSbToken);
		EXPECT_EQ(sbCount, 0);
	}
}

TEST(KiwiCpp, SplitByPolarity)
{
Kiwi& kiwi = reuseKiwiInstance();
Expand Down
Loading