Skip to content

Commit

Permalink
Merge pull request #183 from bab2min/dev/compatible_jamo
Browse files Browse the repository at this point in the history
출력을 호환자모로 통일하는 옵션 추가
  • Loading branch information
bab2min authored Sep 8, 2024
2 parents f6a714f + 4b31b54 commit 186bb57
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 17 deletions.
3 changes: 2 additions & 1 deletion include/kiwi/PatternMatcher.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#pragma once
#pragma once

#include <vector>
#include <string>
Expand All @@ -24,6 +24,7 @@ namespace kiwi
joinAdvSuffix = 1 << 21, /**< 부사파생접미사(XSM)를 분리하지 않고 합쳐서 매칭한다 */
splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | emoji | zCoda,
Expand Down
24 changes: 23 additions & 1 deletion include/kiwi/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,29 @@ namespace kiwi
return within(chr, 0xAC00, 0xD7A4);
}

/**
 * @brief Tests whether a UTF-16 code unit falls in the Hangul conjoining
 *        initial-consonant (onset) range.
 * @param chr code unit to classify
 * @return the result of within() for the range that starts at U+1100 and
 *         spans 19 code points
 * @note The upper bound is presumably exclusive (so U+1100..U+1112 match);
 *       that depends on within(), which is defined elsewhere - confirm.
 */
inline bool isHangulOnset(char16_t chr)
{
	constexpr int firstOnset = 0x1100;
	constexpr int onsetCount = 19;
	return within(chr, firstOnset, firstOnset + onsetCount);
}

/**
 * @brief Tests whether a UTF-16 code unit falls in the Hangul conjoining
 *        final-consonant (coda) range.
 * @param chr code unit to classify
 * @return the result of within() for the range that starts at U+11A8 and
 *         spans 27 code points
 * @note The upper bound is presumably exclusive (so U+11A8..U+11C2 match);
 *       that depends on within(), which is defined elsewhere - confirm.
 */
inline bool isHangulCoda(char16_t chr)
{
	// A second, unreachable return with the same bound (0x11A7 + 28 == 0x11A8 + 27)
	// used to follow this one; only a single statement is kept.
	return within(chr, 0x11A8, 0x11A8 + 27);
}

/**
 * @brief Tests whether a UTF-16 code unit falls in the Hangul compatibility
 *        jamo vowel range.
 * @param chr code unit to classify
 * @return the result of within() for the bounds 0x314F and 0x3164
 * @note The upper bound is presumably exclusive (so U+314F..U+3163 match);
 *       that depends on within(), which is defined elsewhere - confirm.
 */
inline bool isHangulVowel(char16_t chr)
{
	constexpr int vowelBegin = 0x314F;
	constexpr int vowelEnd = 0x3164;
	return within(chr, vowelBegin, vowelEnd);
}

/**
 * @brief Composes the precomposed Hangul syllable (without a final consonant)
 *        for the given onset and vowel indices.
 * @param onset zero-based index of the initial consonant
 * @param vowel zero-based index of the vowel
 * @return 0xAC00 plus (onset * 21 + vowel) * 28, truncated to char16_t
 */
inline char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
	// Each onset owns 21 vowel slots and each (onset, vowel) pair owns 28 coda slots.
	const size_t onsetVowelIndex = onset * 21 + vowel;
	const char16_t syllableOffset = static_cast<char16_t>(onsetVowelIndex * 28);
	return 0xAC00 + syllableOffset;
}

/**
 * @brief Computes the vowel index encoded in a precomposed Hangul syllable.
 * @param chr code unit to decompose; no range check is performed here
 * @return ((chr - 0xAC00) / 28) % 21, evaluated in int arithmetic
 */
inline int extractVowel(char16_t chr)
{
	const int syllableOffset = chr - 0xAC00;
	// Drop the coda component (28 slots per onset/vowel pair) ...
	const int onsetVowelIndex = syllableOffset / 28;
	// ... then drop the onset component (21 vowels per onset).
	return onsetVowelIndex % 21;
}

inline bool isOldHangulOnset(char16_t chr)
Expand Down Expand Up @@ -88,6 +108,8 @@ namespace kiwi
return within(chr, 0x3131, 0x314E) || within(chr, 0x3165, 0x3186);
}

/**
 * @brief Converts a Hangul conjoining consonant to a Hangul compatibility jamo.
 * @param chr code unit to convert
 * @return the compatibility jamo for `chr` when isHangulOnset() or
 *         isHangulCoda() accepts it; otherwise `chr` unchanged
 */
char16_t toCompatibleHangulConsonant(char16_t chr);

struct ComparatorIgnoringSpace
{
static bool less(const KString& a, const KString& b, const kchar_t space = u' ');
Expand Down
14 changes: 14 additions & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,14 @@ namespace kiwi
}
}

/**
 * @brief Rewrites every code unit of a string, in place, through
 *        toCompatibleHangulConsonant().
 * @param str string to modify; its length is not changed
 */
inline void toCompatibleJamo(u16string& str)
{
	for (auto it = str.begin(); it != str.end(); ++it)
	{
		*it = toCompatibleHangulConsonant(*it);
	}
}

inline void insertPathIntoResults(
vector<TokenResult>& ret,
Vector<SpecialState>& spStatesByRet,
Expand Down Expand Up @@ -726,6 +734,12 @@ namespace kiwi
}
joined = joinHangul(s.str.empty() ? *s.morph->kform : s.str);
} while (0);

if (!!(matchOptions & Match::compatibleJamo))
{
toCompatibleJamo(joined);
}

rarr.emplace_back(joined, s.morph->tag);
auto& token = rarr.back();
token.morph = within(s.morph, pretokenizedGroup.morphemes) ? nullptr : s.morph;
Expand Down
15 changes: 0 additions & 15 deletions src/StrUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -728,21 +728,6 @@ namespace kiwi
}
}

/**
 * @brief Tests whether a UTF-16 code unit is a Hangul conjoining initial
 *        consonant (onset).
 * @param c code unit to classify
 * @return true when c lies in U+1100..U+1112 (inclusive)
 */
inline bool isHangulOnset(char16_t c)
{
	// The bounds are spelled as code points: the character literals that stood
	// here had lost their content, leaving ill-formed empty literals.
	return 0x1100 <= c && c <= 0x1112;
}

/**
 * @brief Tests whether a UTF-16 code unit is a Hangul compatibility jamo vowel.
 * @param c code unit to classify
 * @return true when c lies in U+314F..U+3163 (inclusive)
 */
inline bool isHangulVowel(char16_t c)
{
	// The bounds are spelled as code points: the character literals that stood
	// here had lost their content, leaving ill-formed empty literals.
	return 0x314F <= c && c <= 0x3163;
}

/**
 * @brief Composes the precomposed Hangul syllable (without a final consonant)
 *        for the given onset and vowel indices.
 * @param onset zero-based index of the initial consonant
 * @param vowel zero-based index of the vowel
 * @return 0xAC00 plus (onset * 21 + vowel) * 28, truncated to char16_t
 */
inline char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
	// The base syllable is spelled as a code point: the character literal that
	// stood here had lost its content, leaving an ill-formed empty literal.
	return 0xAC00 + static_cast<char16_t>((onset * 21 + vowel) * 28);
}

inline bool isChineseChr(char32_t c)
{
return (0x4E00 <= c && c <= 0x9FFF)
Expand Down
12 changes: 12 additions & 0 deletions src/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,4 +498,16 @@ namespace kiwi
return ret;
}

/**
 * Converts a Hangul conjoining consonant to a Hangul compatibility jamo.
 *
 * Code units accepted by isHangulOnset() are looked up by their offset from
 * U+1100, those accepted by isHangulCoda() by their offset from U+11A8.
 * Anything else is returned unchanged.
 *
 * NOTE(review): the lookups assume the two predicates accept at most 19 and
 * 27 consecutive code points respectively (the table lengths) - confirm
 * against their definitions.
 */
char16_t toCompatibleHangulConsonant(char16_t chr)
{
	static constexpr char16_t onsetTable[] = u"ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ";
	static constexpr char16_t codaTable[] = u"ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ";

	if (isHangulOnset(chr))
	{
		return onsetTable[chr - 0x1100];
	}
	if (isHangulCoda(chr))
	{
		return codaTable[chr - 0x11A8];
	}
	return chr;
}
}
21 changes: 21 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1228,6 +1228,27 @@ TEST(KiwiCpp, JoinAffix)
EXPECT_EQ(res5.first[5].str, u"배송되");
}

// Verifies that Match::compatibleJamo turns conjoining jamo in token strings
// into compatibility jamo, and that Match::none leaves them untouched.
TEST(KiwiCpp, CompatibleJamo)
{
	Kiwi& kiwi = reuseKiwiInstance();

	// Without the option, the coda-only endings keep their conjoining form.
	auto res1 = kiwi.analyze(u"이긴다. 이김. 이길것.", Match::none).first;
	EXPECT_EQ(res1.size(), 10);
	EXPECT_EQ(res1[1].str, u"ᆫ다");
	// The single-jamo expectations are written as escapes; the literals that
	// stood here had lost their content and compared against an empty string.
	EXPECT_EQ(res1[4].str, u"\u11B7"); // conjoining final MIEUM
	EXPECT_EQ(res1[7].str, u"\u11AF"); // conjoining final RIEUL

	// With the option, the same endings come out as compatibility jamo.
	auto res2 = kiwi.analyze(u"이긴다. 이김. 이길것.", Match::compatibleJamo).first;
	EXPECT_EQ(res2.size(), 10);
	EXPECT_EQ(res2[1].str, u"ㄴ다");
	EXPECT_EQ(res2[4].str, u"\u3141"); // compatibility MIEUM
	EXPECT_EQ(res2[7].str, u"\u3139"); // compatibility RIEUL

	// Every conjoining onset and coda is expected to map to its compatibility form.
	auto res3 = kiwi.analyze(u"ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑᄒ ᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ", Match::compatibleJamo).first;
	EXPECT_EQ(res3.size(), 2);
	EXPECT_EQ(res3[0].str, u"ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ");
	EXPECT_EQ(res3[1].str, u"ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ");
}

TEST(KiwiCpp, AutoJoiner)
{
Kiwi& kiwi = reuseKiwiInstance();
Expand Down

0 comments on commit 186bb57

Please sign in to comment.