Skip to content

Commit

Permalink
Merge pull request #167 from bab2min/dev_emoji
Browse files Browse the repository at this point in the history
emoji 태그 추가
  • Loading branch information
bab2min authored May 19, 2024
2 parents 0db05c4 + 8ce3c21 commit 5713b10
Show file tree
Hide file tree
Showing 15 changed files with 310 additions and 215 deletions.
4 changes: 2 additions & 2 deletions ModelGenerator/sj.knlm
Git LFS file not shown
4 changes: 2 additions & 2 deletions ModelGenerator/sj.morph
Git LFS file not shown
4 changes: 2 additions & 2 deletions ModelGenerator/skipbigram.mdl
Git LFS file not shown
16 changes: 9 additions & 7 deletions bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public static class Match {
hashtag = 1 << 2,
mention = 1 << 3,
serial = 1 << 4,
emoji = 1 << 5,
normalizeCoda = 1 << 16,
joinNounPrefix = 1 << 17,
joinNounSuffix = 1 << 18,
Expand Down Expand Up @@ -48,13 +49,13 @@ public static class POSTag {
vcp = 19, vcn = 20,
sf = 21, sp = 22, ss = 23, sso = 24, ssc = 25, se = 26, so = 27, sw = 28, sb = 29,
sl = 30, sh = 31, sn = 32,
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37,
jks = 38, jkc = 39, jkg = 40, jko = 41, jkb = 42, jkv = 43, jkq = 44, jx = 45, jc = 46,
ep = 47, ef = 48, ec = 49, etn = 50, etm = 51,
z_coda = 52,
user0 = 53, user1 = 54, user2 = 55, user3 = 56, user4 = 57,
p = 58,
max = 59,
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, w_emoji = 38,
jks = 39, jkc = 40, jkg = 41, jko = 42, jkb = 43, jkv = 44, jkq = 45, jx = 46, jc = 47,
ep = 48, ef = 49, ec = 50, etn = 51, etm = 52,
z_coda = 53,
user0 = 54, user1 = 55, user2 = 56, user3 = 57, user4 = 58,
p = 59,
max = 60,
pv = p,
pa = (byte)(p + 1),
irregular = - 128,
Expand Down Expand Up @@ -106,6 +107,7 @@ static String toString(byte tag) {
case w_mention: return "W_MENTION";
case w_hashtag: return "W_HASHTAG";
case w_serial: return "W_SERIAL";
case w_emoji: return "W_EMOJI";
case jks: return "JKS";
case jkc: return "JKC";
case jkg: return "JKG";
Expand Down
3 changes: 2 additions & 1 deletion bindings/java/kr/pe/bab2min/KiwiBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ public static class BuildOption {
integrateAllomorph = 1 << 0,
loadDefaultDict = 1 << 1,
loadTypoDict = 1 << 2,
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict;
loadMultiDict = 1 << 3,
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict;
}

public static class AnalyzedMorph {
Expand Down
3 changes: 2 additions & 1 deletion include/kiwi/PatternMatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ namespace kiwi
hashtag = 1 << 2, /**< 해시태그 형태의 텍스트(#해시)를 w_hashtag 태그에 매칭한다 */
mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */
serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */
emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */
normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */
joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */
joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */
Expand All @@ -25,7 +26,7 @@ namespace kiwi
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | zCoda,
all = url | email | hashtag | mention | serial | emoji | zCoda,
allWithNormalizing = all | normalizeCoda,
};

Expand Down
7 changes: 6 additions & 1 deletion include/kiwi/ScriptType.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,10 @@ namespace kiwi

const char* getScriptName(ScriptType type);

bool isEmoji(char32_t c0, char32_t c1 = 0);
/**
* @brief Check if the character is an emoji
*
* @return 0 if the character is not an emoji, 1 if c0 is an emoji, 2 if c0 and c1 are combined to form an emoji.
*/
int isEmoji(char32_t c0, char32_t c1 = 0);
}
2 changes: 1 addition & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ namespace kiwi
vcp, vcn,
sf, sp, ss, sso, ssc, se, so, sw, sb,
sl, sh, sn,
w_url, w_email, w_mention, w_hashtag, w_serial,
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
ep, ef, ec, etn, etm,
z_coda,
Expand Down
4 changes: 2 additions & 2 deletions include/kiwi/Utils.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#pragma once
#pragma once
#include <iostream>
#include <string>
#include <memory>
Expand Down Expand Up @@ -30,7 +30,7 @@ namespace kiwi

inline bool isWebTag(POSTag t)
{
return POSTag::w_url <= t && t <= POSTag::w_hashtag;
return POSTag::w_url <= t && t <= POSTag::w_emoji;
}

POSTag toPOSTag(const std::u16string& tagStr);
Expand Down
2 changes: 1 addition & 1 deletion src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ namespace kiwi

inline void updateTokenInfoScript(TokenInfo& info)
{
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return;
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw || info.tag == POSTag::w_emoji)) return;
if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return;
if (info.str.empty()) return;
char32_t c = info.str[0];
Expand Down
75 changes: 75 additions & 0 deletions src/PatternMatcher.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <kiwi/PatternMatcher.h>
#include <kiwi/Utils.h>
#include <kiwi/ScriptType.h>
#include "pattern.hpp"
#include "StrUtils.h"

using namespace std;
using namespace kiwi;
Expand All @@ -26,6 +28,7 @@ namespace kiwi
size_t testNumeric(const char16_t left, const char16_t* first, const char16_t* last) const;
size_t testSerial(const char16_t* first, const char16_t* last) const;
size_t testAbbr(const char16_t* first, const char16_t* last) const;
size_t testEmoji(const char16_t* first, const char16_t* last) const;

public:
std::pair<size_t, POSTag> match(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions) const;
Expand Down Expand Up @@ -290,6 +293,77 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last)
return b - first;
}

/**
 * @brief Measures the emoji sequence that begins at `first`.
 *
 * Each iteration decodes one code point (plus one look-ahead code point) from the
 * UTF-16 range and passes them to isEmoji(). After a recognized emoji, an optional
 * variation selector (U+FE00..U+FE0F) or skin tone modifier (U+1F3FB..U+1F3FF) is
 * consumed, and a following zero width joiner (U+200D) chains the scan to the next emoji.
 *
 * A zero width joiner that is not followed by another emoji is still counted in the
 * returned length.
 *
 * @param first pointer to the first UTF-16 code unit to examine
 * @param last  pointer one past the final readable code unit
 * @return number of UTF-16 code units consumed; 0 when the range does not start with an emoji
 */
size_t PatternMatcherImpl::testEmoji(const char16_t* first, const char16_t* last) const
{
	const char16_t* b = first;
	// Loop while at least one code unit remains, so that a single-unit (BMP) emoji
	// located at the very end of the range is examined as well.
	while (b < last)
	{
		char32_t c0 = 0, c1 = 0;
		const char16_t* b1 = b;
		if (isHighSurrogate(*b1))
		{
			// The low surrogate must be readable; stop on a truncated pair
			// instead of reading past `last`.
			if (b1 + 1 >= last) break;
			c0 = mergeSurrogate(b1[0], b1[1]);
			b1 += 2;
		}
		else
		{
			c0 = *b1++;
		}

		// Decode the look-ahead code point; c1 stays 0 when nothing follows c0.
		const char16_t* b2 = b1;
		if (b2 < last)
		{
			if (isHighSurrogate(*b2) && b2 + 1 < last)
			{
				c1 = mergeSurrogate(b2[0], b2[1]);
				b2 += 2;
			}
			else
			{
				c1 = *b2++;
			}
		}

		// isEmoji: 0 = not an emoji, 1 = c0 alone is an emoji, 2 = c0 and c1 combine into one.
		const auto r = isEmoji(c0, c1);
		if (r == 1)
		{
			b = b1;
		}
		else if (r == 2)
		{
			b = b2;
		}
		else
		{
			break;
		}

		if (b == last) return static_cast<size_t>(b - first);
		if (0xfe00 <= *b && *b <= 0xfe0f) // variation selectors
		{
			++b;
			if (b == last) return static_cast<size_t>(b - first);
		}
		else if (b + 1 < last && isHighSurrogate(b[0]))
		{
			c1 = mergeSurrogate(b[0], b[1]);
			if (0x1f3fb <= c1 && c1 <= 0x1f3ff) // skin color modifier
			{
				b += 2;
				if (b == last) return static_cast<size_t>(b - first);
			}
		}

		// Every path above returns when b reaches `last`, so *b is readable here.
		if (*b == 0x200d) // zero width joiner
		{
			++b;
			continue;
		}
		break;
	}
	return static_cast<size_t>(b - first);
}

pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * first, const char16_t * last, Match matchOptions) const
{
size_t size;
Expand All @@ -299,6 +373,7 @@ pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * f
if (!!(matchOptions & Match::email) && (size = testEmail(first, last))) return make_pair(size, POSTag::w_email);
if (!!(matchOptions & Match::mention) && (size = testMention(first, last))) return make_pair(size, POSTag::w_mention);
if (!!(matchOptions & Match::url) && (size = testUrl(first, last))) return make_pair(size, POSTag::w_url);
if (!!(matchOptions & Match::emoji) && (size = testEmoji(first, last))) return make_pair(size, POSTag::w_emoji);
if ((size = testAbbr(first, last))) return make_pair(size, POSTag::sl);
return make_pair(0, POSTag::unknown);
}
Expand Down
Loading

0 comments on commit 5713b10

Please sign in to comment.