From e02799c66abd14bbfd5c913373b4dcf5e1e9b422 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 13 May 2024 01:51:36 +0900 Subject: [PATCH 1/5] added `TokenInfo::script` (#19) --- include/kiwi/ScriptType.h | 243 ++++++++++++ include/kiwi/Types.h | 6 +- src/KTrie.cpp | 22 +- src/Kiwi.cpp | 19 + src/ScriptType.cpp | 569 +++++++++++++++++++++++++++++ vsproj/kiwi_shared_library.vcxproj | 5 +- 6 files changed, 860 insertions(+), 4 deletions(-) create mode 100644 include/kiwi/ScriptType.h create mode 100644 src/ScriptType.cpp diff --git a/include/kiwi/ScriptType.h b/include/kiwi/ScriptType.h new file mode 100644 index 00000000..73f54715 --- /dev/null +++ b/include/kiwi/ScriptType.h @@ -0,0 +1,243 @@ +#pragma once +#include /* NOTE(review): the header name after #include is missing in this copy (angle-bracket text appears to have been stripped); uint8_t below needs <cstdint> - confirm against the repository. */ + +namespace kiwi +{ + /** One-byte category describing which script / Unicode-block family a character belongs to. `unknown` is the first enumerator and is the value chr2ScriptType falls back to when none of its ranges match. Enumerator order fixes the underlying values, so new entries should presumably be appended rather than inserted - TODO confirm nothing serializes these values. */ enum class ScriptType : uint8_t + { + unknown, + latin, + ipa_extensions, + spacing_modifier_letters, + combining_diacritical_marks, + greek_and_coptic, + cyrillic, + armenian, + hebrew, + arabic, + syriac, + thaana, + nko, + samaritan, + mandaic, + devanagari, + bengali, + gurmukhi, + gujarati, + oriya, + tamil, + telugu, + kannada, + malayalam, + sinhala, + thai, + lao, + tibetan, + myanmar, + georgian, + hangul, + ethiopic, + cherokee, + unified_canadian_aboriginal_syllabics, + ogham, + runic, + tagalog, + hanunoo, + buhid, + tagbanwa, + khmer, + mongolian, + limbu, + tai_le, + new_tai_lue, + khmer_symbols, + buginese, + tai_tham, + balinese, + sundanese, + batak, + lepcha, + ol_chiki, + phonetic_extensions, + punctuation, + superscripts_and_subscripts, + currency_symbols, + combining_diacritical_marks_for_symbols, + letterlike_symbols, + number_forms, + arrows, + mathematical, + miscellaneous_technical, + control_pictures, + optical_character_recognition, + enclosed_alphanumerics, + box_drawing, + block_elements, + geometric_shapes, + miscellaneous_symbols, + dingbats, + braille_patterns, + glagolitic, + tifinagh, + hanja, + ideographic_description_characters, + kana, + bopomofo, + kanbun, + yijing_hexagram_symbols, + yi, + 
lisu, + vai, + bamum, + modifier_tone_letters, + syloti_nagri, + common_indic_number_forms, + phags_pa, + saurashtra, + kayah_li, + rejang, + javanese, + cham, + tai_viet, + meetei_mayek, + private_use_area, + alphabetic_presentation_forms, + arabic_presentation_forms_a, + variation_selectors, + vertical_forms, + combining_half_marks, + small_form_variants, + arabic_presentation_forms_b, + halfwidth_and_fullwidth_forms, + specials, + linear_b, + aegean_numbers, + ancient_greek_numbers, + ancient_symbols, + phaistos_disc, + lycian, + carian, + coptic_epact_numbers, + old_italic, + gothic, + old_permic, + ugaritic, + old_persian, + deseret, + shavian, + osmanya, + osage, + elbasan, + caucasian_albanian, + vithkuqi, + linear_a, + cypriot_syllabary, + imperial_aramaic, + palmyrene, + nabataean, + hatran, + phoenician, + lydian, + meroitic_hieroglyphs, + meroitic_cursive, + kharoshthi, + old_south_arabian, + old_north_arabian, + manichaean, + avestan, + inscriptional_parthian, + inscriptional_pahlavi, + psalter_pahlavi, + old_turkic, + old_hungarian, + hanifi_rohingya, + rumi_numeral_symbols, + yezidi, + old_sogdian, + sogdian, + old_uyghur, + chorasmian, + elymaic, + brahmi, + kaithi, + sora_sompeng, + chakma, + mahajani, + sharada, + sinhala_archaic_numbers, + khojki, + multani, + khudawadi, + grantha, + newa, + tirhuta, + siddham, + modi, + takri, + ahom, + dogra, + warang_citi, + dives_akuru, + nandinagari, + zanabazar_square, + soyombo, + pau_cin_hau, + bhaiksuki, + marchen, + masaram_gondi, + gunjala_gondi, + makasar, + kawi, + cuneiform, + early_dynastic_cuneiform, + cypro_minoan, + egyptian_hieroglyphs, + anatolian_hieroglyphs, + mro, + tangsa, + bassa_vah, + pahawh_hmong, + medefaidrin, + miao, + ideographic_symbols_and_punctuation, + tangut, + khitan_small_script, + nushu, + duployan, + shorthand_format_controls, + znamenny_musical_notation, + byzantine_musical_symbols, + musical_symbols, + ancient_greek_musical_notation, + kaktovik_numerals, + mayan_numerals, 
+ tai_xuan_jing_symbols, + counting_rod_numerals, + mathematical_alphanumeric_symbols, + sutton_signwriting, + nyiakeng_puachue_hmong, + toto, + wancho, + nag_mundari, + mende_kikakui, + adlam, + indic_siyaq_numbers, + ottoman_siyaq_numbers, + arabic_mathematical_alphabetic_symbols, + mahjong_tiles, + domino_tiles, + playing_cards, + enclosed_ideographic_supplement, + symbols_and_pictographs, + emoticons, + transport_and_map_symbols, + alchemical_symbols, + chess_symbols, + symbols_for_legacy_computing, + tags, + }; + + /** Maps a single code point `c` to its ScriptType by range lookup; yields ScriptType::unknown when `c` is in none of the listed ranges. */ ScriptType chr2ScriptType(char32_t c); + + /** Returns a display name for `type` as a string literal, e.g. "Latin" for ScriptType::latin. */ const char* getScriptName(ScriptType type); +} diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index 6bbd3dcb..b3ab7412 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -26,6 +26,7 @@ #endif #include "TemplateUtils.hpp" +#include "ScriptType.h" #define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \ inline Type operator~(Type a)\ @@ -306,7 +307,10 @@ namespace kiwi uint32_t lineNumber = 0; /**< 줄 번호*/ uint16_t length = 0; /**< 길이(UTF16 문자 기준) */ POSTag tag = POSTag::unknown; /**< 품사 태그 */ - uint8_t senseId = 0; /**< 의미 번호 */ + union { + uint8_t senseId = 0; /**< 의미 번호 */ + ScriptType script; /**< 유니코드 영역에 기반한 문자 타입 */ + }; float score = 0; /**< 해당 형태소의 언어모델 점수 */ float typoCost = 0; /**< 오타가 교정된 경우 오타 비용. 
그렇지 않은 경우 0 */ uint32_t typoFormId = 0; /**< 교정 전 오타의 형태에 대한 정보 (typoCost가 0인 경우 PreTokenizedSpan의 ID값) */ diff --git a/src/KTrie.cpp b/src/KTrie.cpp index 08cfbf7f..d02b16c7 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -527,6 +527,16 @@ namespace kiwi } } +/** Tells whether the current character starts a new run relative to the previous one. When both tags are among POSTag::sl, POSTag::sh and POSTag::sw the tags themselves are not compared: the result is `prevScript != curScript`, so a change between those three tags with the same script is treated as continuous. In every other case the result is simply `prevTag != curTag` and the scripts are ignored. */ inline bool isDiscontinuous(POSTag prevTag, POSTag curTag, ScriptType prevScript, ScriptType curScript) +{ + if ((prevTag == POSTag::sl || prevTag == POSTag::sh || prevTag == POSTag::sw) && + (curTag == POSTag::sl || curTag == POSTag::sh || curTag == POSTag::sw)) + { + return prevScript != curScript; + } + return prevTag != curTag; +} + template size_t kiwi::splitByTrie( Vector& ret, @@ -585,6 +595,7 @@ size_t kiwi::splitByTrie( size_t lastSpecialEndPos = 0, specialStartPos = 0; POSTag chrType, lastChrType = POSTag::unknown, lastMatchedPattern = POSTag::unknown; + ScriptType scriptType, lastScriptType = ScriptType::unknown; auto flushBranch = [&](size_t unkFormEndPos = 0, size_t unkFormEndPosWithSpace = 0, bool specialMatched = false) { if (!candidates.empty()) @@ -836,8 +847,14 @@ size_t kiwi::splitByTrie( } chrType = identifySpecialChr(c32); + scriptType = chr2ScriptType(c32); + if (lastChrType == POSTag::sw && c32 == 0x200d) // zero width joiner + { + chrType = lastChrType; + scriptType = lastScriptType; + } - if (lastChrType != chrType || lastChrType == POSTag::sso || lastChrType == POSTag::ssc) + if (isDiscontinuous(lastChrType, chrType, lastScriptType, scriptType) || lastChrType == POSTag::sso || lastChrType == POSTag::ssc) { // sequence of speical characters found if (lastChrType != POSTag::max && lastChrType != POSTag::unknown && lastChrType != lastMatchedPattern) @@ -875,6 +892,7 @@ size_t kiwi::splitByTrie( if (!isSpace(str[n - 3]) && !isSpace(str[n - 2])) { lastChrType = chrType; + lastScriptType = scriptType; break; } } @@ -882,6 +900,7 @@ size_t kiwi::splitByTrie( else if (n >= 8192) { lastChrType = chrType; + lastScriptType = scriptType; break; } @@ -1021,6 +1040,7 @@ size_t kiwi::splitByTrie( 
} continueFor: lastChrType = chrType; + lastScriptType = scriptType; } // sequence of speical characters found diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index f3bde52d..6a8428bb 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -598,6 +598,23 @@ namespace kiwi return ++first; } + /** Fills in `info.script` for tokens tagged POSTag::sl, POSTag::sh or POSTag::sw. Leaves `info` untouched when the tag is anything else, when `info.morph` has a non-empty `kform`, or when `info.str` is empty. Otherwise the first code point of `info.str` is classified with chr2ScriptType and stored in `info.script`; if the result is ScriptType::latin the tag is additionally rewritten to POSTag::sl. NOTE(review): `script` shares a union with `senseId` in the Types.h hunk of this patch, so this write replaces the senseId assigned just before the call - presumably intended for these tags, confirm. */ inline void updateTokenInfoScript(TokenInfo& info) + { + if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return; + if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return; + if (info.str.empty()) return; + char32_t c = info.str[0]; + if (isHighSurrogate(c)) /* first unit is a high surrogate: combine it with the following unit into one code point. NOTE(review): info.str[1] is read without checking that a second unit exists - confirm an unpaired high surrogate cannot reach here. */ + { + c = mergeSurrogate(c, info.str[1]); + } + info.script = chr2ScriptType(c); + if (info.script == ScriptType::latin) + { + info.tag = POSTag::sl; + } + } + inline void insertPathIntoResults( vector& ret, Vector& spStatesByRet, @@ -718,6 +735,8 @@ namespace kiwi token.score = s.wordScore; token.typoCost = s.typoCost; token.typoFormId = s.typoFormId; + token.senseId = s.morph->senseId; + updateTokenInfoScript(token); auto ptId = nodeInWhichPretokenized[s.nodeId] + 1; if (ptId) { diff --git a/src/ScriptType.cpp b/src/ScriptType.cpp new file mode 100644 index 00000000..c346a592 --- /dev/null +++ b/src/ScriptType.cpp @@ -0,0 +1,569 @@ +#pragma once /* NOTE(review): #pragma once serves no purpose in a .cpp file that is compiled directly and some compilers warn about it there - consider dropping it. */ +#include /* NOTE(review): the include target is missing in this copy (angle-bracket text appears stripped); presumably the ScriptType.h header that declares ScriptType - confirm against the repository. */ + +namespace kiwi +{ + /** Classifies code point `c` by testing it against a fixed list of inclusive [low, high] ranges in order and returning the ScriptType attached to the first range that contains it; returns ScriptType::unknown when no range matches. Below 0x80 only the letters A-Z and a-z are listed, so other ASCII characters come back as unknown. The lookup is a linear chain of comparisons, so code points matched late in the list or not at all pay for every preceding test. */ ScriptType chr2ScriptType(char32_t c) + { + if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) return ScriptType::latin; + if (0x80 <= c && c <= 0xff) return ScriptType::latin; + if (0x100 <= c && c <= 0x17f) return ScriptType::latin; + if (0x180 <= c && c <= 0x24f) return ScriptType::latin; + if (0x1e00 <= c && c <= 0x1eff) return ScriptType::latin; + if (0x2c60 <= c && c <= 0x2c7f) return ScriptType::latin; + if (0xa720 <= c && c <= 0xa7ff) return ScriptType::latin; + if (0xab30 <= c && c <= 0xab6f) return ScriptType::latin; + if (0x10780 <= c && c <= 0x107bf) return ScriptType::latin; + if (0x1df00 <= c && c <= 0x1dfff) return ScriptType::latin; + if (0x250 <= c && c <= 0x2af) return 
ScriptType::ipa_extensions; + if (0x2b0 <= c && c <= 0x2ff) return ScriptType::spacing_modifier_letters; + if (0x300 <= c && c <= 0x36f) return ScriptType::combining_diacritical_marks; + if (0x1ab0 <= c && c <= 0x1aff) return ScriptType::combining_diacritical_marks; + if (0x1dc0 <= c && c <= 0x1dff) return ScriptType::combining_diacritical_marks; + if (0x370 <= c && c <= 0x3ff) return ScriptType::greek_and_coptic; + if (0x1f00 <= c && c <= 0x1fff) return ScriptType::greek_and_coptic; + if (0x2c80 <= c && c <= 0x2cff) return ScriptType::greek_and_coptic; + if (0x400 <= c && c <= 0x4ff) return ScriptType::cyrillic; + if (0x500 <= c && c <= 0x52f) return ScriptType::cyrillic; + if (0x1c80 <= c && c <= 0x1c8f) return ScriptType::cyrillic; + if (0x2de0 <= c && c <= 0x2dff) return ScriptType::cyrillic; + if (0xa640 <= c && c <= 0xa69f) return ScriptType::cyrillic; + if (0x1e030 <= c && c <= 0x1e08f) return ScriptType::cyrillic; + if (0x530 <= c && c <= 0x58f) return ScriptType::armenian; + if (0x590 <= c && c <= 0x5ff) return ScriptType::hebrew; + if (0x600 <= c && c <= 0x6ff) return ScriptType::arabic; + if (0x750 <= c && c <= 0x77f) return ScriptType::arabic; + if (0x870 <= c && c <= 0x89f) return ScriptType::arabic; + if (0x8a0 <= c && c <= 0x8ff) return ScriptType::arabic; + if (0x10ec0 <= c && c <= 0x10eff) return ScriptType::arabic; + if (0x700 <= c && c <= 0x74f) return ScriptType::syriac; + if (0x860 <= c && c <= 0x86f) return ScriptType::syriac; + if (0x780 <= c && c <= 0x7bf) return ScriptType::thaana; + if (0x7c0 <= c && c <= 0x7ff) return ScriptType::nko; + if (0x800 <= c && c <= 0x83f) return ScriptType::samaritan; + if (0x840 <= c && c <= 0x85f) return ScriptType::mandaic; + if (0x900 <= c && c <= 0x97f) return ScriptType::devanagari; + if (0x1cd0 <= c && c <= 0x1cff) return ScriptType::devanagari; + if (0xa8e0 <= c && c <= 0xa8ff) return ScriptType::devanagari; + if (0x11b00 <= c && c <= 0x11b5f) return ScriptType::devanagari; + if (0x980 <= c && c <= 
0x9ff) return ScriptType::bengali; + if (0xa00 <= c && c <= 0xa7f) return ScriptType::gurmukhi; + if (0xa80 <= c && c <= 0xaff) return ScriptType::gujarati; + if (0xb00 <= c && c <= 0xb7f) return ScriptType::oriya; + if (0xb80 <= c && c <= 0xbff) return ScriptType::tamil; + if (0x11fc0 <= c && c <= 0x11fff) return ScriptType::tamil; + if (0xc00 <= c && c <= 0xc7f) return ScriptType::telugu; + if (0xc80 <= c && c <= 0xcff) return ScriptType::kannada; + if (0xd00 <= c && c <= 0xd7f) return ScriptType::malayalam; + if (0xd80 <= c && c <= 0xdff) return ScriptType::sinhala; + if (0xe00 <= c && c <= 0xe7f) return ScriptType::thai; + if (0xe80 <= c && c <= 0xeff) return ScriptType::lao; + if (0xf00 <= c && c <= 0xfff) return ScriptType::tibetan; + if (0x1000 <= c && c <= 0x109f) return ScriptType::myanmar; + if (0xa9e0 <= c && c <= 0xa9ff) return ScriptType::myanmar; + if (0xaa60 <= c && c <= 0xaa7f) return ScriptType::myanmar; + if (0x10a0 <= c && c <= 0x10ff) return ScriptType::georgian; + if (0x1c90 <= c && c <= 0x1cbf) return ScriptType::georgian; + if (0x2d00 <= c && c <= 0x2d2f) return ScriptType::georgian; + if (0x1100 <= c && c <= 0x11ff) return ScriptType::hangul; + if (0x3130 <= c && c <= 0x318f) return ScriptType::hangul; + if (0xa960 <= c && c <= 0xa97f) return ScriptType::hangul; + if (0xac00 <= c && c <= 0xd7af) return ScriptType::hangul; + if (0xd7b0 <= c && c <= 0xd7ff) return ScriptType::hangul; + if (0x1200 <= c && c <= 0x137f) return ScriptType::ethiopic; + if (0x1380 <= c && c <= 0x139f) return ScriptType::ethiopic; + if (0x2d80 <= c && c <= 0x2ddf) return ScriptType::ethiopic; + if (0xab00 <= c && c <= 0xab2f) return ScriptType::ethiopic; + if (0x1e7e0 <= c && c <= 0x1e7ff) return ScriptType::ethiopic; + if (0x13a0 <= c && c <= 0x13ff) return ScriptType::cherokee; + if (0xab70 <= c && c <= 0xabbf) return ScriptType::cherokee; + if (0x1400 <= c && c <= 0x167f) return ScriptType::unified_canadian_aboriginal_syllabics; + if (0x18b0 <= c && c <= 0x18ff) 
return ScriptType::unified_canadian_aboriginal_syllabics; + if (0x11ab0 <= c && c <= 0x11abf) return ScriptType::unified_canadian_aboriginal_syllabics; + if (0x1680 <= c && c <= 0x169f) return ScriptType::ogham; + if (0x16a0 <= c && c <= 0x16ff) return ScriptType::runic; + if (0x1700 <= c && c <= 0x171f) return ScriptType::tagalog; + if (0x1720 <= c && c <= 0x173f) return ScriptType::hanunoo; + if (0x1740 <= c && c <= 0x175f) return ScriptType::buhid; + if (0x1760 <= c && c <= 0x177f) return ScriptType::tagbanwa; + if (0x1780 <= c && c <= 0x17ff) return ScriptType::khmer; + if (0x1800 <= c && c <= 0x18af) return ScriptType::mongolian; + if (0x11660 <= c && c <= 0x1167f) return ScriptType::mongolian; + if (0x1900 <= c && c <= 0x194f) return ScriptType::limbu; + if (0x1950 <= c && c <= 0x197f) return ScriptType::tai_le; + if (0x1980 <= c && c <= 0x19df) return ScriptType::new_tai_lue; + if (0x19e0 <= c && c <= 0x19ff) return ScriptType::khmer_symbols; + if (0x1a00 <= c && c <= 0x1a1f) return ScriptType::buginese; + if (0x1a20 <= c && c <= 0x1aaf) return ScriptType::tai_tham; + if (0x1b00 <= c && c <= 0x1b7f) return ScriptType::balinese; + if (0x1b80 <= c && c <= 0x1bbf) return ScriptType::sundanese; + if (0x1cc0 <= c && c <= 0x1ccf) return ScriptType::sundanese; + if (0x1bc0 <= c && c <= 0x1bff) return ScriptType::batak; + if (0x1c00 <= c && c <= 0x1c4f) return ScriptType::lepcha; + if (0x1c50 <= c && c <= 0x1c7f) return ScriptType::ol_chiki; + if (0x1d00 <= c && c <= 0x1d7f) return ScriptType::phonetic_extensions; + if (0x1d80 <= c && c <= 0x1dbf) return ScriptType::phonetic_extensions; + if (0x2000 <= c && c <= 0x206f) return ScriptType::punctuation; + if (0x2e00 <= c && c <= 0x2e7f) return ScriptType::punctuation; + if (0x2070 <= c && c <= 0x209f) return ScriptType::superscripts_and_subscripts; + if (0x20a0 <= c && c <= 0x20cf) return ScriptType::currency_symbols; + if (0x20d0 <= c && c <= 0x20ff) return ScriptType::combining_diacritical_marks_for_symbols; + if 
(0x2100 <= c && c <= 0x214f) return ScriptType::letterlike_symbols; + if (0x2150 <= c && c <= 0x218f) return ScriptType::number_forms; + if (0x2190 <= c && c <= 0x21ff) return ScriptType::arrows; + if (0x27f0 <= c && c <= 0x27ff) return ScriptType::arrows; + if (0x2900 <= c && c <= 0x297f) return ScriptType::arrows; + if (0x2b00 <= c && c <= 0x2bff) return ScriptType::arrows; + if (0x1f800 <= c && c <= 0x1f8ff) return ScriptType::arrows; + if (0x2200 <= c && c <= 0x22ff) return ScriptType::mathematical; + if (0x27c0 <= c && c <= 0x27ef) return ScriptType::mathematical; + if (0x2980 <= c && c <= 0x29ff) return ScriptType::mathematical; + if (0x2a00 <= c && c <= 0x2aff) return ScriptType::mathematical; + if (0x2300 <= c && c <= 0x23ff) return ScriptType::miscellaneous_technical; + if (0x2400 <= c && c <= 0x243f) return ScriptType::control_pictures; + if (0x2440 <= c && c <= 0x245f) return ScriptType::optical_character_recognition; + if (0x2460 <= c && c <= 0x24ff) return ScriptType::enclosed_alphanumerics; + if (0x1f100 <= c && c <= 0x1f1ff) return ScriptType::enclosed_alphanumerics; + if (0x2500 <= c && c <= 0x257f) return ScriptType::box_drawing; + if (0x2580 <= c && c <= 0x259f) return ScriptType::block_elements; + if (0x25a0 <= c && c <= 0x25ff) return ScriptType::geometric_shapes; + if (0x1f780 <= c && c <= 0x1f7ff) return ScriptType::geometric_shapes; + if (0x2600 <= c && c <= 0x26ff) return ScriptType::miscellaneous_symbols; + if (0x2700 <= c && c <= 0x27bf) return ScriptType::dingbats; + if (0x1f650 <= c && c <= 0x1f67f) return ScriptType::dingbats; + if (0x2800 <= c && c <= 0x28ff) return ScriptType::braille_patterns; + if (0x2c00 <= c && c <= 0x2c5f) return ScriptType::glagolitic; + if (0x1e000 <= c && c <= 0x1e02f) return ScriptType::glagolitic; + if (0x2d30 <= c && c <= 0x2d7f) return ScriptType::tifinagh; + if (0x2e80 <= c && c <= 0x2eff) return ScriptType::hanja; + if (0x2f00 <= c && c <= 0x2fdf) return ScriptType::hanja; + if (0x3000 <= c && c <= 
0x303f) return ScriptType::hanja; + if (0x31c0 <= c && c <= 0x31ef) return ScriptType::hanja; + if (0x3200 <= c && c <= 0x32ff) return ScriptType::hanja; + if (0x3300 <= c && c <= 0x33ff) return ScriptType::hanja; + if (0x3400 <= c && c <= 0x4dbf) return ScriptType::hanja; + if (0x4e00 <= c && c <= 0x9fff) return ScriptType::hanja; + if (0xf900 <= c && c <= 0xfaff) return ScriptType::hanja; + if (0xfe30 <= c && c <= 0xfe4f) return ScriptType::hanja; + if (0x20000 <= c && c <= 0x2a6df) return ScriptType::hanja; + if (0x2a700 <= c && c <= 0x2b73f) return ScriptType::hanja; + if (0x2b740 <= c && c <= 0x2b81f) return ScriptType::hanja; + if (0x2b820 <= c && c <= 0x2ceaf) return ScriptType::hanja; + if (0x2ceb0 <= c && c <= 0x2ebef) return ScriptType::hanja; + if (0x2ebf0 <= c && c <= 0x2ee5f) return ScriptType::hanja; + if (0x2f800 <= c && c <= 0x2fa1f) return ScriptType::hanja; + if (0x30000 <= c && c <= 0x3134f) return ScriptType::hanja; + if (0x31350 <= c && c <= 0x323af) return ScriptType::hanja; + if (0x2ff0 <= c && c <= 0x2fff) return ScriptType::ideographic_description_characters; + if (0x3040 <= c && c <= 0x309f) return ScriptType::kana; + if (0x30a0 <= c && c <= 0x30ff) return ScriptType::kana; + if (0x31f0 <= c && c <= 0x31ff) return ScriptType::kana; + if (0x1aff0 <= c && c <= 0x1afff) return ScriptType::kana; + if (0x1b000 <= c && c <= 0x1b0ff) return ScriptType::kana; + if (0x1b100 <= c && c <= 0x1b12f) return ScriptType::kana; + if (0x1b130 <= c && c <= 0x1b16f) return ScriptType::kana; + if (0x3100 <= c && c <= 0x312f) return ScriptType::bopomofo; + if (0x31a0 <= c && c <= 0x31bf) return ScriptType::bopomofo; + if (0x3190 <= c && c <= 0x319f) return ScriptType::kanbun; + if (0x4dc0 <= c && c <= 0x4dff) return ScriptType::yijing_hexagram_symbols; + if (0xa000 <= c && c <= 0xa48f) return ScriptType::yi; + if (0xa490 <= c && c <= 0xa4cf) return ScriptType::yi; + if (0xa4d0 <= c && c <= 0xa4ff) return ScriptType::lisu; + if (0x11fb0 <= c && c <= 0x11fbf) 
return ScriptType::lisu; + if (0xa500 <= c && c <= 0xa63f) return ScriptType::vai; + if (0xa6a0 <= c && c <= 0xa6ff) return ScriptType::bamum; + if (0x16800 <= c && c <= 0x16a3f) return ScriptType::bamum; + if (0xa700 <= c && c <= 0xa71f) return ScriptType::modifier_tone_letters; + if (0xa800 <= c && c <= 0xa82f) return ScriptType::syloti_nagri; + if (0xa830 <= c && c <= 0xa83f) return ScriptType::common_indic_number_forms; + if (0xa840 <= c && c <= 0xa87f) return ScriptType::phags_pa; + if (0xa880 <= c && c <= 0xa8df) return ScriptType::saurashtra; + if (0xa900 <= c && c <= 0xa92f) return ScriptType::kayah_li; + if (0xa930 <= c && c <= 0xa95f) return ScriptType::rejang; + if (0xa980 <= c && c <= 0xa9df) return ScriptType::javanese; + if (0xaa00 <= c && c <= 0xaa5f) return ScriptType::cham; + if (0xaa80 <= c && c <= 0xaadf) return ScriptType::tai_viet; + if (0xaae0 <= c && c <= 0xaaff) return ScriptType::meetei_mayek; + if (0xabc0 <= c && c <= 0xabff) return ScriptType::meetei_mayek; + if (0xe000 <= c && c <= 0xf8ff) return ScriptType::private_use_area; + if (0xf0000 <= c && c <= 0xfffff) return ScriptType::private_use_area; + if (0x100000 <= c && c <= 0x10ffff) return ScriptType::private_use_area; + if (0xfb00 <= c && c <= 0xfb4f) return ScriptType::alphabetic_presentation_forms; + if (0xfb50 <= c && c <= 0xfdff) return ScriptType::arabic_presentation_forms_a; + if (0xfe00 <= c && c <= 0xfe0f) return ScriptType::variation_selectors; + if (0xe0100 <= c && c <= 0xe01ef) return ScriptType::variation_selectors; + if (0xfe10 <= c && c <= 0xfe1f) return ScriptType::vertical_forms; + if (0xfe20 <= c && c <= 0xfe2f) return ScriptType::combining_half_marks; + if (0xfe50 <= c && c <= 0xfe6f) return ScriptType::small_form_variants; + if (0xfe70 <= c && c <= 0xfeff) return ScriptType::arabic_presentation_forms_b; + if (0xff00 <= c && c <= 0xffef) return ScriptType::halfwidth_and_fullwidth_forms; + if (0xfff0 <= c && c <= 0xffff) return ScriptType::specials; + if (0x10000 <= c 
&& c <= 0x1007f) return ScriptType::linear_b; + if (0x10080 <= c && c <= 0x100ff) return ScriptType::linear_b; + if (0x10100 <= c && c <= 0x1013f) return ScriptType::aegean_numbers; + if (0x10140 <= c && c <= 0x1018f) return ScriptType::ancient_greek_numbers; + if (0x10190 <= c && c <= 0x101cf) return ScriptType::ancient_symbols; + if (0x101d0 <= c && c <= 0x101ff) return ScriptType::phaistos_disc; + if (0x10280 <= c && c <= 0x1029f) return ScriptType::lycian; + if (0x102a0 <= c && c <= 0x102df) return ScriptType::carian; + if (0x102e0 <= c && c <= 0x102ff) return ScriptType::coptic_epact_numbers; + if (0x10300 <= c && c <= 0x1032f) return ScriptType::old_italic; + if (0x10330 <= c && c <= 0x1034f) return ScriptType::gothic; + if (0x10350 <= c && c <= 0x1037f) return ScriptType::old_permic; + if (0x10380 <= c && c <= 0x1039f) return ScriptType::ugaritic; + if (0x103a0 <= c && c <= 0x103df) return ScriptType::old_persian; + if (0x10400 <= c && c <= 0x1044f) return ScriptType::deseret; + if (0x10450 <= c && c <= 0x1047f) return ScriptType::shavian; + if (0x10480 <= c && c <= 0x104af) return ScriptType::osmanya; + if (0x104b0 <= c && c <= 0x104ff) return ScriptType::osage; + if (0x10500 <= c && c <= 0x1052f) return ScriptType::elbasan; + if (0x10530 <= c && c <= 0x1056f) return ScriptType::caucasian_albanian; + if (0x10570 <= c && c <= 0x105bf) return ScriptType::vithkuqi; + if (0x10600 <= c && c <= 0x1077f) return ScriptType::linear_a; + if (0x10800 <= c && c <= 0x1083f) return ScriptType::cypriot_syllabary; + if (0x10840 <= c && c <= 0x1085f) return ScriptType::imperial_aramaic; + if (0x10860 <= c && c <= 0x1087f) return ScriptType::palmyrene; + if (0x10880 <= c && c <= 0x108af) return ScriptType::nabataean; + if (0x108e0 <= c && c <= 0x108ff) return ScriptType::hatran; + if (0x10900 <= c && c <= 0x1091f) return ScriptType::phoenician; + if (0x10920 <= c && c <= 0x1093f) return ScriptType::lydian; + if (0x10980 <= c && c <= 0x1099f) return 
ScriptType::meroitic_hieroglyphs; + if (0x109a0 <= c && c <= 0x109ff) return ScriptType::meroitic_cursive; + if (0x10a00 <= c && c <= 0x10a5f) return ScriptType::kharoshthi; + if (0x10a60 <= c && c <= 0x10a7f) return ScriptType::old_south_arabian; + if (0x10a80 <= c && c <= 0x10a9f) return ScriptType::old_north_arabian; + if (0x10ac0 <= c && c <= 0x10aff) return ScriptType::manichaean; + if (0x10b00 <= c && c <= 0x10b3f) return ScriptType::avestan; + if (0x10b40 <= c && c <= 0x10b5f) return ScriptType::inscriptional_parthian; + if (0x10b60 <= c && c <= 0x10b7f) return ScriptType::inscriptional_pahlavi; + if (0x10b80 <= c && c <= 0x10baf) return ScriptType::psalter_pahlavi; + if (0x10c00 <= c && c <= 0x10c4f) return ScriptType::old_turkic; + if (0x10c80 <= c && c <= 0x10cff) return ScriptType::old_hungarian; + if (0x10d00 <= c && c <= 0x10d3f) return ScriptType::hanifi_rohingya; + if (0x10e60 <= c && c <= 0x10e7f) return ScriptType::rumi_numeral_symbols; + if (0x10e80 <= c && c <= 0x10ebf) return ScriptType::yezidi; + if (0x10f00 <= c && c <= 0x10f2f) return ScriptType::old_sogdian; + if (0x10f30 <= c && c <= 0x10f6f) return ScriptType::sogdian; + if (0x10f70 <= c && c <= 0x10faf) return ScriptType::old_uyghur; + if (0x10fb0 <= c && c <= 0x10fdf) return ScriptType::chorasmian; + if (0x10fe0 <= c && c <= 0x10fff) return ScriptType::elymaic; + if (0x11000 <= c && c <= 0x1107f) return ScriptType::brahmi; + if (0x11080 <= c && c <= 0x110cf) return ScriptType::kaithi; + if (0x110d0 <= c && c <= 0x110ff) return ScriptType::sora_sompeng; + if (0x11100 <= c && c <= 0x1114f) return ScriptType::chakma; + if (0x11150 <= c && c <= 0x1117f) return ScriptType::mahajani; + if (0x11180 <= c && c <= 0x111df) return ScriptType::sharada; + if (0x111e0 <= c && c <= 0x111ff) return ScriptType::sinhala_archaic_numbers; + if (0x11200 <= c && c <= 0x1124f) return ScriptType::khojki; + if (0x11280 <= c && c <= 0x112af) return ScriptType::multani; + if (0x112b0 <= c && c <= 0x112ff) return 
ScriptType::khudawadi; + if (0x11300 <= c && c <= 0x1137f) return ScriptType::grantha; + if (0x11400 <= c && c <= 0x1147f) return ScriptType::newa; + if (0x11480 <= c && c <= 0x114df) return ScriptType::tirhuta; + if (0x11580 <= c && c <= 0x115ff) return ScriptType::siddham; + if (0x11600 <= c && c <= 0x1165f) return ScriptType::modi; + if (0x11680 <= c && c <= 0x116cf) return ScriptType::takri; + if (0x11700 <= c && c <= 0x1174f) return ScriptType::ahom; + if (0x11800 <= c && c <= 0x1184f) return ScriptType::dogra; + if (0x118a0 <= c && c <= 0x118ff) return ScriptType::warang_citi; + if (0x11900 <= c && c <= 0x1195f) return ScriptType::dives_akuru; + if (0x119a0 <= c && c <= 0x119ff) return ScriptType::nandinagari; + if (0x11a00 <= c && c <= 0x11a4f) return ScriptType::zanabazar_square; + if (0x11a50 <= c && c <= 0x11aaf) return ScriptType::soyombo; + if (0x11ac0 <= c && c <= 0x11aff) return ScriptType::pau_cin_hau; + if (0x11c00 <= c && c <= 0x11c6f) return ScriptType::bhaiksuki; + if (0x11c70 <= c && c <= 0x11cbf) return ScriptType::marchen; + if (0x11d00 <= c && c <= 0x11d5f) return ScriptType::masaram_gondi; + if (0x11d60 <= c && c <= 0x11daf) return ScriptType::gunjala_gondi; + if (0x11ee0 <= c && c <= 0x11eff) return ScriptType::makasar; + if (0x11f00 <= c && c <= 0x11f5f) return ScriptType::kawi; + if (0x12000 <= c && c <= 0x123ff) return ScriptType::cuneiform; + if (0x12400 <= c && c <= 0x1247f) return ScriptType::cuneiform; + if (0x12480 <= c && c <= 0x1254f) return ScriptType::early_dynastic_cuneiform; + if (0x12f90 <= c && c <= 0x12fff) return ScriptType::cypro_minoan; + if (0x13000 <= c && c <= 0x1342f) return ScriptType::egyptian_hieroglyphs; + if (0x13430 <= c && c <= 0x1345f) return ScriptType::egyptian_hieroglyphs; + if (0x14400 <= c && c <= 0x1467f) return ScriptType::anatolian_hieroglyphs; + if (0x16a40 <= c && c <= 0x16a6f) return ScriptType::mro; + if (0x16a70 <= c && c <= 0x16acf) return ScriptType::tangsa; + if (0x16ad0 <= c && c <= 0x16aff) 
return ScriptType::bassa_vah; + if (0x16b00 <= c && c <= 0x16b8f) return ScriptType::pahawh_hmong; + if (0x16e40 <= c && c <= 0x16e9f) return ScriptType::medefaidrin; + if (0x16f00 <= c && c <= 0x16f9f) return ScriptType::miao; + if (0x16fe0 <= c && c <= 0x16fff) return ScriptType::ideographic_symbols_and_punctuation; + if (0x17000 <= c && c <= 0x187ff) return ScriptType::tangut; + if (0x18800 <= c && c <= 0x18aff) return ScriptType::tangut; + if (0x18d00 <= c && c <= 0x18d7f) return ScriptType::tangut; + if (0x18b00 <= c && c <= 0x18cff) return ScriptType::khitan_small_script; + if (0x1b170 <= c && c <= 0x1b2ff) return ScriptType::nushu; + if (0x1bc00 <= c && c <= 0x1bc9f) return ScriptType::duployan; + if (0x1bca0 <= c && c <= 0x1bcaf) return ScriptType::shorthand_format_controls; + if (0x1cf00 <= c && c <= 0x1cfcf) return ScriptType::znamenny_musical_notation; + if (0x1d000 <= c && c <= 0x1d0ff) return ScriptType::byzantine_musical_symbols; + if (0x1d100 <= c && c <= 0x1d1ff) return ScriptType::musical_symbols; + if (0x1d200 <= c && c <= 0x1d24f) return ScriptType::ancient_greek_musical_notation; + if (0x1d2c0 <= c && c <= 0x1d2df) return ScriptType::kaktovik_numerals; + if (0x1d2e0 <= c && c <= 0x1d2ff) return ScriptType::mayan_numerals; + if (0x1d300 <= c && c <= 0x1d35f) return ScriptType::tai_xuan_jing_symbols; + if (0x1d360 <= c && c <= 0x1d37f) return ScriptType::counting_rod_numerals; + if (0x1d400 <= c && c <= 0x1d7ff) return ScriptType::mathematical_alphanumeric_symbols; + if (0x1d800 <= c && c <= 0x1daaf) return ScriptType::sutton_signwriting; + if (0x1e100 <= c && c <= 0x1e14f) return ScriptType::nyiakeng_puachue_hmong; + if (0x1e290 <= c && c <= 0x1e2bf) return ScriptType::toto; + if (0x1e2c0 <= c && c <= 0x1e2ff) return ScriptType::wancho; + if (0x1e4d0 <= c && c <= 0x1e4ff) return ScriptType::nag_mundari; + if (0x1e800 <= c && c <= 0x1e8df) return ScriptType::mende_kikakui; + if (0x1e900 <= c && c <= 0x1e95f) return ScriptType::adlam; + if (0x1ec70 
<= c && c <= 0x1ecbf) return ScriptType::indic_siyaq_numbers; + if (0x1ed00 <= c && c <= 0x1ed4f) return ScriptType::ottoman_siyaq_numbers; + if (0x1ee00 <= c && c <= 0x1eeff) return ScriptType::arabic_mathematical_alphabetic_symbols; + if (0x1f000 <= c && c <= 0x1f02f) return ScriptType::mahjong_tiles; + if (0x1f030 <= c && c <= 0x1f09f) return ScriptType::domino_tiles; + if (0x1f0a0 <= c && c <= 0x1f0ff) return ScriptType::playing_cards; + if (0x1f200 <= c && c <= 0x1f2ff) return ScriptType::enclosed_ideographic_supplement; + if (0x1f300 <= c && c <= 0x1f5ff) return ScriptType::symbols_and_pictographs; + if (0x1f900 <= c && c <= 0x1f9ff) return ScriptType::symbols_and_pictographs; + if (0x1fa70 <= c && c <= 0x1faff) return ScriptType::symbols_and_pictographs; + if (0x1f600 <= c && c <= 0x1f64f) return ScriptType::emoticons; + if (0x1f680 <= c && c <= 0x1f6ff) return ScriptType::transport_and_map_symbols; + if (0x1f700 <= c && c <= 0x1f77f) return ScriptType::alchemical_symbols; + if (0x1fa00 <= c && c <= 0x1fa6f) return ScriptType::chess_symbols; + if (0x1fb00 <= c && c <= 0x1fbff) return ScriptType::symbols_for_legacy_computing; + if (0xe0000 <= c && c <= 0xe007f) return ScriptType::tags; + return ScriptType::unknown; + } + + const char* getScriptName(ScriptType type) + { + if (type == ScriptType::latin) return "Latin"; + if (type == ScriptType::ipa_extensions) return "IPA Extensions"; + if (type == ScriptType::spacing_modifier_letters) return "Spacing Modifier Letters"; + if (type == ScriptType::combining_diacritical_marks) return "Combining Diacritical Marks"; + if (type == ScriptType::greek_and_coptic) return "Greek and Coptic"; + if (type == ScriptType::cyrillic) return "Cyrillic"; + if (type == ScriptType::armenian) return "Armenian"; + if (type == ScriptType::hebrew) return "Hebrew"; + if (type == ScriptType::arabic) return "Arabic"; + if (type == ScriptType::syriac) return "Syriac"; + if (type == ScriptType::thaana) return "Thaana"; + if (type == 
ScriptType::nko) return "NKo"; + if (type == ScriptType::samaritan) return "Samaritan"; + if (type == ScriptType::mandaic) return "Mandaic"; + if (type == ScriptType::devanagari) return "Devanagari"; + if (type == ScriptType::bengali) return "Bengali"; + if (type == ScriptType::gurmukhi) return "Gurmukhi"; + if (type == ScriptType::gujarati) return "Gujarati"; + if (type == ScriptType::oriya) return "Oriya"; + if (type == ScriptType::tamil) return "Tamil"; + if (type == ScriptType::telugu) return "Telugu"; + if (type == ScriptType::kannada) return "Kannada"; + if (type == ScriptType::malayalam) return "Malayalam"; + if (type == ScriptType::sinhala) return "Sinhala"; + if (type == ScriptType::thai) return "Thai"; + if (type == ScriptType::lao) return "Lao"; + if (type == ScriptType::tibetan) return "Tibetan"; + if (type == ScriptType::myanmar) return "Myanmar"; + if (type == ScriptType::georgian) return "Georgian"; + if (type == ScriptType::hangul) return "Hangul"; + if (type == ScriptType::ethiopic) return "Ethiopic"; + if (type == ScriptType::cherokee) return "Cherokee"; + if (type == ScriptType::unified_canadian_aboriginal_syllabics) return "Unified Canadian Aboriginal Syllabics"; + if (type == ScriptType::ogham) return "Ogham"; + if (type == ScriptType::runic) return "Runic"; + if (type == ScriptType::tagalog) return "Tagalog"; + if (type == ScriptType::hanunoo) return "Hanunoo"; + if (type == ScriptType::buhid) return "Buhid"; + if (type == ScriptType::tagbanwa) return "Tagbanwa"; + if (type == ScriptType::khmer) return "Khmer"; + if (type == ScriptType::mongolian) return "Mongolian"; + if (type == ScriptType::limbu) return "Limbu"; + if (type == ScriptType::tai_le) return "Tai Le"; + if (type == ScriptType::new_tai_lue) return "New Tai Lue"; + if (type == ScriptType::khmer_symbols) return "Khmer Symbols"; + if (type == ScriptType::buginese) return "Buginese"; + if (type == ScriptType::tai_tham) return "Tai Tham"; + if (type == ScriptType::balinese) return 
"Balinese"; + if (type == ScriptType::sundanese) return "Sundanese"; + if (type == ScriptType::batak) return "Batak"; + if (type == ScriptType::lepcha) return "Lepcha"; + if (type == ScriptType::ol_chiki) return "Ol Chiki"; + if (type == ScriptType::phonetic_extensions) return "Phonetic Extensions"; + if (type == ScriptType::punctuation) return "Punctuation"; + if (type == ScriptType::superscripts_and_subscripts) return "Superscripts and Subscripts"; + if (type == ScriptType::currency_symbols) return "Currency Symbols"; + if (type == ScriptType::combining_diacritical_marks_for_symbols) return "Combining Diacritical Marks for Symbols"; + if (type == ScriptType::letterlike_symbols) return "Letterlike Symbols"; + if (type == ScriptType::number_forms) return "Number Forms"; + if (type == ScriptType::arrows) return "Arrows"; + if (type == ScriptType::mathematical) return "Mathematical"; + if (type == ScriptType::miscellaneous_technical) return "Miscellaneous Technical"; + if (type == ScriptType::control_pictures) return "Control Pictures"; + if (type == ScriptType::optical_character_recognition) return "Optical Character Recognition"; + if (type == ScriptType::enclosed_alphanumerics) return "Enclosed Alphanumerics"; + if (type == ScriptType::box_drawing) return "Box Drawing"; + if (type == ScriptType::block_elements) return "Block Elements"; + if (type == ScriptType::geometric_shapes) return "Geometric Shapes"; + if (type == ScriptType::miscellaneous_symbols) return "Miscellaneous Symbols"; + if (type == ScriptType::dingbats) return "Dingbats"; + if (type == ScriptType::braille_patterns) return "Braille Patterns"; + if (type == ScriptType::glagolitic) return "Glagolitic"; + if (type == ScriptType::tifinagh) return "Tifinagh"; + if (type == ScriptType::hanja) return "Hanja"; + if (type == ScriptType::ideographic_description_characters) return "Ideographic Description Characters"; + if (type == ScriptType::kana) return "Kana"; + if (type == ScriptType::bopomofo) return 
"Bopomofo"; + if (type == ScriptType::kanbun) return "Kanbun"; + if (type == ScriptType::yijing_hexagram_symbols) return "Yijing Hexagram Symbols"; + if (type == ScriptType::yi) return "Yi"; + if (type == ScriptType::lisu) return "Lisu"; + if (type == ScriptType::vai) return "Vai"; + if (type == ScriptType::bamum) return "Bamum"; + if (type == ScriptType::modifier_tone_letters) return "Modifier Tone Letters"; + if (type == ScriptType::syloti_nagri) return "Syloti Nagri"; + if (type == ScriptType::common_indic_number_forms) return "Common Indic Number Forms"; + if (type == ScriptType::phags_pa) return "Phags-pa"; + if (type == ScriptType::saurashtra) return "Saurashtra"; + if (type == ScriptType::kayah_li) return "Kayah Li"; + if (type == ScriptType::rejang) return "Rejang"; + if (type == ScriptType::javanese) return "Javanese"; + if (type == ScriptType::cham) return "Cham"; + if (type == ScriptType::tai_viet) return "Tai Viet"; + if (type == ScriptType::meetei_mayek) return "Meetei Mayek"; + if (type == ScriptType::private_use_area) return "Private Use Area"; + if (type == ScriptType::alphabetic_presentation_forms) return "Alphabetic Presentation Forms"; + if (type == ScriptType::arabic_presentation_forms_a) return "Arabic Presentation Forms-A"; + if (type == ScriptType::variation_selectors) return "Variation Selectors"; + if (type == ScriptType::vertical_forms) return "Vertical Forms"; + if (type == ScriptType::combining_half_marks) return "Combining Half Marks"; + if (type == ScriptType::small_form_variants) return "Small Form Variants"; + if (type == ScriptType::arabic_presentation_forms_b) return "Arabic Presentation Forms-B"; + if (type == ScriptType::halfwidth_and_fullwidth_forms) return "Halfwidth and Fullwidth Forms"; + if (type == ScriptType::specials) return "Specials"; + if (type == ScriptType::linear_b) return "Linear B"; + if (type == ScriptType::aegean_numbers) return "Aegean Numbers"; + if (type == ScriptType::ancient_greek_numbers) return "Ancient 
Greek Numbers"; + if (type == ScriptType::ancient_symbols) return "Ancient Symbols"; + if (type == ScriptType::phaistos_disc) return "Phaistos Disc"; + if (type == ScriptType::lycian) return "Lycian"; + if (type == ScriptType::carian) return "Carian"; + if (type == ScriptType::coptic_epact_numbers) return "Coptic Epact Numbers"; + if (type == ScriptType::old_italic) return "Old Italic"; + if (type == ScriptType::gothic) return "Gothic"; + if (type == ScriptType::old_permic) return "Old Permic"; + if (type == ScriptType::ugaritic) return "Ugaritic"; + if (type == ScriptType::old_persian) return "Old Persian"; + if (type == ScriptType::deseret) return "Deseret"; + if (type == ScriptType::shavian) return "Shavian"; + if (type == ScriptType::osmanya) return "Osmanya"; + if (type == ScriptType::osage) return "Osage"; + if (type == ScriptType::elbasan) return "Elbasan"; + if (type == ScriptType::caucasian_albanian) return "Caucasian Albanian"; + if (type == ScriptType::vithkuqi) return "Vithkuqi"; + if (type == ScriptType::linear_a) return "Linear A"; + if (type == ScriptType::cypriot_syllabary) return "Cypriot Syllabary"; + if (type == ScriptType::imperial_aramaic) return "Imperial Aramaic"; + if (type == ScriptType::palmyrene) return "Palmyrene"; + if (type == ScriptType::nabataean) return "Nabataean"; + if (type == ScriptType::hatran) return "Hatran"; + if (type == ScriptType::phoenician) return "Phoenician"; + if (type == ScriptType::lydian) return "Lydian"; + if (type == ScriptType::meroitic_hieroglyphs) return "Meroitic Hieroglyphs"; + if (type == ScriptType::meroitic_cursive) return "Meroitic Cursive"; + if (type == ScriptType::kharoshthi) return "Kharoshthi"; + if (type == ScriptType::old_south_arabian) return "Old South Arabian"; + if (type == ScriptType::old_north_arabian) return "Old North Arabian"; + if (type == ScriptType::manichaean) return "Manichaean"; + if (type == ScriptType::avestan) return "Avestan"; + if (type == ScriptType::inscriptional_parthian) 
return "Inscriptional Parthian"; + if (type == ScriptType::inscriptional_pahlavi) return "Inscriptional Pahlavi"; + if (type == ScriptType::psalter_pahlavi) return "Psalter Pahlavi"; + if (type == ScriptType::old_turkic) return "Old Turkic"; + if (type == ScriptType::old_hungarian) return "Old Hungarian"; + if (type == ScriptType::hanifi_rohingya) return "Hanifi Rohingya"; + if (type == ScriptType::rumi_numeral_symbols) return "Rumi Numeral Symbols"; + if (type == ScriptType::yezidi) return "Yezidi"; + if (type == ScriptType::old_sogdian) return "Old Sogdian"; + if (type == ScriptType::sogdian) return "Sogdian"; + if (type == ScriptType::old_uyghur) return "Old Uyghur"; + if (type == ScriptType::chorasmian) return "Chorasmian"; + if (type == ScriptType::elymaic) return "Elymaic"; + if (type == ScriptType::brahmi) return "Brahmi"; + if (type == ScriptType::kaithi) return "Kaithi"; + if (type == ScriptType::sora_sompeng) return "Sora Sompeng"; + if (type == ScriptType::chakma) return "Chakma"; + if (type == ScriptType::mahajani) return "Mahajani"; + if (type == ScriptType::sharada) return "Sharada"; + if (type == ScriptType::sinhala_archaic_numbers) return "Sinhala Archaic Numbers"; + if (type == ScriptType::khojki) return "Khojki"; + if (type == ScriptType::multani) return "Multani"; + if (type == ScriptType::khudawadi) return "Khudawadi"; + if (type == ScriptType::grantha) return "Grantha"; + if (type == ScriptType::newa) return "Newa"; + if (type == ScriptType::tirhuta) return "Tirhuta"; + if (type == ScriptType::siddham) return "Siddham"; + if (type == ScriptType::modi) return "Modi"; + if (type == ScriptType::takri) return "Takri"; + if (type == ScriptType::ahom) return "Ahom"; + if (type == ScriptType::dogra) return "Dogra"; + if (type == ScriptType::warang_citi) return "Warang Citi"; + if (type == ScriptType::dives_akuru) return "Dives Akuru"; + if (type == ScriptType::nandinagari) return "Nandinagari"; + if (type == ScriptType::zanabazar_square) return 
"Zanabazar Square"; + if (type == ScriptType::soyombo) return "Soyombo"; + if (type == ScriptType::pau_cin_hau) return "Pau Cin Hau"; + if (type == ScriptType::bhaiksuki) return "Bhaiksuki"; + if (type == ScriptType::marchen) return "Marchen"; + if (type == ScriptType::masaram_gondi) return "Masaram Gondi"; + if (type == ScriptType::gunjala_gondi) return "Gunjala Gondi"; + if (type == ScriptType::makasar) return "Makasar"; + if (type == ScriptType::kawi) return "Kawi"; + if (type == ScriptType::cuneiform) return "Cuneiform"; + if (type == ScriptType::early_dynastic_cuneiform) return "Early Dynastic Cuneiform"; + if (type == ScriptType::cypro_minoan) return "Cypro-Minoan"; + if (type == ScriptType::egyptian_hieroglyphs) return "Egyptian Hieroglyphs"; + if (type == ScriptType::anatolian_hieroglyphs) return "Anatolian Hieroglyphs"; + if (type == ScriptType::mro) return "Mro"; + if (type == ScriptType::tangsa) return "Tangsa"; + if (type == ScriptType::bassa_vah) return "Bassa Vah"; + if (type == ScriptType::pahawh_hmong) return "Pahawh Hmong"; + if (type == ScriptType::medefaidrin) return "Medefaidrin"; + if (type == ScriptType::miao) return "Miao"; + if (type == ScriptType::ideographic_symbols_and_punctuation) return "Ideographic Symbols and Punctuation"; + if (type == ScriptType::tangut) return "Tangut"; + if (type == ScriptType::khitan_small_script) return "Khitan Small Script"; + if (type == ScriptType::nushu) return "Nushu"; + if (type == ScriptType::duployan) return "Duployan"; + if (type == ScriptType::shorthand_format_controls) return "Shorthand Format Controls"; + if (type == ScriptType::znamenny_musical_notation) return "Znamenny Musical Notation"; + if (type == ScriptType::byzantine_musical_symbols) return "Byzantine Musical Symbols"; + if (type == ScriptType::musical_symbols) return "Musical Symbols"; + if (type == ScriptType::ancient_greek_musical_notation) return "Ancient Greek Musical Notation"; + if (type == ScriptType::kaktovik_numerals) return 
"Kaktovik Numerals"; + if (type == ScriptType::mayan_numerals) return "Mayan Numerals"; + if (type == ScriptType::tai_xuan_jing_symbols) return "Tai Xuan Jing Symbols"; + if (type == ScriptType::counting_rod_numerals) return "Counting Rod Numerals"; + if (type == ScriptType::mathematical_alphanumeric_symbols) return "Mathematical Alphanumeric Symbols"; + if (type == ScriptType::sutton_signwriting) return "Sutton SignWriting"; + if (type == ScriptType::nyiakeng_puachue_hmong) return "Nyiakeng Puachue Hmong"; + if (type == ScriptType::toto) return "Toto"; + if (type == ScriptType::wancho) return "Wancho"; + if (type == ScriptType::nag_mundari) return "Nag Mundari"; + if (type == ScriptType::mende_kikakui) return "Mende Kikakui"; + if (type == ScriptType::adlam) return "Adlam"; + if (type == ScriptType::indic_siyaq_numbers) return "Indic Siyaq Numbers"; + if (type == ScriptType::ottoman_siyaq_numbers) return "Ottoman Siyaq Numbers"; + if (type == ScriptType::arabic_mathematical_alphabetic_symbols) return "Arabic Mathematical Alphabetic Symbols"; + if (type == ScriptType::mahjong_tiles) return "Mahjong Tiles"; + if (type == ScriptType::domino_tiles) return "Domino Tiles"; + if (type == ScriptType::playing_cards) return "Playing Cards"; + if (type == ScriptType::enclosed_ideographic_supplement) return "Enclosed Ideographic Supplement"; + if (type == ScriptType::symbols_and_pictographs) return "Symbols and Pictographs"; + if (type == ScriptType::emoticons) return "Emoticons"; + if (type == ScriptType::transport_and_map_symbols) return "Transport and Map Symbols"; + if (type == ScriptType::alchemical_symbols) return "Alchemical Symbols"; + if (type == ScriptType::chess_symbols) return "Chess Symbols"; + if (type == ScriptType::symbols_for_legacy_computing) return "Symbols for Legacy Computing"; + if (type == ScriptType::tags) return "Tags"; + return "unknown"; + } +} diff --git a/vsproj/kiwi_shared_library.vcxproj b/vsproj/kiwi_shared_library.vcxproj index 
73e8effe..c70f01cc 100644 --- a/vsproj/kiwi_shared_library.vcxproj +++ b/vsproj/kiwi_shared_library.vcxproj @@ -40,6 +40,7 @@ + @@ -56,7 +57,6 @@ - @@ -115,10 +115,11 @@ - + + From 14b6fe7e665785e19fc33a4f82974637524ad1c5 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 13 May 2024 01:52:16 +0900 Subject: [PATCH 2/5] add test cases for `TokenInfo::script` --- test/test_cpp.cpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 2bc431b7..c1a75f5e 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -163,12 +163,43 @@ TEST(KiwiCpp, OldHangul) TEST(KiwiCpp, ChineseVsEmoji) { Kiwi& kiwi = reuseKiwiInstance(); - auto res = kiwi.analyze(u"韓𠀀𠀁𠀂𠀃🔥🤔🙃🐶", Match::allWithNormalizing).first; + auto res = kiwi.analyze(u"韓𠀀𠀁𠀂𠀃🔥🤔🐶", Match::allWithNormalizing).first; EXPECT_EQ(res.size(), 2); EXPECT_EQ(res[0].tag, POSTag::sh); EXPECT_EQ(res[1].tag, POSTag::sw); } +TEST(KiwiCpp, Script) +{ + Kiwi& kiwi = reuseKiwiInstance(); + auto res = kiwi.analyze(u"résumé", Match::allWithNormalizing).first; + EXPECT_EQ(res.size(), 1); + EXPECT_EQ(res[0].tag, POSTag::sl); + EXPECT_EQ(res[0].script, ScriptType::latin); + + res = kiwi.analyze(u"中国の歴史における", Match::allWithNormalizing).first; + EXPECT_EQ(res.size(), 4); + EXPECT_EQ(res[0].tag, POSTag::sh); + EXPECT_EQ(res[0].script, ScriptType::hanja); + EXPECT_EQ(res[1].tag, POSTag::sw); + EXPECT_EQ(res[1].script, ScriptType::kana); + EXPECT_EQ(res[2].tag, POSTag::sh); + EXPECT_EQ(res[2].script, ScriptType::hanja); + EXPECT_EQ(res[3].tag, POSTag::sw); + EXPECT_EQ(res[3].script, ScriptType::kana); + + res = kiwi.analyze(u"👍🏻👍🏿 👨‍👩‍👦", Match::allWithNormalizing).first; + EXPECT_EQ(res.size(), 2); + EXPECT_EQ(res[0].tag, POSTag::sw); + EXPECT_EQ(res[0].script, ScriptType::symbols_and_pictographs); + EXPECT_EQ(res[0].position, 0); + EXPECT_EQ(res[0].length, 8); + EXPECT_EQ(res[1].tag, POSTag::sw); + EXPECT_EQ(res[1].script, 
ScriptType::symbols_and_pictographs); + EXPECT_EQ(res[1].position, 9); + EXPECT_EQ(res[1].length, 8); +} + TEST(KiwiCpp, EmptyToken) { Kiwi& kiwi = reuseKiwiInstance(); From 0a05bad43831c47edbd3d2dd093f9c724333b2a7 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 13 May 2024 02:15:50 +0900 Subject: [PATCH 3/5] prevent splitting emoji modifiers --- src/KTrie.cpp | 5 ++++- test/test_cpp.cpp | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/KTrie.cpp b/src/KTrie.cpp index d02b16c7..bf719db8 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -848,7 +848,10 @@ size_t kiwi::splitByTrie( chrType = identifySpecialChr(c32); scriptType = chr2ScriptType(c32); - if (lastChrType == POSTag::sw && c32 == 0x200d) // zero width joiner + if (lastChrType == POSTag::sw && + (c32 == 0x200d || // zero width joiner + (0x1f3fb <= c32 && c32 <= 0x1f3ff) || // skin color modifier + scriptType == ScriptType::variation_selectors)) // variation selectors { chrType = lastChrType; scriptType = lastScriptType; diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index c1a75f5e..9e008adb 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -188,8 +188,8 @@ TEST(KiwiCpp, Script) EXPECT_EQ(res[3].tag, POSTag::sw); EXPECT_EQ(res[3].script, ScriptType::kana); - res = kiwi.analyze(u"👍🏻👍🏿 👨‍👩‍👦", Match::allWithNormalizing).first; - EXPECT_EQ(res.size(), 2); + res = kiwi.analyze(u"👍🏻👍🏿 👨‍👩‍👦 ℹ️ ✍🏼", Match::allWithNormalizing).first; + EXPECT_EQ(res.size(), 4); EXPECT_EQ(res[0].tag, POSTag::sw); EXPECT_EQ(res[0].script, ScriptType::symbols_and_pictographs); EXPECT_EQ(res[0].position, 0); @@ -198,6 +198,10 @@ TEST(KiwiCpp, Script) EXPECT_EQ(res[1].script, ScriptType::symbols_and_pictographs); EXPECT_EQ(res[1].position, 9); EXPECT_EQ(res[1].length, 8); + EXPECT_EQ(res[2].tag, POSTag::sw); + EXPECT_EQ(res[2].script, ScriptType::letterlike_symbols); + EXPECT_EQ(res[3].tag, POSTag::sw); + EXPECT_EQ(res[3].script, ScriptType::dingbats); } TEST(KiwiCpp, EmptyToken) From 
namespace
{
    /// Inclusive code point range [first, last].
    struct EmojiRange
    {
        char32_t first;
        char32_t last;
    };

    /**
     * Binary search for `c` in `ranges`.
     *
     * @param ranges table sorted ascending by `first`, with non-overlapping entries
     * @param count  number of entries in `ranges`
     * @param c      code point to look up
     * @return true if some entry satisfies first <= c <= last
     */
    inline bool inEmojiRanges(const EmojiRange* ranges, unsigned count, char32_t c)
    {
        unsigned lo = 0;
        unsigned hi = count;
        while (lo < hi)
        {
            const unsigned mid = lo + (hi - lo) / 2;
            if (c < ranges[mid].first) hi = mid;
            else if (c > ranges[mid].last) lo = mid + 1;
            else return true;
        }
        return false;
    }
}

/**
 * Tells whether a code point (optionally followed by a variation selector)
 * is treated as an emoji.
 *
 * Two tables are consulted:
 *  - `standalone`: code points accepted regardless of `c1`.
 *  - `withSelector`: code points accepted only when `c1` is U+FE0F
 *    (VARIATION SELECTOR-16).
 *
 * Both tables MUST stay sorted ascending and non-overlapping, because the
 * lookup is a binary search.
 *
 * @param c0 code point to classify
 * @param c1 code point that follows `c0` in the text (0 when there is none)
 * @return true if `c0` is in `standalone`, or if `c1 == 0xfe0f` and `c0` is
 *         in `withSelector`; false otherwise
 */
bool isEmoji(char32_t c0, char32_t c1)
{
    static constexpr EmojiRange standalone[] = {
        { 0x1f004, 0x1f004 }, { 0x1f0cf, 0x1f0cf }, { 0x1f18e, 0x1f18e }, { 0x1f191, 0x1f19a },
        { 0x1f201, 0x1f201 }, { 0x1f21a, 0x1f21a }, { 0x1f22f, 0x1f22f }, { 0x1f232, 0x1f236 },
        { 0x1f238, 0x1f23a }, { 0x1f250, 0x1f251 }, { 0x1f300, 0x1f320 }, { 0x1f32d, 0x1f335 },
        { 0x1f337, 0x1f37c }, { 0x1f37e, 0x1f393 }, { 0x1f3a0, 0x1f3ca }, { 0x1f3cf, 0x1f3d3 },
        { 0x1f3e0, 0x1f3f0 }, { 0x1f3f4, 0x1f3f4 }, { 0x1f3f8, 0x1f43e }, { 0x1f440, 0x1f440 },
        { 0x1f442, 0x1f4fc }, { 0x1f4ff, 0x1f53d }, { 0x1f54b, 0x1f54e }, { 0x1f550, 0x1f567 },
        { 0x1f57a, 0x1f57a }, { 0x1f595, 0x1f596 }, { 0x1f5a4, 0x1f5a4 }, { 0x1f5fb, 0x1f64f },
        { 0x1f680, 0x1f6c5 }, { 0x1f6cc, 0x1f6cc }, { 0x1f6d0, 0x1f6d2 }, { 0x1f6d5, 0x1f6d7 },
        { 0x1f6dc, 0x1f6df }, { 0x1f6eb, 0x1f6ec }, { 0x1f6f4, 0x1f6fc }, { 0x1f7e0, 0x1f7eb },
        { 0x1f7f0, 0x1f7f0 }, { 0x1f90c, 0x1f93a }, { 0x1f93c, 0x1f945 }, { 0x1f947, 0x1f9ff },
        { 0x1fa70, 0x1fa7c }, { 0x1fa80, 0x1fa88 }, { 0x1fa90, 0x1fabd }, { 0x1fabf, 0x1fac5 },
        { 0x1face, 0x1fadb }, { 0x1fae0, 0x1fae8 }, { 0x1faf0, 0x1faf8 },
    };

    static constexpr EmojiRange withSelector[] = {
        { 0xa9, 0xa9 }, { 0xae, 0xae }, { 0x203c, 0x203c }, { 0x2049, 0x2049 },
        { 0x2122, 0x2122 }, { 0x2139, 0x2139 }, { 0x2194, 0x2199 }, { 0x21a9, 0x21aa },
        { 0x231a, 0x231b }, { 0x2328, 0x2328 }, { 0x23cf, 0x23cf }, { 0x23e9, 0x23f3 },
        { 0x23f8, 0x23fa }, { 0x24c2, 0x24c2 }, { 0x25aa, 0x25ab }, { 0x25b6, 0x25b6 },
        { 0x25c0, 0x25c0 }, { 0x25fb, 0x25fe }, { 0x2600, 0x2604 }, { 0x260e, 0x260e },
        { 0x2611, 0x2611 }, { 0x2614, 0x2615 }, { 0x2618, 0x2618 }, { 0x261d, 0x261d },
        { 0x2620, 0x2620 }, { 0x2622, 0x2623 }, { 0x2626, 0x2626 }, { 0x262a, 0x262a },
        { 0x262e, 0x262f }, { 0x2638, 0x263a }, { 0x2640, 0x2640 }, { 0x2642, 0x2642 },
        { 0x2648, 0x2653 }, { 0x265f, 0x2660 }, { 0x2663, 0x2663 }, { 0x2665, 0x2666 },
        { 0x2668, 0x2668 }, { 0x267b, 0x267b }, { 0x267e, 0x267f }, { 0x2692, 0x2697 },
        { 0x2699, 0x2699 }, { 0x269b, 0x269c }, { 0x26a0, 0x26a1 }, { 0x26a7, 0x26a7 },
        { 0x26aa, 0x26ab }, { 0x26b0, 0x26b1 }, { 0x26bd, 0x26be }, { 0x26c4, 0x26c5 },
        { 0x26c8, 0x26c8 }, { 0x26ce, 0x26cf }, { 0x26d1, 0x26d1 }, { 0x26d3, 0x26d4 },
        { 0x26e9, 0x26ea }, { 0x26f0, 0x26f5 }, { 0x26f7, 0x26fa }, { 0x26fd, 0x26fd },
        { 0x2702, 0x2702 }, { 0x2705, 0x2705 }, { 0x2708, 0x270d }, { 0x270f, 0x270f },
        { 0x2712, 0x2712 }, { 0x2714, 0x2714 }, { 0x2716, 0x2716 }, { 0x271d, 0x271d },
        { 0x2721, 0x2721 }, { 0x2728, 0x2728 }, { 0x2733, 0x2734 }, { 0x2744, 0x2744 },
        { 0x2747, 0x2747 }, { 0x274c, 0x274c }, { 0x274e, 0x274e }, { 0x2753, 0x2755 },
        { 0x2757, 0x2757 }, { 0x2763, 0x2764 }, { 0x2795, 0x2797 }, { 0x27a1, 0x27a1 },
        { 0x27b0, 0x27b0 }, { 0x27bf, 0x27bf }, { 0x2934, 0x2935 }, { 0x2b05, 0x2b07 },
        { 0x2b1b, 0x2b1c }, { 0x2b50, 0x2b50 }, { 0x2b55, 0x2b55 }, { 0x3030, 0x3030 },
        { 0x303d, 0x303d }, { 0x3297, 0x3297 }, { 0x3299, 0x3299 }, { 0x1f170, 0x1f171 },
        { 0x1f17e, 0x1f17f }, { 0x1f202, 0x1f202 }, { 0x1f237, 0x1f237 }, { 0x1f321, 0x1f321 },
        { 0x1f324, 0x1f32c }, { 0x1f336, 0x1f336 }, { 0x1f37d, 0x1f37d }, { 0x1f396, 0x1f397 },
        { 0x1f399, 0x1f39b }, { 0x1f39e, 0x1f39f }, { 0x1f3cb, 0x1f3ce }, { 0x1f3d4, 0x1f3df },
        { 0x1f3f3, 0x1f3f3 }, { 0x1f3f5, 0x1f3f5 }, { 0x1f3f7, 0x1f3f7 }, { 0x1f43f, 0x1f43f },
        { 0x1f441, 0x1f441 }, { 0x1f4fd, 0x1f4fd }, { 0x1f549, 0x1f54a }, { 0x1f56f, 0x1f570 },
        { 0x1f573, 0x1f579 }, { 0x1f587, 0x1f587 }, { 0x1f58a, 0x1f58d }, { 0x1f590, 0x1f590 },
        { 0x1f5a5, 0x1f5a5 }, { 0x1f5a8, 0x1f5a8 }, { 0x1f5b1, 0x1f5b2 }, { 0x1f5bc, 0x1f5bc },
        { 0x1f5c2, 0x1f5c4 }, { 0x1f5d1, 0x1f5d3 }, { 0x1f5dc, 0x1f5de }, { 0x1f5e1, 0x1f5e1 },
        { 0x1f5e3, 0x1f5e3 }, { 0x1f5e8, 0x1f5e8 }, { 0x1f5ef, 0x1f5ef }, { 0x1f5f3, 0x1f5f3 },
        { 0x1f5fa, 0x1f5fa }, { 0x1f6cb, 0x1f6cb }, { 0x1f6cd, 0x1f6cf }, { 0x1f6e0, 0x1f6e5 },
        { 0x1f6e9, 0x1f6e9 }, { 0x1f6f0, 0x1f6f0 }, { 0x1f6f3, 0x1f6f3 },
    };

    const unsigned standaloneCount = sizeof(standalone) / sizeof(standalone[0]);
    const unsigned withSelectorCount = sizeof(withSelector) / sizeof(withSelector[0]);

    if (inEmojiRanges(standalone, standaloneCount, c0)) return true;

    // Everything else only counts as an emoji when followed by U+FE0F.
    if (c1 != 0xfe0f) return false;
    return inEmojiRanges(withSelector, withSelectorCount, c0);
}