Skip to content

Commit

Permalink
Merge pull request #159 from bab2min/dev_continual_writing
Browse files Browse the repository at this point in the history
연철에 대한 오타 교정 추가
  • Loading branch information
bab2min authored Apr 13, 2024
2 parents 707e51b + 4d4a5e3 commit c62b968
Show file tree
Hide file tree
Showing 20 changed files with 1,136 additions and 268 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.12)

project(kiwi VERSION 0.17.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
project(kiwi VERSION 0.17.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")

set ( CMAKE_CXX_STANDARD 14 )
set ( CMAKE_VERBOSE_MAKEFILE true )
Expand Down
3 changes: 2 additions & 1 deletion bindings/java/kiwi_java.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,8 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)

jni::define<JTypoTransformer>()
.template ctor<>()
.template method<&JTypoTransformer::addTypo>("_addTypo"),
.template method<&JTypoTransformer::addTypo>("_addTypo")
.template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost"),

jni::define<JKiwiBuilder>()
.template ctor<std::string, size_t, kiwi::BuildOption, bool>()
Expand Down
2 changes: 1 addition & 1 deletion bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

public class Kiwi implements AutoCloseable {
private long _inst;
final private static String _version = "0.17.0";
final private static String _version = "0.17.1";

public static class Match {
final static public int none = 0,
Expand Down
123 changes: 123 additions & 0 deletions bindings/java/kr/pe/bab2min/KiwiBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public boolean isAlive() {
@Override
public native void close() throws Exception;
public native void _addTypo(String orig, String error, float cost, byte convVowel);
public native void _setContinualTypoCost(float cost);

public TypoTransformer addTypo(String orig, String error, float cost, byte convVowel) {
_addTypo(orig, error, cost, convVowel);
Expand All @@ -77,6 +78,11 @@ public TypoTransformer addTypo(String[] orig, String[] error, float cost, byte c
}
return this;
}

public TypoTransformer setContinualTypoCost(float cost) {
_setContinualTypoCost(cost);
return this;
}
}

public KiwiBuilder(long _inst) {
Expand Down Expand Up @@ -219,4 +225,121 @@ public Kiwi build(TypoTransformer typos) {
.addTypo(new String[]{"을", "를"}, new String[]{"을", "를"}, 2.f, CondVowel.none)

.addTypo(new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, 1.5f, CondVowel.none);

final public static TypoTransformer continualTypoSet = new TypoTransformer()
.setContinualTypoCost(1.f)
.addTypo(new String[]{"ᆪ"}, new String[]{"ᆨᆺ", "ᆨᆻ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆬ"}, new String[]{"ᆫᆽ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆭ"}, new String[]{"ᆫᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆰ"}, new String[]{"ᆯᆨ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆱ"}, new String[]{"ᆯᆷ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆲ"}, new String[]{"ᆯᆸ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆳ"}, new String[]{"ᆯᆺ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);

final public static TypoTransformer basicTypoSetWithContinual = new TypoTransformer()
.addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅐ", "ㅔ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅒ", "ㅖ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅐ", "ㅔ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅒ", "ㅖ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅚ", "ㅙ", "ㅞ"}, new String[]{"ㅚ", "ㅙ", "ㅞ", "ㅐ", "ㅔ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅝ"}, new String[]{"ㅗ", "ㅓ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅟ", "ㅢ"}, new String[]{"ㅣ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"위", "의"}, new String[]{"이"}, Float.POSITIVE_INFINITY, CondVowel.none)
.addTypo(new String[]{"위", "의"}, new String[]{"이"}, 1.f, CondVowel.any)
.addTypo(new String[]{"자", "쟈"}, new String[]{"자", "쟈"}, 1.f, CondVowel.none)
.addTypo(new String[]{"재", "쟤"}, new String[]{"재", "쟤"}, 1.f, CondVowel.none)
.addTypo(new String[]{"저", "져"}, new String[]{"저", "져"}, 1.f, CondVowel.none)
.addTypo(new String[]{"제", "졔"}, new String[]{"제", "졔"}, 1.f, CondVowel.none)
.addTypo(new String[]{"조", "죠", "줘"}, new String[]{"조", "죠", "줘"}, 1.f, CondVowel.none)
.addTypo(new String[]{"주", "쥬"}, new String[]{"주", "쥬"}, 1.f, CondVowel.none)
.addTypo(new String[]{"차", "챠"}, new String[]{"차", "챠"}, 1.f, CondVowel.none)
.addTypo(new String[]{"채", "챼"}, new String[]{"채", "챼"}, 1.f, CondVowel.none)
.addTypo(new String[]{"처", "쳐"}, new String[]{"처", "쳐"}, 1.f, CondVowel.none)
.addTypo(new String[]{"체", "쳬"}, new String[]{"체", "쳬"}, 1.f, CondVowel.none)
.addTypo(new String[]{"초", "쵸", "춰"}, new String[]{"초", "쵸", "춰"}, 1.f, CondVowel.none)
.addTypo(new String[]{"추", "츄"}, new String[]{"추", "츄"}, 1.f, CondVowel.none)
.addTypo(new String[]{"유", "류"}, new String[]{"유", "류"}, 1.f, CondVowel.none)
.addTypo(new String[]{"므", "무"}, new String[]{"므", "무"}, 1.f, CondVowel.none)
.addTypo(new String[]{"브", "부"}, new String[]{"브", "부"}, 1.f, CondVowel.none)
.addTypo(new String[]{"프", "푸"}, new String[]{"프", "푸"}, 1.f, CondVowel.none)
.addTypo(new String[]{"르", "루"}, new String[]{"르", "루"}, 1.f, CondVowel.none)
.addTypo(new String[]{"러", "뤄"}, new String[]{"러", "뤄"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆩ", "ᆪ"}, new String[]{"ᆨ", "ᆩ", "ᆪ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆬ", "ᆭ"}, new String[]{"ᆫ", "ᆬ", "ᆭ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, new String[]{"ᆯ", "ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆺ", "ᆻ"}, new String[]{"ᆺ", "ᆻ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"안"}, new String[]{"않"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"맞추", "맞히"}, new String[]{"맞추", "맞히"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"맞춰", "맞혀"}, new String[]{"맞춰", "맞혀"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"받치", "바치", "받히"}, new String[]{"받치", "바치", "받히"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"받쳐", "바쳐", "받혀"}, new String[]{"받쳐", "바쳐", "받혀"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"던", "든"}, new String[]{"던", "든"}, 1.f, CondVowel.none)
.addTypo(new String[]{"때", "데"}, new String[]{"때", "데"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"빛", "빚"}, new String[]{"빛", "빚"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆮ이", "지"}, new String[]{"ᆮ이", "지"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮ여", "져"}, new String[]{"ᆮ여", "져"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᇀ이", "치"}, new String[]{"ᇀ이", "치"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᇀ여", "쳐"}, new String[]{"ᇀ여", "쳐"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᄀ", "ᄁ"}, new String[]{"ᄀ", "ᄁ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄃ", "ᄄ"}, new String[]{"ᄃ", "ᄄ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄇ", "ᄈ"}, new String[]{"ᄇ", "ᄈ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄉ", "ᄊ"}, new String[]{"ᄉ", "ᄊ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄌ", "ᄍ"}, new String[]{"ᄌ", "ᄍ"}, 1.f, CondVowel.applosive)

.addTypo(new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆨᄋ", "ᄀ"}, new String[]{"ᆨᄋ", "ᄀ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆩᄋ", "ᄁ"}, new String[]{"ᆩᄋ", "ᄁ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆭᄋ", "ᄂ"}, new String[]{"ᆭᄋ", "ᄂ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆮᄋ", "ᄃ"}, new String[]{"ᆮᄋ", "ᄃ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆰᄋ", "ᆯᄀ"}, new String[]{"ᆰᄋ", "ᆯᄀ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆰᄒ", "ᆯᄏ"}, new String[]{"ᆰᄒ", "ᆯᄏ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆷᄋ", "ᄆ"}, new String[]{"ᆷᄋ", "ᄆ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆸᄋ", "ᄇ"}, new String[]{"ᆸᄋ", "ᄇ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆺᄋ", "ᄉ"}, new String[]{"ᆺᄋ", "ᄉ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆽᄋ", "ᄌ"}, new String[]{"ᆽᄋ", "ᄌ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, 1.f, CondVowel.vowel)

.addTypo(new String[]{"은", "는"}, new String[]{"은", "는"}, 2.f, CondVowel.none)
.addTypo(new String[]{"을", "를"}, new String[]{"을", "를"}, 2.f, CondVowel.none)

.addTypo(new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, 1.5f, CondVowel.none)
.setContinualTypoCost(1.f)
.addTypo(new String[]{"ᆪ"}, new String[]{"ᆨᆺ", "ᆨᆻ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆬ"}, new String[]{"ᆫᆽ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆭ"}, new String[]{"ᆫᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆰ"}, new String[]{"ᆯᆨ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆱ"}, new String[]{"ᆯᆷ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆲ"}, new String[]{"ᆯᆸ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆳ"}, new String[]{"ᆯᆺ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);

}
32 changes: 32 additions & 0 deletions bindings/java/kr/pe/bab2min/KiwiTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,38 @@ public void testTypos() throws Exception {
assertEquals(tokens[5].form, "어");
}

@Test
public void testContinualTypos() throws Exception {
System.gc();
KiwiBuilder builder = new KiwiBuilder(modelPath);
Kiwi kiwi = builder.build(KiwiBuilder.continualTypoSet);

Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", Kiwi.Match.allWithNormalizing);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "프로그램");
assertEquals(tokens[1].form, "이");

tokens = kiwi.tokenize("프로그래믈", Kiwi.Match.allWithNormalizing);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "프로그램");
assertEquals(tokens[1].form, "을");

tokens = kiwi.tokenize("오늘사무시레서", Kiwi.Match.allWithNormalizing);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[1].form, "사무실");
assertEquals(tokens[2].form, "에서");

tokens = kiwi.tokenize("법원이 기가캤다.", Kiwi.Match.allWithNormalizing);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[2].form, "기각");
assertEquals(tokens[3].form, "하");

tokens = kiwi.tokenize("하나도 업써.", Kiwi.Match.allWithNormalizing);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[2].form, "없");
assertEquals(tokens[3].form, "어");
}

@Test
public void testBlocklist() throws Exception {
System.gc();
Expand Down
Loading

0 comments on commit c62b968

Please sign in to comment.