Skip to content

Commit

Permalink
Merge pull request #3 from yj-jtakagi/bugfix
Browse files Browse the repository at this point in the history
fix bugs
  • Loading branch information
Masakazu Nagaya authored Sep 12, 2019
2 parents 9c4063d + 6d980ca commit aa4e016
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
target/
.idea/
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Changelog

## [1.0.0] - 2019-09-11
### Fixed
- 一部の記号がトークナイズの際にエラー原因になる問題の修正
- 元号などの合字が正規化前に Vespa によって取り除かれる問題の修正
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

<groupId>jp.co.yahoo.vespa</groupId>
<artifactId>kuromoji-linguistics</artifactId>
<version>0.0.2-SNAPSHOT</version>
<version>1.0.0</version>
<packaging>container-plugin</packaging>

<name>kuromoji-linguistics</name>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package jp.co.yahoo.vespa.language.lib.kuromoji;

import com.yahoo.language.process.CharacterClasses;

import java.text.Normalizer;

/**
 * Character classification used by the Kuromoji linguistics component.
 *
 * <p>Extends the default {@link CharacterClasses} so that code points of type
 * {@link Character#OTHER_SYMBOL} whose NFKC normalization starts with a letter or digit
 * (for example the era ligature U+337B, the unit ligature U+3349 or the circled ideograph U+3291)
 * are reported as letters. Symbols that normalize to themselves (for example U+00A9 or U+3012)
 * are still reported as non-letters.
 */
public class KuromojiCharacterClasses extends CharacterClasses {

    /** First code point of the contiguous run of CJK brackets that are treated as word characters. */
    private static final int CJK_BRACKET_FIRST = 0x3008;

    /** Last code point of the contiguous run of CJK brackets that are treated as word characters. */
    private static final int CJK_BRACKET_LAST = 0x3011;

    /**
     * Returns whether the given code point should be treated as a letter.
     *
     * @param c the Unicode code point to classify (may be a supplementary code point)
     * @return {@code true} if the code point is a letter, a non-Latin digit, one of the CJK brackets
     *         U+3008..U+3011, a combining/enclosing mark, or an {@code OTHER_SYMBOL} whose NFKC form
     *         differs from the input and starts with a letter or digit; {@code false} otherwise
     */
    @Override
    public boolean isLetter(int c) {
        // The checks before the OTHER_SYMBOL branch are based on the upstream implementation, see
        // https://github.com/vespa-engine/vespa/blob/master/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
        if (Character.isLetter(c)) {
            return true;
        }
        if (Character.isDigit(c) && !isLatin(c)) {
            return true;
        }
        // U+3008..U+3011 is exactly the set of brackets the original code listed one by one.
        if (c >= CJK_BRACKET_FIRST && c <= CJK_BRACKET_LAST) {
            return true;
        }

        int type = Character.getType(c);
        if (type == Character.NON_SPACING_MARK
                || type == Character.COMBINING_SPACING_MARK
                || type == Character.ENCLOSING_MARK) {
            return true;
        }
        if (type == Character.OTHER_SYMBOL) {
            // OTHER_SYMBOL contains Gengo (era) ligatures, letters enclosed within a circle, etc.
            // Build the string from the full code point: casting to char would truncate
            // supplementary code points (above U+FFFF) and normalize an unrelated character.
            String norm = Normalizer.normalize(new String(Character.toChars(c)), Normalizer.Form.NFKC);
            int first = norm.codePointAt(0);
            if (first == c) {
                // Normalization did not change the symbol, so it has no letter/digit expansion.
                return false;
            }
            return isLetterOrDigit(first);
        }
        return false;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ public class KuromojiLinguistics implements Linguistics {

private static final Logger logger = Logger.getLogger(KuromojiLinguistics.class.getName());

private SimpleLinguistics simpleLinguistics = new SimpleLinguistics();
private final SimpleLinguistics simpleLinguistics = new SimpleLinguistics();
private final CharacterClasses characterClasses = new KuromojiCharacterClasses();
private final GramSplitter gramSplitter = new GramSplitter(characterClasses);

private KuromojiContext context;
private Tokenizer tokenizer;
Expand Down Expand Up @@ -129,14 +131,14 @@ public Detector getDetector() {
*/
@Override
public GramSplitter getGramSplitter() {
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped. The first return
// appears to be the removed line and the second its replacement, which returns the
// gramSplitter field built from this class's own CharacterClasses - confirm against the raw diff.
return simpleLinguistics.getGramSplitter();
return gramSplitter;
}

/**
* {@inheritDoc}
*/
@Override
public CharacterClasses getCharacterClasses() {
// NOTE(review): this is a rendered diff hunk with the +/- markers stripped. The first return
// appears to be the removed line and the second its replacement, which returns the
// characterClasses field instead of delegating to simpleLinguistics - confirm against the raw diff.
return simpleLinguistics.getCharacterClasses();
return characterClasses;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ public Iterable<Token> tokenize(String input, Language language, StemMode stemMo
String orig = getOrig(t, input, normOffset, result);
String tokenString = processToken(t, language, stemMode, removeAccents);

if (tokenString.isEmpty()) {
continue;
}

// @formatter:off
tokens.add(new KuromojiToken.Builder(orig).tokenString(tokenString)
// XXX: we only consider head character
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package jp.co.yahoo.vespa.language.lib.kuromoji;

import com.yahoo.language.lib.kuromoji.KuromojiConfig;
import com.yahoo.language.process.GramSplitter;
import com.yahoo.vespa.configdefinition.SpecialtokensConfig;
import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;

/**
 * Tests n-gram splitting with the {@link GramSplitter} obtained from {@link KuromojiLinguistics},
 * using input that contains ligature symbols, whitespace and a plain symbol.
 */
public class GramSplitterTest {
    private static final GramSplitter gramSplitter = createGramSplitter();

    /**
     * Creates the splitter under test from a default-configured {@link KuromojiLinguistics}.
     *
     * @return the gram splitter exposed by the linguistics instance
     * @throws IllegalStateException if the linguistics instance cannot be created
     */
    private static GramSplitter createGramSplitter() {
        try {
            return new KuromojiLinguistics(new KuromojiConfig(new KuromojiConfig.Builder()),
                    new SpecialtokensConfig(new SpecialtokensConfig.Builder())).getGramSplitter();
        } catch (IOException e) {
            // Fail loudly with the real cause; swallowing it would leave the splitter null and
            // every test would fail later with an unrelated NullPointerException.
            throw new IllegalStateException("Failed to create KuromojiLinguistics for GramSplitterTest", e);
        }
    }

    /** Ligature symbols take part in the grams like ordinary letters. */
    @Test
    public void testWithSymbols() {
        assertGramSplit("㍻最後の㍉㌔", 2, "[㍻最, 最後, 後の, の㍉, ㍉㌔]");
    }

    /** Whitespace separates the input, so no gram spans it. */
    @Test
    public void testWithWhiteSpace() {
        assertGramSplit("㍻最後 の ㍉㌔", 2, "[㍻最, 最後, の, ㍉㌔]");
    }

    /** The copyright sign is not part of any gram and no gram spans it. */
    @Test
    public void testWithIgnoreSymbol() {
        assertGramSplit("㍻最後の©㍉㌔", 2, "[㍻最, 最後, 後の, ㍉㌔]");
    }

    /**
     * Splits {@code input} into grams of {@code gramSize} and compares the extracted list's
     * string form with {@code expected}.
     */
    private void assertGramSplit(String input, int gramSize, String expected) {
        // JUnit's contract is assertEquals(expected, actual); the order matters for failure messages.
        assertEquals(expected, gramSplitter.split(input, gramSize).toExtractedList().toString());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package jp.co.yahoo.vespa.language.lib.kuromoji;

import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * Tests {@link KuromojiCharacterClasses#isLetter(int)}: ligature-like symbols must be letters,
 * while the ASCII and full-width characters below must not be.
 *
 * <p>Each non-letter test pairs an ASCII character with its full-width counterpart. For the
 * reason these characters must not be letters, please see
 * https://github.com/vespa-engine/vespa/blob/master/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
 */
public class KuromojiCharacterClassesTest {

    // JUnit creates a new test-class instance per test method, so each test still gets its own object.
    private final KuromojiCharacterClasses characterClasses = new KuromojiCharacterClasses();

    @Test
    public void testSpecialChars() {
        // is letter
        assertTrue(characterClasses.isLetter('㍻'));
        assertTrue(characterClasses.isLetter('㍉'));
        assertTrue(characterClasses.isLetter('㊑'));
        // isLetterOrDigit is also true
        assertTrue(characterClasses.isLetterOrDigit('㍻'));
        assertTrue(characterClasses.isLetterOrDigit('㍉'));
        // is noise
        assertFalse(characterClasses.isLetter('〒'));
        assertFalse(characterClasses.isLetter('©'));
    }

    @Test
    public void testQuotes() {
        // quote is not letter
        assertFalse(characterClasses.isLetter('"'));
        assertFalse(characterClasses.isLetter('\u201C'));
        assertFalse(characterClasses.isLetter('\u201D'));
        assertFalse(characterClasses.isLetter('\u201E'));
        assertFalse(characterClasses.isLetter('\u201F'));
        assertFalse(characterClasses.isLetter('\u2039'));
        assertFalse(characterClasses.isLetter('\u203A'));
        assertFalse(characterClasses.isLetter('\u00AB'));
        assertFalse(characterClasses.isLetter('\u00BB'));
        assertFalse(characterClasses.isLetter('\u301D'));
        assertFalse(characterClasses.isLetter('\u301E'));
        assertFalse(characterClasses.isLetter('\u301F'));
        assertFalse(characterClasses.isLetter('\uFF02'));
    }

    @Test
    public void testSigns() {
        // +- is not letter
        assertFalse(characterClasses.isLetter('-'));
        assertFalse(characterClasses.isLetter('\uFF0D'));
        assertFalse(characterClasses.isLetter('+'));
        assertFalse(characterClasses.isLetter('\uFF0B'));
    }

    @Test
    public void testPunctuations() {
        // .,:; is not letter
        assertFalse(characterClasses.isLetter('.'));
        assertFalse(characterClasses.isLetter('\uFF0E'));
        assertFalse(characterClasses.isLetter(','));
        assertFalse(characterClasses.isLetter('\uFF0C'));
        assertFalse(characterClasses.isLetter(':'));
        assertFalse(characterClasses.isLetter('\uFF1A'));
        assertFalse(characterClasses.isLetter(';'));
        // U+FF1B is the full-width semicolon (the test previously used U+FF1E, full-width '>').
        assertFalse(characterClasses.isLetter('\uFF1B'));
    }

    @Test
    public void testBrace() {
        // () is not letter
        assertFalse(characterClasses.isLetter('('));
        assertFalse(characterClasses.isLetter('\uFF08'));
        assertFalse(characterClasses.isLetter(')'));
        assertFalse(characterClasses.isLetter('\uFF09'));
    }

    @Test
    public void testBracket() {
        // [] is not letter
        assertFalse(characterClasses.isLetter('['));
        // U+FF3B is the full-width '[' (previously untested; U+FF3D was used here instead).
        assertFalse(characterClasses.isLetter('\uFF3B'));
        assertFalse(characterClasses.isLetter(']'));
        // U+FF3D is the full-width ']' (the test previously used U+FF1B, full-width ';').
        assertFalse(characterClasses.isLetter('\uFF3D'));
    }

    @Test
    public void testGraterSmaller() {
        // <> is not letter
        assertFalse(characterClasses.isLetter('<'));
        assertFalse(characterClasses.isLetter('\uFF1C'));
        assertFalse(characterClasses.isLetter('>'));
        assertFalse(characterClasses.isLetter('\uFF1E'));
    }

    @Test
    public void testSimbols() {
        // !_^*$ is not letter
        assertFalse(characterClasses.isLetter('!'));
        assertFalse(characterClasses.isLetter('\uFF01'));
        assertFalse(characterClasses.isLetter('_'));
        assertFalse(characterClasses.isLetter('\uFF3F'));
        assertFalse(characterClasses.isLetter('^'));
        assertFalse(characterClasses.isLetter('\uFF3E'));
        assertFalse(characterClasses.isLetter('*'));
        assertFalse(characterClasses.isLetter('\uFF0A'));
        assertFalse(characterClasses.isLetter('$'));
        assertFalse(characterClasses.isLetter('\uFF04'));
    }

}

0 comments on commit aa4e016

Please sign in to comment.