Skip to content

Commit

Permalink
Merge pull request #226 from ikawaha/feature/add-index-to-token
Browse files: browse the repository at this point in the history
  • Loading branch information
ikawaha authored Dec 29, 2020
2 parents 3bf895b + 4e2dfea commit c93ae10
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 49 deletions.
11 changes: 6 additions & 5 deletions tokenizer/lattice/lattice.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,13 @@ func (la *Lattice) Backward(m TokenizeMode) {
runeLen := utf8.RuneCountInString(p.Surface)
stack := make([]*node, 0, runeLen)
i := 0
for _, r := range p.Surface {
for k, r := range p.Surface {
stack = append(stack, &node{
ID: p.ID,
Start: p.Start + i,
Class: DUMMY,
Surface: string(r),
ID: p.ID,
Start: p.Start + i,
Class: DUMMY,
Surface: string(r),
Position: p.Position + k,
})
i++
}
Expand Down
3 changes: 2 additions & 1 deletion tokenizer/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func (c TokenClass) String() string {

// Token represents a morph of a sentence.
type Token struct {
Index int
ID int
Class TokenClass
Position int // byte position
Expand Down Expand Up @@ -209,7 +210,7 @@ func (t Token) String() string {
return fmt.Sprintf("%q (%d: %d, %d) %v [%d]", t.Surface, t.Position, t.Start, t.End, t.Class, t.ID)
}

// Equal returns true if tokens are equal.
// Equal returns true if tokens are equal. The `Index` field is excluded from the comparison.
func (t Token) Equal(v Token) bool {
return t.ID == v.ID &&
t.Class == v.Class &&
Expand Down
1 change: 1 addition & 0 deletions tokenizer/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) []Token {
continue
}
tok := Token{
Index: len(tokens),
ID: n.ID,
Class: TokenClass(n.Class),
Position: n.Position,
Expand Down
68 changes: 25 additions & 43 deletions tokenizer/tokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,21 +225,18 @@ func Test_AnalyzeWithSearchMode(t *testing.T) {
}
tokens := tnz.Analyze("関西国際空港", Search)
expected := []Token{
{ID: -1, Surface: "BOS"},
{ID: 372968, Surface: "関西", Start: 0, End: 2, Class: TokenClass(lattice.KNOWN)},
{ID: 168541, Surface: "国際", Start: 2, End: 4, Class: TokenClass(lattice.KNOWN)},
{ID: 307133, Surface: "空港", Start: 4, End: 6, Class: TokenClass(lattice.KNOWN)},
{ID: -1, Surface: "EOS", Start: 6, End: 6},
{Index: 0, ID: -1, Surface: "BOS"},
{Index: 1, ID: 372967, Surface: "関西", Start: 0, End: 2, Position: 0, Class: TokenClass(lattice.KNOWN)},
{Index: 2, ID: 168542, Surface: "国際", Start: 2, End: 4, Position: 6, Class: TokenClass(lattice.KNOWN)},
{Index: 3, ID: 307134, Surface: "空港", Start: 4, End: 6, Position: 12, Class: TokenClass(lattice.KNOWN)},
{Index: 4, ID: -1, Surface: "EOS", Start: 6, End: 6, Position: 18},
}

if len(tokens) != len(expected) {
t.Fatalf("got %v, expected %v", tokens, expected)
}
for i, tok := range tokens {
if tok.Class != expected[i].Class ||
tok.Start != expected[i].Start ||
tok.End != expected[i].End ||
tok.Surface != expected[i].Surface {
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
t.Errorf("got %v, expected %v", tok, expected[i])
}
}
Expand All @@ -257,19 +254,15 @@ func Test_AnalyzeWithSearchModeUnknown(t *testing.T) {

tokens := tnz.Analyze("ポポピ", Search)
expected := []Token{
{ID: -1, Surface: "BOS"},
{ID: 34, Surface: "ポポピ", Start: 0, End: 3, Class: TokenClass(lattice.UNKNOWN)},
{ID: -1, Surface: "EOS", Start: 3, End: 3},
{Index: 0, ID: -1, Surface: "BOS"},
{Index: 1, ID: 34, Surface: "ポポピ", Start: 0, End: 3, Class: TokenClass(lattice.UNKNOWN)},
{Index: 2, ID: -1, Surface: "EOS", Start: 3, End: 3, Position: 9},
}
if len(tokens) != len(expected) {
t.Fatalf("got %v, expected %v", tokens, expected)
}
for i, tok := range tokens {
if tok.ID != expected[i].ID ||
tok.Class != expected[i].Class ||
tok.Start != expected[i].Start ||
tok.End != expected[i].End ||
tok.Surface != expected[i].Surface {
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
t.Errorf("got %v, expected %v", tok, expected[i])
}
}
Expand All @@ -287,18 +280,14 @@ func Test_AnalyzeWithExtendedModeEmpty(t *testing.T) {

tokens := tnz.Analyze("", Extended)
expected := []Token{
{ID: -1, Surface: "BOS"},
{ID: -1, Surface: "EOS"},
{Index: 0, ID: -1, Surface: "BOS"},
{Index: 1, ID: -1, Surface: "EOS"},
}
if len(tokens) != len(expected) {
t.Fatalf("got %v, expected %v", tokens, expected)
}
for i, tok := range tokens {
if tok.ID != expected[i].ID ||
tok.Class != expected[i].Class ||
tok.Start != expected[i].Start ||
tok.End != expected[i].End ||
tok.Surface != expected[i].Surface {
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
t.Errorf("got %v, expected %v", tok, expected[i])
}
}
Expand All @@ -316,20 +305,17 @@ func Test_AnalyzeWithExtendedMode(t *testing.T) {

tokens := tnz.Analyze("関西国際空港", Extended)
expected := []Token{
{ID: -1, Surface: "BOS"},
{ID: 372968, Surface: "関西", Start: 0, End: 2, Class: TokenClass(lattice.KNOWN)},
{ID: 168541, Surface: "国際", Start: 2, End: 4, Class: TokenClass(lattice.KNOWN)},
{ID: 307133, Surface: "空港", Start: 4, End: 6, Class: TokenClass(lattice.KNOWN)},
{ID: -1, Surface: "EOS", Start: 6, End: 6},
{Index: 0, ID: -1, Surface: "BOS"},
{Index: 1, ID: 372967, Surface: "関西", Start: 0, End: 2, Position: 0, Class: TokenClass(lattice.KNOWN)},
{Index: 2, ID: 168542, Surface: "国際", Start: 2, End: 4, Position: 6, Class: TokenClass(lattice.KNOWN)},
{Index: 3, ID: 307134, Surface: "空港", Start: 4, End: 6, Position: 12, Class: TokenClass(lattice.KNOWN)},
{Index: 4, ID: -1, Surface: "EOS", Start: 6, End: 6, Position: 18},
}
if len(tokens) != len(expected) {
t.Fatalf("got %v, expected %v", tokens, expected)
}
for i, tok := range tokens {
if tok.Class != expected[i].Class ||
tok.Start != expected[i].Start ||
tok.End != expected[i].End ||
tok.Surface != expected[i].Surface {
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
t.Errorf("got %v, expected %v", tok, expected[i])
}
}
Expand All @@ -347,21 +333,17 @@ func Test_AnalyzeWithExtendedModeUnknown(t *testing.T) {

tokens := tnz.Analyze("ポポピ", Extended)
expected := []Token{
{ID: -1, Surface: "BOS"},
{ID: 34, Surface: "ポ", Start: 0, End: 1, Class: TokenClass(lattice.DUMMY)},
{ID: 34, Surface: "ポ", Start: 1, End: 2, Class: TokenClass(lattice.DUMMY)},
{ID: 34, Surface: "ピ", Start: 2, End: 3, Class: TokenClass(lattice.DUMMY)},
{ID: -1, Surface: "EOS", Start: 3, End: 3},
{Index: 0, ID: -1, Surface: "BOS"},
{Index: 1, ID: 34, Surface: "ポ", Start: 0, End: 1, Position: 0, Class: TokenClass(lattice.DUMMY)},
{Index: 2, ID: 34, Surface: "ポ", Start: 1, End: 2, Position: 3, Class: TokenClass(lattice.DUMMY)},
{Index: 3, ID: 34, Surface: "ピ", Start: 2, End: 3, Position: 6, Class: TokenClass(lattice.DUMMY)},
{Index: 4, ID: -1, Surface: "EOS", Start: 3, End: 3, Position: 9},
}
if len(tokens) != len(expected) {
t.Fatalf("got %v, expected %v", tokens, expected)
}
for i, tok := range tokens {
if tok.ID != expected[i].ID ||
tok.Class != expected[i].Class ||
tok.Start != expected[i].Start ||
tok.End != expected[i].End ||
tok.Surface != expected[i].Surface {
if tok.Index != expected[i].Index || !tok.Equal(expected[i]) {
t.Errorf("got %v, expected %v", tok, expected[i])
}
}
Expand Down

0 comments on commit c93ae10

Please sign in to comment.