Skip to content

Commit

Permalink
Merge pull request #221 from ikawaha/develop
Browse files Browse the repository at this point in the history
  • Loading branch information
ikawaha authored Dec 19, 2020
2 parents fa6974e + d664ec8 commit 35e8780
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 62 deletions.
22 changes: 11 additions & 11 deletions cmd/lattice/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@ import (
var (
CommandName = "lattice"
Description = `lattice viewer`
UsageMessage = "%s [-userDict userdic_file] [-dict (ipa|uni)] [-mode (normal|search|extended)] [-output output_file] [-v] sentence\n"
UsageMessage = "%s [-udict userdict_file] [-dict (ipa|uni)] [-mode (normal|search|extended)] [-output output_file] [-v] sentence\n"
ErrorWriter = os.Stderr
)

// options
type option struct {
userDict string
dict string
mode string
output string
verbose bool
input string
flagSet *flag.FlagSet
udict string
dict string
mode string
output string
verbose bool
input string
flagSet *flag.FlagSet
}

// ContinueOnError ErrorHandling // Return a descriptive error.
Expand All @@ -41,7 +41,7 @@ func newOption(_ io.Writer, eh flag.ErrorHandling) (o *option) {
flagSet: flag.NewFlagSet(CommandName, eh),
}
// option settings
o.flagSet.StringVar(&o.userDict, "userDict", "", "user dict")
o.flagSet.StringVar(&o.udict, "udict", "", "user dict")
o.flagSet.StringVar(&o.dict, "dict", "ipa", "dict type (ipa|uni)")
o.flagSet.StringVar(&o.mode, "mode", "normal", "tokenize mode (normal|search|extended)")
o.flagSet.StringVar(&o.output, "output", "", "output file")
Expand Down Expand Up @@ -106,8 +106,8 @@ func command(opt *option) error {
return err
}
udict := tokenizer.Nop()
if opt.userDict != "" {
d, err := dict.NewUserDict(opt.userDict)
if opt.udict != "" {
d, err := dict.NewUserDict(opt.udict)
if err != nil {
return err
}
Expand Down
25 changes: 25 additions & 0 deletions filter/pos_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package filter_test

import (
"fmt"
"reflect"
"testing"

Expand Down Expand Up @@ -198,3 +199,27 @@ func TestPOSFilter_Drop(t *testing.T) {
filter.Drop(nil)
})
}

// Example_POSFilter tokenizes a short Japanese text, applies a part-of-speech
// filter with Keep, and prints the surface form and POS features of every
// token that remains.
//
// NOTE(review): the testing package documents that an example suffix must
// start with a lower-case letter; go vet may flag this name. Consider
// ExamplePOSFilter — confirm before renaming.
func Example_POSFilter() {
	dic, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		panic(err)
	}
	tnz, err := tokenizer.New(dic, tokenizer.OmitBosEos())
	if err != nil {
		panic(err)
	}

	// Two POS patterns: one using filter.Any as its second element, and one
	// consisting of a single element.
	patterns := []filter.POS{
		{"名詞", filter.Any, "人名"},
		{"形容詞"},
	}
	keeper := filter.NewPOSFilter(patterns...)

	kept := tnz.Tokenize("赤い蝋燭と人魚。小川未明")
	keeper.Keep(&kept) // the slice is passed by pointer and printed afterwards
	for i := range kept {
		fmt.Println(kept[i].Surface, kept[i].POS())
	}
	// Output:
	// 赤い [形容詞 自立 * *]
	// 小川 [名詞 固有名詞 人名 姓]
	// 未明 [名詞 固有名詞 人名 名]
}
14 changes: 8 additions & 6 deletions filter/sentence_splitter_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package filter
package filter_test

import (
"bufio"
"fmt"
"reflect"
"strings"
"testing"

"github.com/ikawaha/kagome/v2/filter"
)

func Test_DefaultSplitter(t *testing.T) {
Expand Down Expand Up @@ -49,7 +51,7 @@ func Test_DefaultSplitter(t *testing.T) {

for _, d := range testdata {
scanner := bufio.NewScanner(strings.NewReader(d.input))
scanner.Split(ScanSentences)
scanner.Split(filter.ScanSentences)
r := make([]string, 0, len(d.expect))
for scanner.Scan() {
r = append(r, scanner.Text())
Expand Down Expand Up @@ -87,7 +89,7 @@ func Test_DelimWhiteSpace(t *testing.T) {
},
}

s := SentenceSplitter{
s := filter.SentenceSplitter{
Delim: []rune{' ', ' '}, // white spaces
Follower: []rune{'.', '」', '」', '』', ')', ')', '}', '}', '〉', '》'},
SkipWhiteSpace: true,
Expand Down Expand Up @@ -119,7 +121,7 @@ func Test_ScanSentences(t *testing.T) {
{atEnd: false, data: []byte{}, advance: 0, token: []byte{}, err: nil},
}
for _, d := range testdata {
advance, token, err := ScanSentences(d.data, d.atEnd)
advance, token, err := filter.ScanSentences(d.data, d.atEnd)
if err != nil {
t.Errorf("got err=%+v, expected nil", d.err)
}
Expand All @@ -132,7 +134,7 @@ func Test_ScanSentences(t *testing.T) {
}
}

func Example() {
func Example_ScanSentences() {
sampleText := ` 人魚は、南の方の海にばかり棲んでいるのではあ
りません。北の海にも棲んでいたのであります。
 北方の海うみの色は、青うございました。ある
Expand All @@ -142,7 +144,7 @@ func Example() {
小川未明作 赤い蝋燭と人魚より`

scanner := bufio.NewScanner(strings.NewReader(sampleText))
scanner.Split(ScanSentences)
scanner.Split(filter.ScanSentences)
for scanner.Scan() {
fmt.Println(scanner.Text())
}
Expand Down
23 changes: 23 additions & 0 deletions filter/word_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package filter_test

import (
"fmt"
"reflect"
"testing"

Expand Down Expand Up @@ -162,3 +163,25 @@ func TestWordFilter_Drop(t *testing.T) {
filter.Drop(nil)
})
}

// Example_WordFilter tokenizes a sentence, drops the tokens listed in a
// stop-word filter with Drop, and prints the surface form of every token
// that remains.
//
// NOTE(review): the testing package documents that an example suffix must
// start with a lower-case letter; go vet may flag this name. Consider
// ExampleWordFilter — confirm before renaming.
func Example_WordFilter() {
	dic, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		panic(err)
	}
	tnz, err := tokenizer.New(dic, tokenizer.OmitBosEos())
	if err != nil {
		panic(err)
	}

	words := []string{"私", "は", "が", "の", "。"}
	stop := filter.NewWordFilter(words)

	remaining := tnz.Tokenize("私の猫の名前はアプロです。")
	stop.Drop(&remaining) // the slice is passed by pointer and printed afterwards
	for i := range remaining {
		fmt.Println(remaining[i].Surface)
	}
	// Output:
	// 猫
	// 名前
	// アプロ
	// です
}
106 changes: 61 additions & 45 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"fmt"
"os"
"path/filepath"
"runtime/debug"
"strings"

"github.com/ikawaha/kagome/v2/cmd/lattice"
"github.com/ikawaha/kagome/v2/cmd/server"
Expand All @@ -20,60 +22,74 @@ type subcommand struct {
PrintDefaults func(flag.ErrorHandling)
}

var subcommands = []subcommand{
{
Name: tokenize.CommandName,
Description: tokenize.Description,
Run: tokenize.Run,
Usage: tokenize.Usage,
OptionCheck: tokenize.OptionCheck,
PrintDefaults: tokenize.PrintDefaults,
},
{
Name: server.CommandName,
Description: server.Description,
Run: server.Run,
Usage: server.Usage,
OptionCheck: server.OptionCheck,
PrintDefaults: server.PrintDefaults,
},
{
Name: lattice.CommandName,
Description: lattice.Description,
Run: lattice.Run,
Usage: lattice.Usage,
OptionCheck: lattice.OptionCheck,
PrintDefaults: lattice.PrintDefaults,
},
{
Name: "version",
Description: "show version",
Run: func([]string) error {
fmt.Fprintf(os.Stderr, "%s\n", version)
return nil
},
Usage: func() {},
OptionCheck: func([]string) error { return nil },
PrintDefaults: func(flag.ErrorHandling) {},
},
}

var (
// version is the app version.
version = `!!version undefined!!
This must be specified by -X option during the go build. Such like:
$ go build --ldflags "-X 'main.version=$(git describe --tag)'"`

errorWriter = os.Stderr
version string // eg. go build --ldflags "-X 'main.version=$(git describe --tag)'"
errorWriter = os.Stderr
subcommands = []subcommand{
{
Name: tokenize.CommandName,
Description: tokenize.Description,
Run: tokenize.Run,
Usage: tokenize.Usage,
OptionCheck: tokenize.OptionCheck,
PrintDefaults: tokenize.PrintDefaults,
},
{
Name: server.CommandName,
Description: server.Description,
Run: server.Run,
Usage: server.Usage,
OptionCheck: server.OptionCheck,
PrintDefaults: server.PrintDefaults,
},
{
Name: lattice.CommandName,
Description: lattice.Description,
Run: lattice.Run,
Usage: lattice.Usage,
OptionCheck: lattice.OptionCheck,
PrintDefaults: lattice.PrintDefaults,
},
{
Name: "version",
Description: "show version",
Run: func([]string) error {
ShowVersion()
return nil
},
Usage: func() {},
OptionCheck: func([]string) error { return nil },
PrintDefaults: func(flag.ErrorHandling) {},
},
}
defaultSubcommand = subcommands[0]
)

// Usage prints a one-line description of the tool followed by the top-level
// usage line to errorWriter. The program name shown is the base name of
// os.Args[0].
func Usage() {
	prog := filepath.Base(os.Args[0])
	fmt.Fprintf(errorWriter, "Japanese Morphological Analyzer -- github.com/ikawaha/kagome/v2\n")
	fmt.Fprintf(errorWriter, "usage: %s <command>\n", prog)
}

// ShowVersion prints the tool's version to errorWriter, followed by one
// indented line per dependency whose module path starts with
// "github.com/ikawaha/kagome-dict/" (path suffix and version).
//
// The package-level version variable takes precedence when it is non-empty.
// Otherwise the main module version from the embedded build information is
// used. If neither is available, "(unknown)" is printed.
func ShowVersion() {
	info, ok := debug.ReadBuildInfo()
	switch {
	case version != "":
		fmt.Fprintln(errorWriter, version)
	case ok:
		fmt.Fprintln(errorWriter, info.Main.Version)
	default:
		// ReadBuildInfo returns a nil *BuildInfo when ok is false, so info
		// must not be dereferenced here; previously this path panicked.
		fmt.Fprintln(errorWriter, "(unknown)")
	}
	if !ok {
		return
	}
	const prefix = "github.com/ikawaha/kagome-dict/"
	for _, dep := range info.Deps {
		if !strings.HasPrefix(dep.Path, prefix) {
			continue
		}
		fmt.Fprintln(errorWriter, " ", strings.TrimPrefix(dep.Path, prefix), dep.Version)
	}
}

// PrintDefaults prints out the default flags
func PrintDefaults() {
fmt.Fprintln(errorWriter, "The commands are:")
Expand Down
9 changes: 9 additions & 0 deletions tokenizer/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,27 @@ func (t Token) FeatureAt(i int) (string, bool) {
return t.dict.POSTable.NameList[id], true
}
i -= len(pos)
if len(t.dict.Contents) <= t.ID {
return "", false
}
c := t.dict.Contents[t.ID]
if i >= len(c) {
return "", false
}
return c[i], true
case UNKNOWN:
if len(t.dict.UnkDict.Contents) <= t.ID {
return "", false
}
c := t.dict.UnkDict.Contents[t.ID]
if i >= len(c) {
return "", false
}
return c[i], true
case USER:
if len(t.udict.Contents) <= t.ID {
return "", false
}
switch i {
case 0:
return t.udict.Contents[t.ID].Pos, true
Expand Down

0 comments on commit 35e8780

Please sign in to comment.