Skip to content

Commit

Permalink
Merge pull request #49 from ikawaha/develop
Browse files Browse the repository at this point in the history
Support UniDic
  • Loading branch information
ikawaha committed Jan 12, 2016
2 parents 576c131 + 6aa3923 commit f21102e
Show file tree
Hide file tree
Showing 32 changed files with 1,905 additions and 193 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ install:

script:
- go test ./...
- cd tokenizer; go test -bench .;cd ..
- cd tokenizer; go test -benchmem -bench .; cd ..
- cd internal/dic; go test -benchmem -bench .; cd ../..
- /bin/sh ./go-coverall.sh

41 changes: 39 additions & 2 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ Kagome Japanese Morphological Analyzer

This software includes a binary and/or source version of data from

mecab-ipadic-2.7.0-20070801
a) mecab-ipadic-2.7.0-20070801
b) unidic-mecab-2.1.2_src

which can be obtained from

http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
a) http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
b) https://osdn.jp/projects/unidic/downloads/58338/unidic-mecab-2.1.2_src.zip/

===========================================================================
mecab-ipadic-2.7.0-20070801 Notice
Expand Down Expand Up @@ -76,3 +78,38 @@ grants independently of ICOT any specific warranty to the user in
writing, such person, organization or entity, will also be exempted
from and not be held liable to the user for any such damages as noted
above as far as the program is concerned.

===========================================================================
unidic-mecab-2.1.2_src Notice (BSD)
===========================================================================

Copyright (c) 2011-2013, The UniDic Consortium
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.

* Neither the name of the UniDic Consortium nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Kagome Japanese Morphological Analyzer
===

Kagome is an open source Japanese morphological analyzer written in pure golang.
The MeCab-IPADIC dictionary/statistical model is used and packaged in Kagome binary.
The [MeCab-IPADIC](http://taku910.github.io/mecab/) and [UniDic (unidic-mecab)](http://pj.ninjal.ac.jp/corpus_center/unidic/) dictionary/statistical models are packaged in Kagome binary.

```
% kagome
Expand Down Expand Up @@ -38,13 +38,15 @@ The commands are:
server - run tokenize server
lattice - lattice viewer
tokenize [-file input_file] [-dic dic_file] [-udic userdic_file] [-mode (normal|search|extended)]
tokenize [-file input_file] [-dic dic_file] [-udic userdic_file] [-sysdic (ipa|uni)] [-mode (normal|search|extended)]
-dic string
dic
-file string
input file
-mode string
tokenize mode (normal|search|extended) (default "normal")
-sysdic string
system dic type (ipa|uni) (default "ipa")
-udic string
user dic
```
Expand All @@ -67,6 +69,8 @@ Usage of tokenize:
input file
-mode string
tokenize mode (normal|search|extended) (default "normal")
-sysdic string
system dic type (ipa|uni) (default "ipa")
-udic string
user dic
```
Expand All @@ -78,6 +82,8 @@ $ go run cmd/kagome/main.go server -h
Usage of server:
-http string
HTTP service address (default ":6060")
-sysdic string
system dic type (ipa|uni) (default "ipa")
-udic string
user dictionary
```
Expand Down
Binary file modified _sample/ipa.dic
Binary file not shown.
2 changes: 1 addition & 1 deletion cmd/_dictool/gobindata.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
go-bindata -o bindata.go -nomemcopy -pkg=data dic/ipa
go-bindata -o bindata.go -nomemcopy -separate -pkg=data dic/...
#go-bindata -o bindata.go -pkg=data dic/ipa
33 changes: 22 additions & 11 deletions cmd/_dictool/ipa/mkdic.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ const (

ipaDicArchiveFileName = "ipa.dic"
ipaDicMorphFileName = "morph.dic"
ipaDicContentFileName = "content.dic"
ipaDicIndexFileName = "index.dic"
ipaDicConnectionFileName = "connection.dic"
ipaDicCharDefFileName = "chardef.dic"
Expand Down Expand Up @@ -218,18 +219,28 @@ func saveIpaDic(d *IpaDic, base string, archive bool) (err error) {
if _, e = dic.MorphSlice(d.Morphs).WriteTo(out); e != nil {
return
}
// if e = enc.Encode(d.Morphs); e != nil {
// return
// }
// if _, e = buf.WriteTo(out); e != nil {
// return
// }
var buf bytes.Buffer
enc := gob.NewEncoder(&buf)
if e = enc.Encode(d.Contents); e != nil {
return
return
}(); err != nil {
return
}

if err = func() (e error) {
p := path.Join(base, ipaDicContentFileName)
var out io.Writer
if archive {
out, e = zw.Create(p)
if e != nil {
return
}
} else {
var f *os.File
if f, e = os.OpenFile(p, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666); e != nil {
return
}
defer f.Close()
out = f
}
if _, e = buf.WriteTo(out); e != nil {
if _, e = dic.Contents(d.Contents).WriteTo(out); e != nil {
return
}
return
Expand Down
2 changes: 2 additions & 0 deletions cmd/_dictool/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"path"

"./ipa"
"./uni"
)

var errorWriter = os.Stderr
Expand All @@ -31,6 +32,7 @@ var subcommands = []struct {
}{
// subcommands
{ipa.CommandName, ipa.Description, ipa.Run},
{uni.CommandName, uni.Description, uni.Run},
}

func Usage() {
Expand Down
101 changes: 101 additions & 0 deletions cmd/_dictool/uni/cmd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright 2015 ikawaha
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package uni

import (
"flag"
"fmt"
"os"
)

// Subcommand properties for the "uni" dictionary build tool.
var (
// CommandName is the subcommand name; it is also used as the flag set name
// and substituted into usageMessage.
CommandName = "uni"
// Description is the one-line summary of the subcommand.
Description = `uni dic build tool`
// usageMessage is the format string printed by Usage; %s receives CommandName.
usageMessage = "%s -mecab mecabdic-path [-neologd neologd-path] [-z]\n"
// errorWriter is a writer bound to standard error. It is not referenced
// elsewhere in this file — presumably used by other files of the package.
errorWriter = os.Stderr
)

// option holds the command-line settings of the uni subcommand together with
// the flag set that populates them.
type option struct {
output string // value of -output ("set output path")
mecab string // value of -mecab ("set mecab src path"); required by parse
neologd string // value of -neologd ("set neologd src path")
archive bool // value of -z ("build an archived dic")

// flagSet is the flag set whose flags are bound to the fields above.
flagSet *flag.FlagSet
}

// newOption returns an option whose fields are bound to a new flag set named
// after CommandName. The flag set uses flag.ContinueOnError, so parse failures
// are returned to the caller instead of terminating the process.
func newOption() *option {
	fs := flag.NewFlagSet(CommandName, flag.ContinueOnError)
	opt := &option{flagSet: fs}

	// Bind each flag to its destination field.
	fs.BoolVar(&opt.archive, "z", true, "build an archived dic")
	fs.StringVar(&opt.output, "output", ".", "set output path")
	fs.StringVar(&opt.mecab, "mecab", "", "set mecab src path")
	fs.StringVar(&opt.neologd, "neologd", "", "set neologd src path")

	return opt
}

// parse reads args into the option's fields and validates the result.
//
// It returns the flag set's own error when parsing fails, and an
// "invalid argument" error when positional arguments remain or when the
// required -mecab path is empty.
func (o *option) parse(args []string) error {
	if parseErr := o.flagSet.Parse(args); parseErr != nil {
		return parseErr
	}

	// Validations.
	rest := o.flagSet.Args()
	switch {
	case len(rest) != 0:
		// Positional (non-flag) arguments are not accepted.
		return fmt.Errorf("invalid argument: %v", rest)
	case o.mecab == "":
		// The mecab source path is mandatory.
		return fmt.Errorf("invalid argument: -mecab")
	}
	return nil
}

// command is the main routine of the uni subcommand: it builds the dictionary
// from the sources named in opt (opt.mecab, opt.neologd) and saves it under
// opt.output, archived or not according to opt.archive.
//
// An error from buildUniDic is returned unchanged; an error from saveUniDic is
// returned with a "build error" prefix.
func command(opt *option) error {
	d, err := buildUniDic(opt.mecab, opt.neologd)
	if err != nil {
		return err
	}
	// Capture the result of saveUniDic. Previously the return value was
	// discarded and the (always nil) err from buildUniDic was re-tested, so
	// save failures were silently ignored.
	if err = saveUniDic(d, opt.output, opt.archive); err != nil {
		return fmt.Errorf("build error: %v\n", err)
	}
	return nil
}

// Run executes the uni subcommand with the given arguments.
//
// With no arguments it prints the usage line and flag defaults and returns
// nil. If the arguments fail to parse or validate, it prints the same help and
// returns that error. Otherwise it returns the result of command.
func Run(args []string) error {
	help := func() {
		Usage()
		PrintDefaults()
	}

	if len(args) == 0 {
		help()
		return nil
	}

	o := newOption()
	err := o.parse(args)
	if err != nil {
		help()
		return err
	}
	return command(o)
}

// Usage writes the subcommand's one-line synopsis (usageMessage formatted with
// CommandName) to standard error.
func Usage() {
	synopsis := fmt.Sprintf(usageMessage, CommandName)
	fmt.Fprint(os.Stderr, synopsis)
}

// PrintDefaults prints the subcommand's flags and their default values, using
// a freshly constructed option so no parsed state is involved.
func PrintDefaults() {
	newOption().flagSet.PrintDefaults()
}
Loading

0 comments on commit f21102e

Please sign in to comment.