From 40907351bc4b58a171660e4fad90164698c2421a Mon Sep 17 00:00:00 2001 From: yejianfeng Date: Fri, 30 Jan 2015 16:00:58 +0800 Subject: [PATCH 1/5] test update --- segmenter.go | 1 + 1 file changed, 1 insertion(+) diff --git a/segmenter.go b/segmenter.go index ff2f4a8..3f293b3 100644 --- a/segmenter.go +++ b/segmenter.go @@ -63,6 +63,7 @@ func (seg *Segmenter) LoadDictionary(files string) { if size == 0 { // 文件结束 + log.Println("文件结束") break } else if size < 2 { // 无效行 From 0294f5a1fd74461ab80944fdfbef12e9f899d435 Mon Sep 17 00:00:00 2001 From: yejianfeng Date: Fri, 30 Jan 2015 16:16:13 +0800 Subject: [PATCH 2/5] use error check text end --- segmenter.go | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/segmenter.go b/segmenter.go index 3f293b3..e7363e8 100644 --- a/segmenter.go +++ b/segmenter.go @@ -4,10 +4,10 @@ package sego import ( "bufio" "fmt" + "io" "log" "math" "os" - "strconv" "strings" "unicode" "unicode/utf8" @@ -53,33 +53,35 @@ func (seg *Segmenter) LoadDictionary(files string) { reader := bufio.NewReader(dictFile) var text string - var freqText string var frequency int var pos string // 逐行读入分词 for { - size, _ := fmt.Fscanln(reader, &text, &freqText, &pos) + // 解析词频 + var err error - if size == 0 { - // 文件结束 - log.Println("文件结束") - break - } else if size < 2 { + size, err := fmt.Fscanln(reader, &text, &frequency, &pos) + + if err != nil { + if err == io.EOF { + // 文件结束 + break + } // 无效行 continue - } else if size == 2 { - // 没有词性标注时设为空字符串 - pos = "" } - // 解析词频 - var err error - frequency, err = strconv.Atoi(freqText) - if err != nil { + if size < 2 { + // 无效行 continue } + if size == 2 { + // 没有词性标注时设为空字符串 + pos = "" + } + // 过滤频率太小的词 if frequency < minTokenFrequency { continue From fecad4ae447addc7fb4643c36c728163625363e6 Mon Sep 17 00:00:00 2001 From: yejianfeng Date: Wed, 4 Feb 2015 17:24:42 +0800 Subject: [PATCH 3/5] update segmenter --- segmenter.go | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/segmenter.go b/segmenter.go index e7363e8..b6579c5 100644 --- a/segmenter.go +++ b/segmenter.go @@ -40,15 +40,16 @@ func (seg *Segmenter) Dictionary() *Dictionary { // 当一个分词既出现在用户词典也出现在通用词典中,则优先使用用户词典。 // // 词典的格式为(每个分词一行): -// 分词文本 频率 词性 -func (seg *Segmenter) LoadDictionary(files string) { +// 分词文本 频率 词性 (强制要求使用这个格式,不符合这个格式的行跳过,并且在log中可以看到) +func (seg *Segmenter) LoadDictionary(files string) error { seg.dict = new(Dictionary) for _, file := range strings.Split(files, ",") { log.Printf("载入sego词典 %s", file) dictFile, err := os.Open(file) defer dictFile.Close() if err != nil { - log.Fatalf("无法载入字典文件 \"%s\" \n", file) + log.Printf("无法载入字典文件 \"%s\" \n", file) + return err } reader := bufio.NewReader(dictFile) @@ -57,10 +58,10 @@ func (seg *Segmenter) LoadDictionary(files string) { var pos string // 逐行读入分词 + line := 0 for { - // 解析词频 - var err error - + line++ + reader.ReadLine() size, err := fmt.Fscanln(reader, &text, &frequency, &pos) if err != nil { @@ -68,20 +69,17 @@ func (seg *Segmenter) LoadDictionary(files string) { // 文件结束 break } + log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, err.Error()) // 无效行 continue } - if size < 2 { + if size < 3 { // 无效行 + log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, "读取个数少于两个") continue } - if size == 2 { - // 没有词性标注时设为空字符串 - pos = "" - } - // 过滤频率太小的词 if frequency < minTokenFrequency { continue @@ -126,6 +124,7 @@ func (seg *Segmenter) LoadDictionary(files string) { } log.Println("sego词典载入完毕") + return nil } // 对文本分词 From be99230733f8fb0c11e4e0de7e479922dd62376a Mon Sep 17 00:00:00 2001 From: yejianfeng Date: Wed, 4 Feb 2015 21:25:24 +0800 Subject: [PATCH 4/5] change splite method --- segmenter.go | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/segmenter.go b/segmenter.go index b6579c5..f611d31 100644 --- a/segmenter.go +++ b/segmenter.go @@ -3,11 +3,12 @@ package sego import ( "bufio" - "fmt" + "errors" "io" "log" "math" "os" + "strconv" "strings" "unicode" "unicode/utf8" @@ -61,24 +62,25 @@ func (seg *Segmenter) LoadDictionary(files string) error { line := 0 for { line++ - reader.ReadLine() - size, err := fmt.Fscanln(reader, &text, &frequency, &pos) - + txt, err := reader.ReadString('\n') if err != nil { if err == io.EOF { // 文件结束 break } log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, err.Error()) - // 无效行 continue } - if size < 3 { - // 无效行 - log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, "读取个数少于两个") + parts := strings.Split(txt, " ") + if len(parts) < 3 { + log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, "读取个数少于三个") continue } + N := len(parts) + text = strings.Join(parts[:N-2], " ") + frequency, err = strconv.Atoi(parts[N-2]) + pos = parts[N-1] // 过滤频率太小的词 if frequency < minTokenFrequency { @@ -127,6 +129,18 @@ func (seg *Segmenter) LoadDictionary(files string) error { return nil } +func parse(line string) (txt string, prequence int, pos string, err error) { + parts := strings.Split(line, " ") + if len(parts) < 3 { + return "", 0, "", errors.New("incomplete line") + } + N := len(parts) + txt = strings.Join(parts[:N-2], " ") + prequence, err = strconv.Atoi(parts[N-2]) + pos = parts[N-1] + return +} + // 对文本分词 // // 输入参数: From 8dbe8a73afa8e2f928567a0a70c5565c7127a8aa Mon Sep 17 00:00:00 2001 From: yejianfeng Date: Fri, 6 Feb 2015 15:03:25 +0800 Subject: [PATCH 5/5] add global register --- README.md | 30 ++++++++++++++++++++++++++++++ segmenter.go | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/README.md b/README.md index a4e813f..d148ab2 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,33 @@ func main() { fmt.Println(sego.SegmentsToString(segments, false)) } ``` + + +```go +package main + +import ( + "fmt" + "github.com/huichen/sego" +) + +func init() { + // 注册分词器 + sego.RegisterDefaultSegmenter("github.com/huichen/sego/data/dictionary.txt") + sego.RegisterSegmenter("test", "github.com/huichen/sego/data/dictionary.txt") +} + +func main() { + // 载入词典 + segmenter := sego.GetDefaultSegmenter() + segmenter2 := sego.GetSegmenter("test") + + // 分词 + text := []byte("中华人民共和国中央人民政府") + segments := segmenter.Segment(text) + + // 处理分词结果 + // 支持普通模式和搜索模式两种分词,见代码中SegmentsToString函数的注释。 + fmt.Println(sego.SegmentsToString(segments, false)) +} +``` \ No newline at end of file diff --git a/segmenter.go b/segmenter.go index f611d31..7c161d8 100644 --- a/segmenter.go +++ b/segmenter.go @@ -18,6 +18,42 @@ const ( minTokenFrequency = 2 // 仅从字典文件中读取大于等于此频率的分词 ) +var DefaultSegmenter = new(Segmenter) + +var segmenters = map[string]*Segmenter{ + "default": DefaultSegmenter, +} + +// 注册分词器 +func RegisterSegmenter(alias string, files string) error { + seg := new(Segmenter) + err := seg.LoadDictionary(files) + if err != nil { + return err + } + + segmenters[alias] = seg + return nil +} + +// 获取分词器 +func GetSegmenter(alias string) *Segmenter { + if _, ok := segmenters[alias]; ok { + return segmenters[alias] + } + return nil +} + +// 注册默认分词器 +func RegisterDefaultSegmenter(files string) error { + return RegisterSegmenter("default", files) +} + +// 获取默认分词器 +func GetDefaultSegmenter() *Segmenter { + return GetSegmenter("default") +} + // 分词器结构体 type Segmenter struct { dict *Dictionary