Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

使用error来判断词典结束 #12

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,33 @@ func main() {
fmt.Println(sego.SegmentsToString(segments, false))
}
```


```go
package main

import (
"fmt"
"github.com/huichen/sego"
)

func init() {
	// Register the segmenters once at start-up. Both calls return an error
	// when the dictionary cannot be loaded; stop immediately in that case
	// instead of continuing with a segmenter that has no dictionary.
	if err := sego.RegisterDefaultSegmenter("github.com/huichen/sego/data/dictionary.txt"); err != nil {
		panic(err)
	}
	if err := sego.RegisterSegmenter("test", "github.com/huichen/sego/data/dictionary.txt"); err != nil {
		panic(err)
	}
}

func main() {
	// Fetch the segmenters that were registered in init.
	segmenter := sego.GetDefaultSegmenter()
	segmenter2 := sego.GetSegmenter("test")
	if segmenter2 == nil {
		// GetSegmenter returns nil for an alias that was never registered.
		fmt.Println("segmenter \"test\" is not registered")
		return
	}

	// Segment the same text with both segmenters.
	text := []byte("中华人民共和国中央人民政府")
	segments := segmenter.Segment(text)
	segments2 := segmenter2.Segment(text)

	// Print the results. Two output modes (normal and search) are supported;
	// see the comment on SegmentsToString in the package source.
	fmt.Println(sego.SegmentsToString(segments, false))
	fmt.Println(sego.SegmentsToString(segments2, false))
}
```
90 changes: 71 additions & 19 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package sego

import (
"bufio"
"fmt"
"errors"
"io"
"log"
"math"
"os"
Expand All @@ -17,6 +18,42 @@ const (
minTokenFrequency = 2 // 仅从字典文件中读取大于等于此频率的分词
)

var (
	// DefaultSegmenter is the segmenter initially registered under the
	// "default" alias. It starts out without a dictionary.
	DefaultSegmenter = &Segmenter{}

	// segmenters maps a registration alias to its segmenter.
	segmenters = map[string]*Segmenter{"default": DefaultSegmenter}
)

// RegisterSegmenter creates a new segmenter, loads the dictionary file(s)
// named by files into it and stores it in the registry under alias,
// replacing any segmenter previously registered under that alias.
//
// files is handed unchanged to Segmenter.LoadDictionary. If loading fails the
// error is returned and the registry is left untouched.
func RegisterSegmenter(alias string, files string) error {
	seg := new(Segmenter)
	if err := seg.LoadDictionary(files); err != nil {
		return err
	}

	segmenters[alias] = seg
	if alias == "default" {
		// Keep the exported variable pointing at the same instance that
		// GetDefaultSegmenter returns; otherwise it would still refer to the
		// initial segmenter that has no dictionary.
		DefaultSegmenter = seg
	}
	return nil
}

// GetSegmenter returns the segmenter registered under alias, or nil when no
// segmenter has been registered under that alias.
func GetSegmenter(alias string) *Segmenter {
	// Indexing a map with a missing key yields the zero value, which is nil
	// for *Segmenter, so a single lookup covers both cases.
	return segmenters[alias]
}

// RegisterDefaultSegmenter registers a segmenter under the "default" alias,
// loading its dictionary from files. It returns whatever RegisterSegmenter
// returns.
func RegisterDefaultSegmenter(files string) error {
	const defaultAlias = "default"
	return RegisterSegmenter(defaultAlias, files)
}

// GetDefaultSegmenter returns the segmenter currently registered under the
// "default" alias.
func GetDefaultSegmenter() *Segmenter {
	const defaultAlias = "default"
	return GetSegmenter(defaultAlias)
}

// 分词器结构体
type Segmenter struct {
dict *Dictionary
Expand All @@ -40,44 +77,46 @@ func (seg *Segmenter) Dictionary() *Dictionary {
// 当一个分词既出现在用户词典也出现在通用词典中,则优先使用用户词典。
//
// 词典的格式为(每个分词一行):
// 分词文本 频率 词性
func (seg *Segmenter) LoadDictionary(files string) {
// 分词文本 频率 词性 (强制要求使用这个格式,不符合这个格式的行跳过,并且在log中可以看到)
func (seg *Segmenter) LoadDictionary(files string) error {
seg.dict = new(Dictionary)
for _, file := range strings.Split(files, ",") {
log.Printf("载入sego词典 %s", file)
dictFile, err := os.Open(file)
defer dictFile.Close()
if err != nil {
log.Fatalf("无法载入字典文件 \"%s\" \n", file)
log.Printf("无法载入字典文件 \"%s\" \n", file)
return err
}

reader := bufio.NewReader(dictFile)
var text string
var freqText string
var frequency int
var pos string

// 逐行读入分词
line := 0
for {
size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)

if size == 0 {
// 文件结束
break
} else if size < 2 {
// 无效行
line++
txt, err := reader.ReadString('\n')
if err != nil {
if err == io.EOF {
// 文件结束
break
}
log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, err.Error())
continue
} else if size == 2 {
// 没有词性标注时设为空字符串
pos = ""
}

// 解析词频
var err error
frequency, err = strconv.Atoi(freqText)
if err != nil {
parts := strings.Split(txt, " ")
if len(parts) < 3 {
log.Printf("%v 文件第 %v行读取错误,跳过: %v", file, line, "读取个数少于三个")
continue
}
N := len(parts)
text = strings.Join(parts[:N-2], " ")
frequency, err = strconv.Atoi(parts[N-2])
pos = parts[N-1]

// 过滤频率太小的词
if frequency < minTokenFrequency {
Expand Down Expand Up @@ -123,6 +162,19 @@ func (seg *Segmenter) LoadDictionary(files string) {
}

log.Println("sego词典载入完毕")
return nil
}

// parse splits one dictionary line of the form
//
//	text frequency pos
//
// into its three components. The last field is the part-of-speech tag, the
// one before it is the integer frequency, and everything preceding those two
// is the token text (its fields re-joined with single spaces, so a token may
// itself contain spaces).
//
// Fields are separated by any run of whitespace, so tab-separated lines and a
// trailing "\n" or "\r\n" left by a line reader are handled. An error is
// returned when the line has fewer than three fields or the frequency is not
// an integer; in both cases the other results are zero values.
func parse(line string) (txt string, prequence int, pos string, err error) {
	parts := strings.Fields(line)
	n := len(parts)
	if n < 3 {
		return "", 0, "", errors.New("incomplete line")
	}

	freq, err := strconv.Atoi(parts[n-2])
	if err != nil {
		// Do not hand back a half-parsed entry alongside the error.
		return "", 0, "", err
	}
	return strings.Join(parts[:n-2], " "), freq, parts[n-1], nil
}

// 对文本分词
Expand Down