-
Notifications
You must be signed in to change notification settings - Fork 19
/
golem.go
101 lines (91 loc) · 2.36 KB
/
golem.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package golem
import (
"fmt"
"sort"
"strings"
)
// LanguagePack is the interface each language pack implements so that New
// can build a Lemmatizer from it.
type LanguagePack interface {
// GetResource returns the raw dictionary bytes for the language, or an
// error when they cannot be obtained. New hands the bytes to the
// dictionary parser.
GetResource() ([]byte, error)
// GetLocale returns the name of the language. New uses it only to label
// its error messages.
GetLocale() string
}
// Lemmatizer looks up the base form(s) of a word in one language.
// Values are built by New from a LanguagePack.
type Lemmatizer struct {
// m maps a word form to an index into v.
m map[string]int
// v holds, per index, the list of base forms recorded for the word that
// points at it; element 0 is the one Lemma and LemmaLower return.
v [][]string
}
// newLemmatizerFromBytes parses a tab-separated dictionary into a Lemmatizer.
//
// Every non-empty line of b has the shape "base\tform1\tform2...": the first
// field is the base form and every field on the line (the base included) is
// registered as a word that resolves to that base. A word that shows up on
// several lines collects the base form of each of those lines, in file order
// and without duplicates. Lines may end in "\n" or "\r\n".
//
// An error is returned, together with a zero Lemmatizer, when a non-empty
// line carries fewer than two fields.
func newLemmatizerFromBytes(b []byte) (Lemmatizer, error) {
	s := Lemmatizer{
		m: make(map[string]int),
		v: [][]string{},
	}
	// TODO: consider streaming the input line by line instead of splitting
	// the whole payload into a slice up front.
	for _, line := range strings.Split(string(b), "\n") {
		// Tolerate Windows line endings so the last field does not keep a
		// trailing carriage return.
		line = strings.TrimSuffix(line, "\r")
		if len(line) == 0 {
			continue
		}
		words := strings.Split(line, "\t")
		if len(words) < 2 {
			return Lemmatizer{}, fmt.Errorf("expected more than 1 form per word")
		}
		base := words[0]
		for _, word := range words {
			index, seen := s.m[word]
			if !seen {
				// First sighting of this word: give it its own slot.
				s.m[word] = len(s.v)
				s.v = append(s.v, []string{base})
				continue
			}
			// The word is already known: record this line's base form as an
			// additional lemma (the base, not the word itself), once.
			known := false
			for _, have := range s.v[index] {
				if have == base {
					known = true
					break
				}
			}
			if !known {
				s.v[index] = append(s.v[index], base)
			}
		}
	}
	return s, nil
}
// New builds a Lemmatizer from the dictionary supplied by pack.
//
// It fetches the raw resource with pack.GetResource and parses it. If either
// step fails, New returns a nil Lemmatizer and an error that names the locale
// and wraps the underlying cause, so callers can inspect it with errors.Is
// or errors.As.
func New(pack LanguagePack) (*Lemmatizer, error) {
	resource, err := pack.GetResource()
	if err != nil {
		// Keep the cause: dropping it hides why the resource was unavailable.
		return nil, fmt.Errorf(`Could not open resource file for "%s": %w`, pack.GetLocale(), err)
	}
	l, err := newLemmatizerFromBytes(resource)
	if err != nil {
		return nil, fmt.Errorf(`language %s is not valid: %w`, pack.GetLocale(), err)
	}
	return &l, nil
}
// InDict reports whether word, compared case-insensitively by lowercasing
// it first, has an entry in the dictionary.
func (l *Lemmatizer) InDict(word string) bool {
	key := strings.ToLower(word)
	_, present := l.m[key]
	return present
}
// Lemma returns the first recorded base form of word. The lookup key is the
// lowercased word; when there is no entry, word is returned exactly as given.
func (l *Lemmatizer) Lemma(word string) string {
	index, known := l.m[strings.ToLower(word)]
	if !known {
		return word
	}
	return l.v[index][0]
}
// LemmaLower returns the first recorded base form of word without
// lowercasing it first: the caller is expected to pass an already
// lowercased word. When there is no entry, word is returned unchanged.
func (l *Lemmatizer) LemmaLower(word string) string {
	index, known := l.m[word]
	if !known {
		return word
	}
	return l.v[index][0]
}
// Lemmas returns every base form recorded for word, sorted alphabetically so
// the result is deterministic. The lookup key is the lowercased word; when
// there is no entry, a one-element slice holding word as given is returned.
//
// The returned slice is a fresh copy: sorting it, or any later change the
// caller makes to it, leaves the Lemmatizer's own data untouched, so Lemma
// and LemmaLower keep returning the same base form afterwards.
func (l *Lemmatizer) Lemmas(word string) (out []string) {
	index, ok := l.m[strings.ToLower(word)]
	if !ok {
		return []string{word}
	}
	// Copy before sorting: sort.Strings works in place and would otherwise
	// reorder the stored list.
	out = make([]string, len(l.v[index]))
	copy(out, l.v[index])
	sort.Strings(out)
	return out
}