-
Notifications
You must be signed in to change notification settings - Fork 25
/
simhash.go
158 lines (143 loc) · 4.17 KB
/
simhash.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
// Copyright 2015 Benjamin BALET. All rights reserved.
// Use of this source code is governed by the BSD license
// license that can be found in the LICENSE file.
// Package stopwords implements Charikar's simhash algorithm to generate a 64-bit
// fingerprint of a given document.
package stopwords
import (
"bytes"
"hash/fnv"
"html"
"golang.org/x/text/language"
"golang.org/x/text/unicode/norm"
)
// Each item of the array is the hash of a word of the content.
type vector [64]int
// Internal struct: 64-bit hash and weight of a word
type feature struct {
Sum uint64
Weight int
}
// Simhash returns a 64-bit simhash representing the content of the string
// removes useless spaces and stop words from a byte slice.
// BCP 47 or ISO 639-1 language code (if unknown, we'll apply english filters).
// If cleanHTML is TRUE, remove HTML tags from content and unescape HTML entities.
func Simhash(content []byte, langCode string, cleanHTML bool) uint64 {
//Remove HTML tags
if cleanHTML {
content = remTags.ReplaceAll(content, []byte(" "))
content = []byte(html.UnescapeString(string(content)))
}
//Parse language
tag := language.Make(langCode)
base, _ := tag.Base()
langCode = base.String()
var hash uint64
//Remove stop words by using a list of most frequent words
switch langCode {
case "ar":
hash = removeStopWordsAndHash(content, arabic)
case "bg":
hash = removeStopWordsAndHash(content, bulgarian)
case "cs":
hash = removeStopWordsAndHash(content, czech)
case "da":
hash = removeStopWordsAndHash(content, danish)
case "de":
hash = removeStopWordsAndHash(content, german)
case "el":
hash = removeStopWordsAndHash(content, greek)
case "en":
hash = removeStopWordsAndHash(content, english)
case "es":
hash = removeStopWordsAndHash(content, spanish)
case "fa":
hash = removeStopWordsAndHash(content, persian)
case "fr":
hash = removeStopWordsAndHash(content, french)
case "fi":
hash = removeStopWordsAndHash(content, finnish)
case "hu":
hash = removeStopWordsAndHash(content, hungarian)
case "it":
hash = removeStopWordsAndHash(content, italian)
case "ja":
hash = removeStopWordsAndHash(content, japanese)
case "km":
hash = removeStopWordsAndHash(content, khmer)
case "lv":
hash = removeStopWordsAndHash(content, latvian)
case "nl":
hash = removeStopWordsAndHash(content, dutch)
case "no":
hash = removeStopWordsAndHash(content, norwegian)
case "pl":
hash = removeStopWordsAndHash(content, polish)
case "pt":
hash = removeStopWordsAndHash(content, portuguese)
case "ro":
hash = removeStopWordsAndHash(content, romanian)
case "ru":
hash = removeStopWordsAndHash(content, russian)
case "sk":
hash = removeStopWordsAndHash(content, slovak)
case "sv":
hash = removeStopWordsAndHash(content, swedish)
case "th":
hash = removeStopWordsAndHash(content, thai)
case "tr":
hash = removeStopWordsAndHash(content, turkish)
}
return hash
}
// removeStopWords iterates through a list of words and removes stop words.
func removeStopWordsAndHash(content []byte, dict map[string]string) uint64 {
var v vector
var i int
content = norm.NFC.Bytes(content)
content = bytes.ToLower(content)
words := wordSegmenter.FindAll(content, -1)
for _, w := range words {
if _, ok := dict[string(w)]; !ok {
aFeature := newFeature(w)
sum := aFeature.Sum
weight := aFeature.Weight
for i := uint8(0); i < 64; i++ {
bit := ((sum >> i) & 1)
if bit == 1 {
v[i] += weight
} else {
v[i] -= weight
}
}
i++
}
}
// compute and return the fingerprint of the content
// The fingerprint f of a given 64-dimension vector v is defined as follows:
// f[j] = 1 if v[j] >= 0
// f[j] = 0 if v[j] < 0
var fingerprint uint64
for j := uint8(0); j < 64; j++ {
if v[j] >= 0 {
fingerprint |= (1 << j)
}
}
return fingerprint
}
// Returns a new feature representing the given byte slice, using a weight of 1
func newFeature(f []byte) feature {
h := fnv.New64()
h.Write(f)
return feature{h.Sum64(), 1}
}
// CompareSimhash calculates the Hamming distance between two 64-bit integers
// using the Kernighan method.
func CompareSimhash(a uint64, b uint64) uint8 {
v := a ^ b
var c uint8
for c = 0; v != 0; c++ {
v &= v - 1
}
return c
}