-
Notifications
You must be signed in to change notification settings - Fork 0
/
bloomfilter.go
148 lines (131 loc) · 4.63 KB
/
bloomfilter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// Package bloomflt is implementation of bloom filter with no 3rd party dependencies.
package bloomflt
import (
"encoding/binary"
"hash/crc32"
"hash/fnv"
"math"
"math/big"
)
// BloomFilter is an efficient data structure, used to test whether an element is a member of a set.
//
// Bloom filters are probabilistic, which means that false positives are tolerated, but it is guaranteed
// there will be no false negatives.
//
// In other words, if the bloom filter says an element is NOT in the set, then it is guaranteed the element
// is not in the set.
//
// If the bloom filter says an element IS in the set, then that means the element is probably there,
// but it is not guaranteed.
//
// This implementation uses the big.Int type as bitset storage and FNV-1a (Fowler–Noll–Vo) and CRC32
// from the builtin hash package as the base hash functions. Additional hash functions are simulated
// with "Double Hashing Scheme" by Kirsch and Mitzenmacher as explained in "Less Hashing, Same Performance:
// Building a Better Bloom Filter".
type BloomFilter struct {
m int // Number of elements in the set
k int // Number of hash functions
bucket *big.Int // Bit storage
}
// NewMK creates a new bloom filter with bucket size equal to m and number of hash functions equal to k.
func NewMK(m int, k int) *BloomFilter {
filter := BloomFilter{m, k, big.NewInt(0)}
return &filter
}
// CalcOptimalMK calculates optimal values of m and k for the specified number of elements in the set (n),
// and given acceptable false-positive rate (value from 0.0 to 1.0).
func CalcOptimalMK(n int, falsePositiveRate float64) (int, int) {
m := -1 * float64(n) * math.Log(falsePositiveRate) / math.Pow(math.Log(2), 2)
k := m / float64(n) * math.Log(2)
return int(m + 0.5), int(k + 0.5)
}
// New creates a new bloom filter with optimal values of m and k for the given given acceptable false-positive
// rate (value from 0.0 to 1.0).
func New(n int, falsePositiveRate float64) *BloomFilter {
m, k := CalcOptimalMK(n, falsePositiveRate)
// Use at least one bit
if m < 1 {
m = 1
}
// Limit the number of bits to 2^31
if m > math.MaxInt32 {
m = math.MaxInt32
}
// Use at least one hash function
if k < 1 {
k = 1
}
return NewMK(m, k)
}
// FNV-1a (Fowler–Noll–Vo) is used as the first hash function in kiMiHash
func (b *BloomFilter) hash1(value []byte) uint32 {
f := fnv.New32a()
f.Write(value)
hash := f.Sum32()
return hash
}
// CRC32 is used as the second hash function in kiMiHash
func (b *BloomFilter) hash2(value []byte) uint32 {
f := crc32.NewIEEE()
f.Write(value)
hash := f.Sum32()
return hash
}
// kiMiHash simulates arbitrary number of hash functions with a "Double Hashing Scheme" by using only
// two hash functions (5.2. in "Less Hashing, Same Performance: Building a Better Bloom Filter" by Kirsch
// and Mitzenmacher). Simplified explanation at:
// http://willwhim.wpengine.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/
func (b *BloomFilter) kiMiHash(value []byte, hashIdx int) int {
index := (b.hash1(value) + b.hash2(value)*uint32(hashIdx)) % uint32(b.m)
return int(index)
}
// AddBytes inserts a bytes value to the set
func (b *BloomFilter) AddBytes(value []byte) {
for h := 0; h < b.k; h++ {
index := b.kiMiHash(value, h)
b.bucket.SetBit(b.bucket, index, 1)
}
}
// AddString inserts a string value to the set
func (b *BloomFilter) AddString(value string) {
b.AddBytes([]byte(value))
}
// AddUInt32 inserts an int value to the set
func (b *BloomFilter) AddUInt32(value uint32) {
bytes := make([]byte, 4, 4)
binary.LittleEndian.PutUint32(bytes, value)
b.AddBytes(bytes)
}
// AddUInt64 inserts an int value to the set
func (b *BloomFilter) AddUInt64(value uint64) {
bytes := make([]byte, 8, 8)
binary.LittleEndian.PutUint64(bytes, value)
b.AddBytes(bytes)
}
// ContainsBytes tests if the set contains the given bytes value
func (b *BloomFilter) ContainsBytes(value []byte) bool {
res := true
for h := 0; h < b.k; h++ {
index := b.kiMiHash(value, h)
if b.bucket.Bit(index) == 0 {
res = false
}
}
return res
}
// ContainsString tests if the set contains the given string value
func (b *BloomFilter) ContainsString(value string) bool {
return b.ContainsBytes([]byte(value))
}
// ContainsUInt32 tests if the set contains the given int value
func (b *BloomFilter) ContainsUInt32(value uint32) bool {
bytes := make([]byte, 4, 4)
binary.LittleEndian.PutUint32(bytes, value)
return b.ContainsBytes(bytes)
}
// ContainsUInt64 tests if the set contains the given int value
func (b *BloomFilter) ContainsUInt64(value uint64) bool {
bytes := make([]byte, 8, 8)
binary.LittleEndian.PutUint64(bytes, value)
return b.ContainsBytes(bytes)
}