forked from DaehwanKimLab/centrifuge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhyperloglogplus.h
623 lines (520 loc) · 19.2 KB
/
hyperloglogplus.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
/*
* hyperloglogplus.h
*
* Implementation of HyperLogLog++ algorithm described by Stefan Heule et al.
*
* Created on: Apr 25, 2015
* Author: fbreitwieser
*/
#ifndef HYPERLOGLOGPLUS_H_
#define HYPERLOGLOGPLUS_H_
#include<set>
#include<vector>
#include<stdexcept>
#include<iostream>
#include<fstream>
#include<math.h> //log
#include<algorithm> //vector.count
#include<bitset>
#include "hyperloglogbias.h"
#include "third_party/MurmurHash3.cpp"
#include "assert_helpers.h"
using namespace std;
//#define HLL_DEBUG
//#define NDEBUG
//#define NDEBUG2
#define arr_len(a) (a + sizeof a / sizeof a[0])
// experimentally determined threshold values for p - 4
static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100,
6500, 11500, 20000, 50000, 120000, 350000};
///////////////////////
//
/**
* gives the estimated cardinality for m bins, v of which are non-zero
* @param m number of bins in the matrix
* @param v number of non-zero bins
* @return
*/
double linearCounting(uint32_t m, uint32_t v) {
if (v > m) {
throw std::invalid_argument("number of v should not be greater than m");
}
double fm = double(m);
return fm * log(fm/double(v));
}
/**
* from Numerical Recipes, 3rd Edition, p 352
* Returns hash of u as a 64-bit integer.
*
*/
inline uint64_t ranhash (uint64_t u) {
uint64_t v = u * 3935559000370003845 + 2691343689449507681;
v ^= v >> 21; v ^= v << 37; v ^= v >> 4;
v *= 4768777513237032717;
v ^= v << 20; v ^= v >> 41; v ^= v << 5;
return v;
}
inline uint64_t murmurhash3_finalizer (uint64_t key) {
key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that.
key ^= key >> 33;
key *= 0xff51afd7ed558ccd;
key ^= key >> 33;
key *= 0xc4ceb9fe1a85ec53;
key ^= key >> 33;
return key;
}
/**
* Bias correction factors for specific m's
* @param m
* @return
*/
double alpha(uint32_t m) {
switch (m) {
case 16: return 0.673;
case 32: return 0.697;
case 64: return 0.709;
}
// m >= 128
return 0.7213 / (1 + 1.079/double(m));
}
/**
* calculate the raw estimate as harmonic mean of the ranks in the register
* @param array
* @return
*/
double calculateEstimate(vector<uint8_t> array) {
double inverseSum = 0.0;
for (size_t i = 0; i < array.size(); ++i) {
// TODO: pre-calculate the power calculation
inverseSum += pow(2,-array[i]);
}
return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
}
uint32_t countZeros(vector<uint8_t> s) {
return (uint32_t)count(s.begin(), s.end(), 0);
}
/**
* Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo
* @param bits
* @param hi
* @param lo
* @return
*/
template<typename T>
T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
// create a bitmask:
// (T(1) << (hi - lo) a 1 at the position (hi - lo)
// ((T(1) << (hi - lo) - 1) 1's from position 0 to position (hi-lo-1)
// (((T(1) << (hi - lo)) - 1) << lo) 1's from position lo to position hi
// The T(1) is required to not cause overflow on 32bit machines
// TODO: consider creating a bitmask only once in the beginning
T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
T result = value & bitmask;
if (!shift_left) {
// shift resulting bits to the right
result = result >> lo;
} else {
// shift resulting bits to the left
result = result << (sizeof(T)*8 - hi);
}
return result;
}
template<typename T>
T extractBits(T bits, uint8_t hi) {
// create a bitmask for first hi bits (LSB 0 numbering)
T bitmask = T(-1) << (sizeof(T)*8 - hi);
return (bits & bitmask);
}
// functions for counting the number of leading 0-bits (clz)
// and counting the number of trailing 0-bits (ctz)
//#ifdef __GNUC__
// TODO: switch between builtin clz and 64_clz based on architecture
//#define clz(x) __builtin_clz(x)
#if 0
static int clz_manual(uint64_t x)
{
// This uses a binary search (counting down) algorithm from Hacker's Delight.
uint64_t y;
int n = 64;
y = x >>32; if (y != 0) {n -= 32; x = y;}
y = x >>16; if (y != 0) {n -= 16; x = y;}
y = x >> 8; if (y != 0) {n -= 8; x = y;}
y = x >> 4; if (y != 0) {n -= 4; x = y;}
y = x >> 2; if (y != 0) {n -= 2; x = y;}
y = x >> 1; if (y != 0) return n - 2;
return n - x;
}
#endif
inline uint32_t clz(const uint32_t x) {
return __builtin_clz(x);
}
inline uint32_t clz(const uint64_t x) {
uint32_t u32 = (x >> 32);
uint32_t result = u32 ? __builtin_clz(u32) : 32;
if (result == 32) {
u32 = x & 0xFFFFFFFFUL;
result += (u32 ? __builtin_clz(u32) : 32);
}
return result;
}
//#else
uint32_t clz_log2(const uint64_t w) {
return 63 - floor(log2(w));
}
//#endif
// TODO: the sparse list may be encoded with variable length encoding
// see Heule et al., section 5.3.2
// Also, using sets might give a larger overhead as each insertion costs more
// consider using vector and sort/unique when merging.
typedef set<uint32_t> SparseListType;
typedef uint64_t HashSize;
/**
* HyperLogLogPlusMinus class
* typename T corresponds to the hash size - usually either uint32_t or uint64_t (implemented for uint64_t)
*/
typedef uint64_t T_KEY;
template <typename T_KEY>
class HyperLogLogPlusMinus {
private:
vector<uint8_t> M; // registers (M) of size m
uint8_t p; // precision
uint32_t m; // number of registers
bool sparse; // sparse representation of the data?
SparseListType sparseList; // TODO: use a compressed list instead
// vectors containing data for bias correction
vector<vector<double> > rawEstimateData; // TODO: make this static
vector<vector<double> > biasData;
// sparse versions of p and m
static const uint8_t pPrime = 25; // precision when using a sparse representation
// fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32
static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime
public:
~HyperLogLogPlusMinus() {};
/**
* Create new HyperLogLogPlusMinus counter
* @param precision
* @param sparse
*/
HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) {
if (precision > 18 || precision < 4) {
throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
}
this->m = 1 << precision;
if (sparse) {
this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size
} else {
this->M = vector<uint8_t>(m);
}
}
/**
* Add a new item to the counter.
* @param item
*/
void add(T_KEY item) {
add(item, sizeof(T_KEY));
}
/**
* Add a new item to the counter.
* @param item
* @param size size of item
*/
void add(T_KEY item, size_t size) {
// compute hash for item
HashSize hash_value = murmurhash3_finalizer(item);
#ifdef HLL_DEBUG
cerr << "Value: " << item << "; hash(value): " << hash_value << endl;
cerr << bitset<64>(hash_value) << endl;
#endif
if (sparse) {
// sparse mode: put the encoded hash into sparse list
uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value);
this->sparseList.insert(encoded_hash_value);
#ifdef HLL_DEBUG
idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value);
assert_eq(ir.idx,get_index(hash_value, p));
assert_eq(ir.rank, get_rank(hash_value, p));
#endif
// if the sparseList is too large, switch to normal (register) representation
if (this->sparseList.size() > this->m) { // TODO: is the size of m correct?
switchToNormalRepresentation();
}
} else {
// normal mode
// take first p bits as index {x63,...,x64-p}
uint32_t idx = get_index(hash_value, p);
// shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0}
uint8_t rank = get_rank(hash_value, p);
// update the register if current rank is bigger
if (rank > this->M[idx]) {
this->M[idx] = rank;
}
}
}
void add(vector<T_KEY> words) {
for(size_t i = 0; i < words.size(); ++i) {
this->add(words[i]);
}
}
/**
* Reset to its initial state.
*/
void reset() {
this->sparse = true;
this->sparseList.clear(); //
this->M.clear();
}
/**
* Convert from sparse representation (using tmpSet and sparseList) to normal (using register)
*/
void switchToNormalRepresentation() {
#ifdef HLL_DEBUG
cerr << "switching to normal representation" << endl;
cerr << " est before: " << cardinality(true) << endl;
#endif
this->sparse = false;
this->M = vector<uint8_t>(this->m);
if (sparseList.size() > 0) { //TDOD: do I need to check this, here?
addToRegisters(this->sparseList);
this->sparseList.clear();
}
#ifdef HLL_DEBUG
cerr << " est after: " << cardinality(true) << endl;
#endif
}
/**
* add sparseList to the registers of M
*/
void addToRegisters(const SparseListType &sparseList) {
if (sparseList.size() == 0) {
return;
}
for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) {
idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr);
assert_lt(ir.idx,M.size());
if (ir.rank > this->M[ir.idx]) {
this->M[ir.idx] = ir.rank;
}
}
}
/**
* Merge another HyperLogLogPlusMinus into this. Converts to normal representation
* @param other
*/
void merge(const HyperLogLogPlusMinus* other) {
if (this->p != other->p) {
throw std::invalid_argument("precisions must be equal");
}
if (this->sparse && other->sparse) {
if (this->sparseList.size()+other->sparseList.size() > this->m) {
switchToNormalRepresentation();
addToRegisters(other->sparseList);
} else {
this->sparseList.insert(other->sparseList.begin(),other->sparseList.end());
}
} else if (other->sparse) {
// other is sparse, but this is not
addToRegisters(other->sparseList);
} else {
if (this->sparse) {
switchToNormalRepresentation();
}
// merge registers
for (size_t i = 0; i < other->M.size(); ++i) {
if (other->M[i] > this->M[i]) {
this->M[i] = other->M[i];
}
}
}
}
/**
*
* @return cardinality estimate
*/
uint64_t cardinality(bool verbose=true) {
if (sparse) {
// if we are still 'sparse', then use linear counting, which is more
// accurate for low cardinalities, and use increased precision pPrime
return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size())));
}
// initialize bias correction data
if (rawEstimateData.empty()) { initRawEstimateData(); }
if (biasData.empty()) { initBiasData(); }
// calculate raw estimate on registers
//double est = alpha(m) * harmonicMean(M, m);
double est = calculateEstimate(M);
// correct for biases if estimate is smaller than 5m
if (est <= double(m)*5.0) {
est -= getEstimateBias(est);
}
uint32_t v = countZeros(M);
if (v > 2) {
// calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix
double lc_estimate = linearCounting(m, v);
// check if the lc estimate is below the threshold
if (lc_estimate <= double(threshold[p-4])) {
if (lc_estimate < 0) { throw; }
// return lc estimate of cardinality
return lc_estimate;
}
return lc_estimate; // always use lc_estimate when available
}
// return bias-corrected hyperloglog estimate of cardinality
return uint64_t(est);
}
private:
uint8_t rank(HashSize x, uint8_t b) {
uint8_t v = 1;
while (v <= b && !(x & 0x80000000)) {
v++;
x <<= 1;
}
return v;
}
template<typename T> inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const {
// take first p bits as index {x63,...,x64-p}
assert_lt(p,size);
uint32_t idx = hash_value >> (size - p);
return idx;
}
inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const {
return get_index(hash_value, p, 64);
}
inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const {
return get_index(hash_value, p, 32);
}
template<typename T> inline
T get_trailing_ones(const uint8_t p) const {
return (T(1) << p ) - 1;
}
template<typename T> inline
uint8_t get_rank(const T hash_value, const uint8_t p) const {
// shift p values off, and count leading zeros of the remaining string {x63-p,...,x0}
T_KEY rank_bits = (hash_value << p | get_trailing_ones<T>(p));
#ifdef HLL_DEBUG
cerr << "rank bits: " << bitset<32>(rank_bits) << endl;
#endif
uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1;
assert_leq(rank_val,64-p+1);
return rank_val;
}
void initRawEstimateData() {
rawEstimateData = vector<vector<double> >();
rawEstimateData.push_back(vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17)));
rawEstimateData.push_back(vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18)));
}
void initBiasData() {
biasData = vector<vector<double> >();
biasData.push_back(vector<double>(biasData_precision4,arr_len(biasData_precision4)));
biasData.push_back(vector<double>(biasData_precision5,arr_len(biasData_precision5)));
biasData.push_back(vector<double>(biasData_precision6,arr_len(biasData_precision6)));
biasData.push_back(vector<double>(biasData_precision7,arr_len(biasData_precision7)));
biasData.push_back(vector<double>(biasData_precision8,arr_len(biasData_precision8)));
biasData.push_back(vector<double>(biasData_precision9,arr_len(biasData_precision9)));
biasData.push_back(vector<double>(biasData_precision10,arr_len(biasData_precision10)));
biasData.push_back(vector<double>(biasData_precision11,arr_len(biasData_precision11)));
biasData.push_back(vector<double>(biasData_precision12,arr_len(biasData_precision12)));
biasData.push_back(vector<double>(biasData_precision13,arr_len(biasData_precision13)));
biasData.push_back(vector<double>(biasData_precision14,arr_len(biasData_precision14)));
biasData.push_back(vector<double>(biasData_precision15,arr_len(biasData_precision15)));
biasData.push_back(vector<double>(biasData_precision16,arr_len(biasData_precision16)));
biasData.push_back(vector<double>(biasData_precision17,arr_len(biasData_precision17)));
biasData.push_back(vector<double>(biasData_precision18,arr_len(biasData_precision18)));
}
/**
* Estimate the bias using empirically determined values.
* Uses weighted average of the two cells between which the estimate falls.
* TODO: Check if nearest neighbor average gives better values, as proposed in the paper
* @param est
* @return correction value for
*/
double getEstimateBias(double estimate) {
vector<double> rawEstimateTable = rawEstimateData[p-4];
vector<double> biasTable = biasData[p-4];
// check if estimate is lower than first entry, or larger than last
if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); }
if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); }
// get iterator to first element that is not smaller than estimate
vector<double>::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate);
size_t pos = it - rawEstimateTable.begin();
double e1 = rawEstimateTable[pos-1];
double e2 = rawEstimateTable[pos];
double c = (estimate - e1) / (e2 - e1);
return biasTable[pos-1]*(1-c) + biasTable[pos]*c;
}
/**
* Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation.
*
* Difference from the algorithm described in the paper:
* The index always is in the p most significant bits
*
* see section 5.3 in Heule et al.
* @param x the hash bits
* @return encoded hash value
*/
uint32_t encodeHashIn32Bit(uint64_t hash_value) {
// extract first pPrime bits, and shift them onto a 32-bit integer
uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32);
#ifdef HLL_DEBUG
cerr << "value: " << bitset<64>(hash_value) << endl;
cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl;
#endif
// are the bits {63-p, ..., 63-p'} all 0?
if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) {
// compute the additional rank (minimum rank is already p'-p)
// the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag
uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0
return idx | uint32_t(additional_rank<<1) | 1;
} else {
// else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0)
assert_eq((idx & 1),0);
return idx;
}
}
/**
* struct holding the index and rank/rho of an entry
*/
struct idx_n_rank {
uint32_t idx;
uint8_t rank;
idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {}
};
//
//
/**
* Decode a hash from the sparse representation.
* Returns the index and number of leading zeros (nlz) with precision p stored in k
* @param k the hash bits
* @return index and rank in non-sparse format
*/
idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const {
// difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes
uint32_t idx = get_index(encoded_hash_value, p);
uint8_t rank_val;
// check if the last bit is 1
if ( (encoded_hash_value & 1) == 1) {
// if yes: the hash was stored with higher precision, bits p to pPrime were 0
uint8_t additional_rank = pPrime - p;
rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1);
} else {
rank_val = get_rank(encoded_hash_value,p);
// clz counts 64 bit only, it seems
if (rank_val > 32)
rank_val -= 32;
}
return(idx_n_rank(idx,rank_val));
}
};
#endif /* HYPERLOGLOGPLUS_H_ */