-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathhyperloglog.go
130 lines (120 loc) · 3.07 KB
/
hyperloglog.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Package hyperloglog implements the HyperLogLog algorithm for
// cardinality estimation. In English: it counts things. It counts
// things using very small amounts of memory compared to the number of
// objects it is counting.
//
// For a full description of the algorithm, see the paper HyperLogLog:
// the analysis of a near-optimal cardinality estimation algorithm by
// Flajolet, et. al.
package hyperloglog
import (
"fmt"
"math"
)
var (
exp32 = math.Pow(2, 32)
)
type HyperLogLog struct {
m uint // Number of registers
b uint32 // Number of bits used to determine register index
alpha float64 // Bias correction constant
registers []uint8
}
// Compute bias correction alpha_m.
func get_alpha(m uint) (result float64) {
switch m {
case 16:
result = 0.673
case 32:
result = 0.697
case 64:
result = 0.709
default:
result = 0.7213 / (1.0 + 1.079/float64(m))
}
return result
}
// Return a new HyperLogLog with the given number of registers. More
// registers leads to lower error in your estimated count, at the
// expense of memory.
//
// Choose a power of two number of registers, depending on the amount
// of memory you're willing to use and the error you're willing to
// tolerate. Each register uses one byte of memory.
//
// Approximate error will be:
// 1.04 / sqrt(registers)
//
func New(registers uint) (*HyperLogLog, error) {
if (registers & (registers - 1)) != 0 {
return nil, fmt.Errorf("number of registers %d not a power of two", registers)
}
h := &HyperLogLog{}
h.m = registers
h.b = uint32(math.Ceil(math.Log2(float64(registers))))
h.alpha = get_alpha(registers)
h.Reset()
return h, nil
}
// Reset all internal variables and set the count to zero.
func (h *HyperLogLog) Reset() {
h.registers = make([]uint8, h.m)
}
// Calculate the position of the leftmost 1-bit.
func rho(val uint32, max uint32) uint8 {
r := uint32(1)
for val&0x80000000 == 0 && r <= max {
r++
val <<= 1
}
return uint8(r)
}
// Add to the count. val should be a 32 bit unsigned integer from a
// good hash function.
func (h *HyperLogLog) Add(val uint32) {
k := 32 - h.b
r := rho(val<<h.b, k)
j := val >> uint(k)
if r > h.registers[j] {
h.registers[j] = r
}
}
// Get the estimated count.
func (h *HyperLogLog) Count() uint64 {
sum := 0.0
m := float64(h.m)
for _, val := range h.registers {
sum += 1.0 / math.Pow(2.0, float64(val))
}
estimate := h.alpha * m * m / sum
if estimate <= 5.0/2.0*m {
// Small range correction
v := 0
for _, r := range h.registers {
if r == 0 {
v++
}
}
if v > 0 {
estimate = m * math.Log(m/float64(v))
}
} else if estimate > 1.0/30.0*exp32 {
// Large range correction
estimate = -exp32 * math.Log(1-estimate/exp32)
}
return uint64(estimate)
}
// Merge another HyperLogLog into this one. The number of registers in
// each must be the same.
func (h1 *HyperLogLog) Merge(h2 *HyperLogLog) error {
if h1.m != h2.m {
return fmt.Errorf("number of registers doesn't match: %d != %d",
h1.m, h2.m)
}
for j, r := range h2.registers {
if r > h1.registers[j] {
h1.registers[j] = r
}
}
return nil
}