-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
statistics: support building CMSketch with Top N #10163
Changes from 1 commit
9254400
2433427
77dc45b
cfc8a93
7008f89
2055420
af8e40c
bfd2128
8e63f5e
b244ab1
40ac221
d947e96
6148210
ba40c91
c07b72c
42d32af
afa8132
c1bb5bb
c78335d
d9addef
e38524f
49a0022
410d430
88d771d
f614adc
2da51b5
e47f352
3189cd0
1ba6c17
f18e797
92b8da0
1537576
8fdcdc9
5252a5c
4c4c1b8
6b1c842
118df21
6723e74
25af84c
2d047d5
4f45950
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -15,6 +15,7 @@ package statistics | |||||
|
||||||
import ( | ||||||
"bytes" | ||||||
"fmt" | ||||||
"math" | ||||||
"sort" | ||||||
|
||||||
|
@@ -97,7 +98,7 @@ func groupElements(data [][]byte) map[uint64][]cmsCount { | |||||
|
||||||
// BuildTopN builds table of top N elements. | ||||||
// Elements in `data` should not be modified after this call. | ||||||
func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uint64) { | ||||||
func (c *CMSketch) BuildTopN(data [][]byte, top, topNThreshold uint32, total uint64) { | ||||||
alivxxx marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
sampleSize := uint64(len(data)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Put it to where it is needed. |
||||||
counter := groupElements(data) | ||||||
sorted := make([]uint64, 0) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Is better, can save some Actually you can count the distinct values in function |
||||||
|
@@ -121,9 +122,9 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin | |||||
sumTopN := uint64(0) | ||||||
|
||||||
// Add a few elements, the number of which is close to the n-th most element. | ||||||
for i := top; i < ndv && i < top*2; i++ { | ||||||
for i := uint32(0); i < ndv && i < top*2; i++ { | ||||||
// Here, 2/3 was obtained by running tests: we tested 1, 1/2, and 2/3, and 2/3 is relatively better than 1 and 1/2. | ||||||
if sorted[i]*3 < NthValue*2 && newNthValue != sorted[i] { | ||||||
if i >= top && sorted[i]*3 < NthValue*2 && newNthValue != sorted[i] { | ||||||
break | ||||||
} | ||||||
// sumTopN might be smaller than the final sum of the elements in topnindex. | ||||||
|
@@ -134,13 +135,15 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin | |||||
|
||||||
estimateNDV, ratio, onlyOnceItems := calculateEstimateNDV(sorted, total) | ||||||
|
||||||
enableTopN := (sampleSize/uint64(topnThreshold) <= sumTopN) | ||||||
enableTopN := (sampleSize/uint64(topNThreshold) <= sumTopN) | ||||||
qw4990 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
topN := make([]cmsCount, 0) | ||||||
sumTopN = 0 | ||||||
|
||||||
for _, vals := range counter { | ||||||
for i := range vals { | ||||||
if enableTopN && vals[i].count >= newNthValue { | ||||||
topN = append(topN, cmsCount{data: vals[i].data, count: vals[i].count * ratio}) | ||||||
sumTopN += vals[i].count * ratio | ||||||
} else { | ||||||
c.InsertBytesN(vals[i].data, vals[i].count*ratio) | ||||||
} | ||||||
|
@@ -155,13 +158,20 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin | |||||
// These three tests check whether all divisions are legal. | ||||||
alivxxx marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
// They also test whether we sampled all possible data. | ||||||
countWithoutTopN := total - (sampleSize-uint64(onlyOnceItems))*ratio | ||||||
alivxxx marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
if total < uint64(top)*ratio { | ||||||
|
||||||
if total <= sumTopN { | ||||||
c.defaultValue = 1 | ||||||
} else if estimateNDV <= uint64(top) { | ||||||
c.defaultValue = 1 | ||||||
} else { | ||||||
c.defaultValue = countWithoutTopN / (estimateNDV - uint64(ndv) + onlyOnceItems) | ||||||
if estimateNDV+onlyOnceItems <= uint64(ndv) { | ||||||
c.defaultValue = 1 | ||||||
} else { | ||||||
c.defaultValue = countWithoutTopN / (estimateNDV - uint64(ndv) + onlyOnceItems) | ||||||
} | ||||||
} | ||||||
|
||||||
fmt.Printf("estNDV=%d ndv=%d onlyOnce=%d sumTopN=%d top=%d defaultValue=%d enableTopN=%v\n", estimateNDV, ndv, onlyOnceItems, sumTopN, top, c.defaultValue, enableTopN) | ||||||
} | ||||||
|
||||||
func (c *CMSketch) buildTopNMap(topn []cmsCount) { | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/ data/ `data`