Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: support building CMSketch with Top N #10163

Merged
merged 41 commits into from
Apr 24, 2019
Merged
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9254400
Split core into a single pr
erjiaqing Apr 16, 2019
2433427
remove copy and equal from pr
erjiaqing Apr 16, 2019
77dc45b
move some code into separate functions
erjiaqing Apr 17, 2019
cfc8a93
update
erjiaqing Apr 17, 2019
7008f89
rename some variables
erjiaqing Apr 17, 2019
2055420
upd
erjiaqing Apr 17, 2019
af8e40c
fix
erjiaqing Apr 17, 2019
bfd2128
fix
erjiaqing Apr 17, 2019
8e63f5e
Merge branch 'master' into cms_topn_core
erjiaqing Apr 17, 2019
b244ab1
merge
erjiaqing Apr 17, 2019
40ac221
upd
erjiaqing Apr 17, 2019
d947e96
fix
erjiaqing Apr 17, 2019
6148210
fix
erjiaqing Apr 17, 2019
ba40c91
upd
erjiaqing Apr 18, 2019
c07b72c
some rename
erjiaqing Apr 18, 2019
42d32af
fix
erjiaqing Apr 18, 2019
afa8132
Merge branch 'master' into cms_topn_core
erjiaqing Apr 19, 2019
c1bb5bb
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
c78335d
update
erjiaqing Apr 22, 2019
d9addef
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
e38524f
small fix
erjiaqing Apr 22, 2019
49a0022
back to handcraft map
erjiaqing Apr 22, 2019
410d430
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
88d771d
splie some functions
erjiaqing Apr 22, 2019
f614adc
upd
erjiaqing Apr 22, 2019
2da51b5
Merge branch 'master' into cms_topn_core
erjiaqing Apr 23, 2019
e47f352
upd
erjiaqing Apr 23, 2019
3189cd0
apply XuHuaiyu's change
erjiaqing Apr 23, 2019
1ba6c17
upd
erjiaqing Apr 23, 2019
f18e797
apply XuHuaiyu's change
erjiaqing Apr 23, 2019
92b8da0
fix
erjiaqing Apr 23, 2019
1537576
upd
erjiaqing Apr 23, 2019
8fdcdc9
upd
erjiaqing Apr 23, 2019
5252a5c
fix
erjiaqing Apr 23, 2019
4c4c1b8
upd
erjiaqing Apr 23, 2019
6b1c842
fix
erjiaqing Apr 23, 2019
118df21
upd
erjiaqing Apr 24, 2019
6723e74
upd
erjiaqing Apr 24, 2019
25af84c
fix
erjiaqing Apr 24, 2019
2d047d5
Merge branch 'master' into cms_topn_core
erjiaqing Apr 24, 2019
4f45950
Merge branch 'master' into cms_topn_core
zz-jason Apr 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix
  • Loading branch information
erjiaqing committed Apr 17, 2019
commit af8e40cb3c227ac4f3fd15ad3369d863e3f651ed
22 changes: 16 additions & 6 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package statistics

import (
"bytes"
"fmt"
"math"
"sort"

Expand Down Expand Up @@ -97,7 +98,7 @@ func groupElements(data [][]byte) map[uint64][]cmsCount {

// BuildTopN builds table of top N elements.
// elements in data should not be modified after this call.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/ data/ `data`

func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uint64) {
func (c *CMSketch) BuildTopN(data [][]byte, top, topNThreshold uint32, total uint64) {
sampleSize := uint64(len(data))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move it to where it is needed.

counter := groupElements(data)
sorted := make([]uint64, 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
sorted := make([]uint64, 0)
sorted := make([]uint64, 0, len(counter))

This is better: it can save some `runtime.growslice` operations when there are a lot of elements.

Actually, you can count the distinct values in the function `groupElements` and use that value as the exact slice capacity.

Expand All @@ -121,9 +122,9 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin
sumTopN := uint64(0)

// Add a few elements, the number of which is close to the n-th most element.
for i := top; i < ndv && i < top*2; i++ {
for i := uint32(0); i < ndv && i < top*2; i++ {
// Here, 2/3 was obtained by running tests: we tested 1, 1/2, and 2/3, and 2/3 is relatively better than 1 and 1/2.
if sorted[i]*3 < NthValue*2 && newNthValue != sorted[i] {
if i >= top && sorted[i]*3 < NthValue*2 && newNthValue != sorted[i] {
break
}
// sumTopN might be smaller than sum of final sum of elements in topnindex.
Expand All @@ -134,13 +135,15 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin

estimateNDV, ratio, onlyOnceItems := calculateEstimateNDV(sorted, total)

enableTopN := (sampleSize/uint64(topnThreshold) <= sumTopN)
enableTopN := (sampleSize/uint64(topNThreshold) <= sumTopN)
topN := make([]cmsCount, 0)
sumTopN = 0

for _, vals := range counter {
for i := range vals {
if enableTopN && vals[i].count >= newNthValue {
topN = append(topN, cmsCount{data: vals[i].data, count: vals[i].count * ratio})
sumTopN += vals[i].count * ratio
} else {
c.InsertBytesN(vals[i].data, vals[i].count*ratio)
}
Expand All @@ -155,13 +158,20 @@ func (c *CMSketch) BuildTopN(data [][]byte, top, topnThreshold uint32, total uin
// These three tests check whether all divisions are legal.
// They also check whether we sampled all possible data.
countWithoutTopN := total - (sampleSize-uint64(onlyOnceItems))*ratio
if total < uint64(top)*ratio {

if total <= sumTopN {
c.defaultValue = 1
} else if estimateNDV <= uint64(top) {
c.defaultValue = 1
} else {
c.defaultValue = countWithoutTopN / (estimateNDV - uint64(ndv) + onlyOnceItems)
if estimateNDV+onlyOnceItems <= uint64(ndv) {
c.defaultValue = 1
} else {
c.defaultValue = countWithoutTopN / (estimateNDV - uint64(ndv) + onlyOnceItems)
}
}

fmt.Printf("estNDV=%d ndv=%d onlyOnce=%d sumTopN=%d top=%d defaultValue=%d enableTopN=%v\n", estimateNDV, ndv, onlyOnceItems, sumTopN, top, c.defaultValue, enableTopN)
}

func (c *CMSketch) buildTopNMap(topn []cmsCount) {
Expand Down