Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: support building CMSketch with Top N #10163

Merged
merged 41 commits into from
Apr 24, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9254400
Split core into a single pr
erjiaqing Apr 16, 2019
2433427
remove copy and equal from pr
erjiaqing Apr 16, 2019
77dc45b
move some code into separate functions
erjiaqing Apr 17, 2019
cfc8a93
update
erjiaqing Apr 17, 2019
7008f89
rename some variables
erjiaqing Apr 17, 2019
2055420
upd
erjiaqing Apr 17, 2019
af8e40c
fix
erjiaqing Apr 17, 2019
bfd2128
fix
erjiaqing Apr 17, 2019
8e63f5e
Merge branch 'master' into cms_topn_core
erjiaqing Apr 17, 2019
b244ab1
merge
erjiaqing Apr 17, 2019
40ac221
upd
erjiaqing Apr 17, 2019
d947e96
fix
erjiaqing Apr 17, 2019
6148210
fix
erjiaqing Apr 17, 2019
ba40c91
upd
erjiaqing Apr 18, 2019
c07b72c
some rename
erjiaqing Apr 18, 2019
42d32af
fix
erjiaqing Apr 18, 2019
afa8132
Merge branch 'master' into cms_topn_core
erjiaqing Apr 19, 2019
c1bb5bb
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
c78335d
update
erjiaqing Apr 22, 2019
d9addef
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
e38524f
small fix
erjiaqing Apr 22, 2019
49a0022
back to handcraft map
erjiaqing Apr 22, 2019
410d430
Merge branch 'master' into cms_topn_core
erjiaqing Apr 22, 2019
88d771d
split some functions
erjiaqing Apr 22, 2019
f614adc
upd
erjiaqing Apr 22, 2019
2da51b5
Merge branch 'master' into cms_topn_core
erjiaqing Apr 23, 2019
e47f352
upd
erjiaqing Apr 23, 2019
3189cd0
apply XuHuaiyu's change
erjiaqing Apr 23, 2019
1ba6c17
upd
erjiaqing Apr 23, 2019
f18e797
apply XuHuaiyu's change
erjiaqing Apr 23, 2019
92b8da0
fix
erjiaqing Apr 23, 2019
1537576
upd
erjiaqing Apr 23, 2019
8fdcdc9
upd
erjiaqing Apr 23, 2019
5252a5c
fix
erjiaqing Apr 23, 2019
4c4c1b8
upd
erjiaqing Apr 23, 2019
6b1c842
fix
erjiaqing Apr 23, 2019
118df21
upd
erjiaqing Apr 24, 2019
6723e74
upd
erjiaqing Apr 24, 2019
25af84c
fix
erjiaqing Apr 24, 2019
2d047d5
Merge branch 'master' into cms_topn_core
erjiaqing Apr 24, 2019
4f45950
Merge branch 'master' into cms_topn_core
zz-jason Apr 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 32 additions & 39 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ type CMSketch struct {
count uint64 // TopN is not counted in count
defaultValue uint64 // In sampled data, if cmsketch returns a small value (less than avg value / 2), then this will be returned.
table [][]uint32
topN map[uint64][]dataCount
topN map[uint64][]topNMeta
}

// dataCount is a simple counter used by BuildTopN
type dataCount struct {
// topNMeta is a simple counter used by BuildTopN
type topNMeta struct {
h1 uint64
h2 uint64
data []byte
Expand Down Expand Up @@ -115,45 +115,39 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper {
func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount uint64) *CMSketch {
helper := newTopNHelper(sample, numTop)
estimateNDV, ratio := calculateEstimateNDV(helper, rowCount)
c := helper.buildCMSWithTopN(d, w, ratio)
c := buildCMSWithTopN(helper, d, w, ratio)
c.calculateDefaultVal(helper, estimateNDV, ratio, rowCount)
return c
}

// buildCMSWithTopN builds the Top-N and the cmsketch from the sampled data counted in helper.
func (helper *topNHelper) buildCMSWithTopN(d, w int32, ratio uint64) (c *CMSketch) {
c = NewCMSketch(d, w)
func buildCMSWithTopN(helper *topNHelper, d, w int32, ratio uint64) (c *CMSketch) {
c, helper.sumTopN, helper.numTop = NewCMSketch(d, w), 0, 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we set helper.sumTopN and helper.numTop to 0 at this line, enableTopN would always be false? After we record these 2 values in newTopNHelper, we never use them and then set them to 0? This looks unreasonable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they should be put after enableTopN

enableTopN := helper.sampleSize/topNThreshold <= helper.sumTopN
topN := make([]dataCount, 0, helper.numTop)
helper.sumTopN = 0
if enableTopN {
c.topN = make(map[uint64][]topNMeta)
}
for counterKey, cnt := range helper.counter {
scaledCount := cnt * ratio
if enableTopN && cnt >= helper.lastVal {
topN = append(topN, dataCount{data: hack.Slice(string(counterKey)), count: cnt * ratio})
helper.sumTopN += cnt * ratio
c.insertToTopN(hack.Slice(string(counterKey)), scaledCount)
helper.sumTopN += scaledCount
helper.numTop++
} else {
c.updateBytesWithDelta(hack.Slice(string(counterKey)), cnt*ratio)
c.updateBytesWithDelta(hack.Slice(string(counterKey)), scaledCount)
}
}
if !enableTopN {
return
}
helper.numTop = uint32(len(topN))
c.topN = make(map[uint64][]dataCount)
for i := range topN {
if topN[i].data == nil {
continue
}
h1, h2 := murmur3.Sum128(topN[i].data)
vals, ok := c.topN[h1]
if !ok {
vals = make([]dataCount, 0)
}
vals = append(vals, dataCount{h1, h2, topN[i].data, topN[i].count})
c.topN[h1] = vals
}
return
}

// insertToTopN appends a new entry holding data and count to c.topN, in the
// bucket keyed by the first of the two hash values that murmur3.Sum128
// returns for data.
// It never looks for an existing entry, so the caller must guarantee that data
// has not been added to c.topN before. It should only be used while the Top-N
// index is being built.
func (c *CMSketch) insertToTopN(data []byte, count uint64) {
	bucketHash, otherHash := murmur3.Sum128(data)
	// Indexing a missing key yields a nil slice, which append grows as needed.
	c.topN[bucketHash] = append(c.topN[bucketHash], topNMeta{bucketHash, otherHash, data, count})
}

func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, ratio, rowCount uint64) {
sampleNDV := uint64(len(helper.sorted))
if rowCount <= helper.sumTopN {
Expand All @@ -168,9 +162,9 @@ func (c *CMSketch) calculateDefaultVal(helper *topNHelper, estimateNDV, ratio, r
}
}

// queryAddTopN TopN adds count to CMSketch.topN if exists, and returns the count of such elements after insert
// if such elements does not in topn elements, nothing will happen and false will be returned.
func (c *CMSketch) updateTopNWithDelta(h1, h2, delta uint64, d []byte) bool {
// updateTopNWithDelta adds delta to the count of the element in CMSketch.topN if it exists there, and returns true.
// If the element is not among the Top-N elements (or c.topN is nil), nothing happens and false is returned.
func (c *CMSketch) updateTopNWithDelta(h1, h2 uint64, d []byte, delta uint64) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comments are obsolete?

if c.topN == nil {
return false
}
Expand Down Expand Up @@ -200,16 +194,16 @@ func (c *CMSketch) InsertBytes(bytes []byte) {
c.updateBytesWithDelta(bytes, 1)
}

// insertBytesN adds the bytes value into the CM Sketch by n.
func (c *CMSketch) updateBytesWithDelta(bytes []byte, n uint64) {
// updateBytesWithDelta adds the bytes value into the CM Sketch by delta.
func (c *CMSketch) updateBytesWithDelta(bytes []byte, delta uint64) {
h1, h2 := murmur3.Sum128(bytes)
if c.updateTopNWithDelta(h1, h2, n, bytes) {
if c.updateTopNWithDelta(h1, h2, bytes, delta) {
return
}
c.count += n
c.count += delta
for i := range c.table {
j := (h1 + h2*uint64(i)) % uint64(c.width)
c.table[i][j] += uint32(n)
c.table[i][j] += uint32(delta)
}
}

Expand All @@ -220,7 +214,6 @@ func (c *CMSketch) considerDefVal(cnt uint64) bool {
// setValue sets the count for value that hashed into (h1, h2).
func (c *CMSketch) setValue(h1, h2 uint64, count uint32) {
oriCount := c.queryHashValue(h1, h2)

if c.considerDefVal(oriCount) {
// In this case, we should also update c.defaultValue.
// Setting the default value directly would result in more error; instead, update it by 5%.
Expand Down Expand Up @@ -293,7 +286,7 @@ func (c *CMSketch) MergeCMSketch(rc *CMSketch) error {
return errors.New("Dimensions of Count-Min Sketch should be the same")
}
if c.topN != nil || rc.topN != nil {
return errors.New("CMSketch with Top-N does not supports merge")
return errors.New("CMSketch with Top-N does not support merge")
}
c.count += rc.count
for i := range c.table {
Expand Down
54 changes: 26 additions & 28 deletions statistics/estimate.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,45 @@

package statistics

import "math"
import (
"math"

"github.com/cznic/mathutil"
)

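// calculateEstimateNDV calculates the estimated NDV of the full data (rowCount rows) from the sampled data summarized in h.
// h.sampleSize is the size of the sample and len(h.sorted) is used as the NDV of the sample.
// h.onlyOnceItems is the number of elements that occurred only once in the sample.
// Besides the estimated ndv, it returns ratio, the factor rowCount/sampleSize (at least 1) used to scale sampled counts.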
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These comments are obsolete?

func calculateEstimateNDV(h *topNHelper, rowCount uint64) (ndv uint64, ratio uint64) {
sampleSize, sampleNDV, onlyOnceItems := h.sampleSize, uint64(len(h.sorted)), h.onlyOnceItems
ratio = rowCount / sampleSize
if rowCount < sampleSize {
ratio = 1
ndv = sampleNDV
}
ratio = mathutil.MaxUint64(1, rowCount/sampleSize)

if onlyOnceItems == sampleSize {
// Assume this is a unique column
ratio = 1
ndv = rowCount
return rowCount, 1
} else if onlyOnceItems == 0 {
// Assume data only consists of sampled data
// Nothing to do, no change with ratio
return sampleNDV, ratio
}
// Charikar, Moses, et al. "Towards estimation error guarantees for distinct values."
// Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2000.
// This is GEE in that paper.
// estimateNDV = sqrt(N/n) f_1 + sum_2..inf f_i
// f_i = number of elements occurred i times in sample

f1 := float64(onlyOnceItems)
n := float64(sampleSize)
N := float64(rowCount)
d := float64(sampleNDV)

ndv = uint64(math.Sqrt(N/n)*f1 + d - f1 + 0.5)

if ndv < sampleNDV {
ndv = sampleNDV
} else {
// Charikar, Moses, et al. "Towards estimation error guarantees for distinct values."
// Proceedings of the nineteenth ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2000.
// This is GEE in that paper.
// estimateNDV = sqrt(N/n) f_1 + sum_2..inf f_i
// f_i = number of elements occurred i times in sample

f1 := float64(onlyOnceItems)
n := float64(sampleSize)
N := float64(rowCount)
d := float64(sampleNDV)

ndv = uint64(math.Sqrt(N/n)*f1 + d - f1 + 0.5)

if ndv < sampleNDV {
ndv = sampleNDV
}
if ndv > rowCount {
ndv = rowCount
}
}
if ndv > rowCount {
ndv = rowCount
}
return ndv, ratio
}