Skip to content

Commit fce2833

Browse files
HeidiHan0000 authored and facebook-github-bot committed
refactor: Move HllAccumulator to HllUtils (#15573)
Summary: Move HllAccumulator, which was part of HyperloglogAggregates, to HllUtils so that it can be reused in Khyperloglog (KHLL). HllAccumulator switches between the sparse and dense HLL representations and provides operations such as merge, insertHash, and cardinality that handle both representations; the KHLL implementation needs the same functionality.
Reviewed By: mbasmanova
Differential Revision: D87486444
1 parent 13ac97f commit fce2833

File tree

3 files changed

+217
-174
lines changed

3 files changed

+217
-174
lines changed
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#define XXH_INLINE_ALL

#include <xxhash.h>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <type_traits>

#include "velox/common/base/Exceptions.h"
#include "velox/common/hyperloglog/DenseHll.h"
#include "velox/common/hyperloglog/Murmur3Hash128.h"
#include "velox/common/hyperloglog/SparseHll.h"
#include "velox/common/memory/HashStringAllocator.h"
28+
29+
namespace facebook::velox::common::hll {
30+
31+
namespace detail {
32+
template <typename T, bool HllAsFinalResult>
33+
inline uint64_t hashOne(const T& value) {
34+
if constexpr (HllAsFinalResult) {
35+
if constexpr (std::is_same_v<T, int64_t>) {
36+
return common::hll::Murmur3Hash128::hash64ForLong(value, 0);
37+
} else if constexpr (std::is_same_v<T, double>) {
38+
return common::hll::Murmur3Hash128::hash64ForLong(
39+
*reinterpret_cast<const int64_t*>(&value), 0);
40+
}
41+
return common::hll::Murmur3Hash128::hash64(&value, sizeof(T), 0);
42+
} else {
43+
return XXH64(&value, sizeof(T), 0);
44+
}
45+
}
46+
47+
// Use timestamp.toMillis() to compute hash value.
48+
template <>
49+
inline uint64_t hashOne<Timestamp, false>(const Timestamp& value) {
50+
return hashOne<int64_t, false>(value.toMillis());
51+
}
52+
53+
template <>
54+
inline uint64_t hashOne<Timestamp, true>(const Timestamp& /*value*/) {
55+
VELOX_UNREACHABLE("approx_set(timestamp) is not supported.");
56+
}
57+
58+
template <>
59+
inline uint64_t hashOne<StringView, false>(const StringView& value) {
60+
return XXH64(value.data(), value.size(), 0);
61+
}
62+
63+
template <>
64+
inline uint64_t hashOne<StringView, true>(const StringView& value) {
65+
return common::hll::Murmur3Hash128::hash64(value.data(), value.size(), 0);
66+
}
67+
68+
} // namespace detail
69+
70+
template <typename T, bool HllAsFinalResult>
71+
struct HllAccumulator {
72+
explicit HllAccumulator(HashStringAllocator* allocator)
73+
: sparseHll_{allocator}, denseHll_{allocator} {}
74+
75+
void setIndexBitLength(int8_t indexBitLength) {
76+
indexBitLength_ = indexBitLength;
77+
sparseHll_.setSoftMemoryLimit(
78+
common::hll::DenseHlls::estimateInMemorySize(indexBitLength_));
79+
}
80+
81+
void append(T value) {
82+
const auto hash = detail::hashOne<T, HllAsFinalResult>(value);
83+
84+
if (isSparse_) {
85+
if (sparseHll_.insertHash(hash)) {
86+
toDense();
87+
}
88+
} else {
89+
denseHll_.insertHash(hash);
90+
}
91+
}
92+
93+
int64_t cardinality() const {
94+
return isSparse_ ? sparseHll_.cardinality() : denseHll_.cardinality();
95+
}
96+
97+
void mergeWith(StringView serialized, HashStringAllocator* allocator) {
98+
auto input = serialized.data();
99+
if (common::hll::SparseHlls::canDeserialize(input)) {
100+
if (isSparse_) {
101+
sparseHll_.mergeWith(input);
102+
if (indexBitLength_ < 0) {
103+
setIndexBitLength(
104+
common::hll::DenseHlls::deserializeIndexBitLength(input));
105+
}
106+
if (sparseHll_.overLimit()) {
107+
toDense();
108+
}
109+
} else {
110+
common::hll::SparseHll<> other{input, allocator};
111+
other.toDense(denseHll_);
112+
}
113+
} else if (common::hll::DenseHlls::canDeserialize(input)) {
114+
if (isSparse_) {
115+
if (indexBitLength_ < 0) {
116+
setIndexBitLength(
117+
common::hll::DenseHlls::deserializeIndexBitLength(input));
118+
}
119+
toDense();
120+
}
121+
denseHll_.mergeWith(input);
122+
} else {
123+
VELOX_USER_FAIL("Unexpected type of HLL");
124+
}
125+
}
126+
127+
int32_t serializedSize() {
128+
return isSparse_ ? sparseHll_.serializedSize() : denseHll_.serializedSize();
129+
}
130+
131+
void serialize(char* outputBuffer) {
132+
return isSparse_ ? sparseHll_.serialize(indexBitLength_, outputBuffer)
133+
: denseHll_.serialize(outputBuffer);
134+
}
135+
136+
private:
137+
void toDense() {
138+
isSparse_ = false;
139+
denseHll_.initialize(indexBitLength_);
140+
sparseHll_.toDense(denseHll_);
141+
sparseHll_.reset();
142+
}
143+
144+
bool isSparse_{true};
145+
int8_t indexBitLength_{-1};
146+
common::hll::SparseHll<> sparseHll_;
147+
common::hll::DenseHll<> denseHll_;
148+
};
149+
150+
template <>
151+
struct HllAccumulator<bool, false> {
152+
explicit HllAccumulator(HashStringAllocator* /*allocator*/) {}
153+
154+
void append(bool value) {
155+
approxDistinctState_ |= (1 << value);
156+
}
157+
158+
int64_t cardinality() const {
159+
return (approxDistinctState_ & 1) + ((approxDistinctState_ & 2) >> 1);
160+
}
161+
162+
void mergeWith(
163+
StringView /*serialized*/,
164+
HashStringAllocator* /*allocator*/) {
165+
VELOX_UNREACHABLE(
166+
"APPROX_DISTINCT<BOOLEAN> unsupported mergeWith(StringView, HashStringAllocator*)");
167+
}
168+
169+
void mergeWith(int8_t data) {
170+
approxDistinctState_ |= data;
171+
}
172+
173+
int32_t serializedSize() const {
174+
return sizeof(int8_t);
175+
}
176+
177+
void serialize(char* /*outputBuffer*/) {
178+
VELOX_UNREACHABLE("APPROX_DISTINCT<BOOLEAN> unsupported serialize(char*)");
179+
}
180+
181+
void setIndexBitLength(int8_t /*indexBitLength*/) {}
182+
183+
int8_t getState() const {
184+
return approxDistinctState_;
185+
}
186+
187+
private:
188+
int8_t approxDistinctState_{0};
189+
};
190+
191+
} // namespace facebook::velox::common::hll

0 commit comments

Comments
 (0)