Skip to content

Commit 7155bd1

Browse files
speed up bloom construction (#13073)
1 parent 307b994 commit 7155bd1

File tree

3 files changed

+76
-21
lines changed

3 files changed

+76
-21
lines changed

ydb/core/tx/columnshard/columnshard__write.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ void TColumnShard::Handle(NPrivateEvents::NWrite::TEvWritePortionResult::TPtr& e
100100
std::vector<TInsertedPortions> writtenPacks = ev->Get()->DetachInsertedPacks();
101101
const TMonotonic now = TMonotonic::Now();
102102
for (auto&& i : writtenPacks) {
103-
AFL_WARN(NKikimrServices::TX_COLUMNSHARD_WRITE)("writing_size", i.GetDataSize())("event", "data_write_finished")(
103+
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_WRITE)("writing_size", i.GetDataSize())("event", "data_write_finished")(
104104
"writing_id", i.GetWriteMeta().GetId());
105105
Counters.OnWritePutBlobsSuccess(now - i.GetWriteMeta().GetWriteStartInstant(), i.GetRecordsCount());
106106
Counters.GetWritesMonitor()->OnFinishWrite(i.GetDataSize(), 1);

ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,50 @@
11
#pragma once
22
#include <ydb/core/tx/columnshard/engines/scheme/indexes/abstract/simple.h>
3+
4+
#include <util/generic/bitmap.h>
5+
36
namespace NKikimr::NOlap::NIndexes {
47

58
class TFixStringBitsStorage {
69
private:
710
YDB_READONLY_DEF(TString, Data);
811

12+
template <class T>
13+
class TSizeDetector {};
14+
15+
template <>
16+
class TSizeDetector<std::vector<bool>> {
17+
public:
18+
static ui32 GetSize(const std::vector<bool>& v) {
19+
return v.size();
20+
}
21+
};
22+
23+
template <>
24+
class TSizeDetector<TDynBitMap> {
25+
public:
26+
static ui32 GetSize(const TDynBitMap& v) {
27+
return v.Size();
28+
}
29+
};
30+
931
public:
1032
// Wraps an already-built bit string: `data` is stored in the Data member unchanged.
// The lines marked '-' and '+' below are the old and new formatting of the same
// initializer list; only brace placement differs between them.
TFixStringBitsStorage(const TString& data)
11-
: Data(data)
12-
{}
33+
: Data(data) {
34+
}
1335

1436
// Rounds a bit count up to the nearest multiple of 8, i.e. to a whole number of bytes
// expressed in bits (0 -> 0, 1 -> 8, 8 -> 8, 9 -> 16).
static ui32 GrowBitsCountToByte(const ui32 bitsCount) {
    const ui32 wholeBytes = bitsCount / 8;
    const bool hasPartialByte = (bitsCount % 8) != 0;
    const ui32 bytesNeeded = hasPartialByte ? wholeBytes + 1 : wholeBytes;
    return bytesNeeded * 8;
}
1840

19-
TFixStringBitsStorage(const std::vector<bool>& bitsVector)
20-
: TFixStringBitsStorage(bitsVector.size()) {
41+
template <class TBitsVector>
42+
TFixStringBitsStorage(const TBitsVector& bitsVector)
43+
: TFixStringBitsStorage(TSizeDetector<TBitsVector>::GetSize(bitsVector)) {
2144
ui32 byteIdx = 0;
2245
ui8 byteCurrent = 0;
2346
ui8 shiftCurrent = 0;
24-
for (ui32 i = 0; i < bitsVector.size(); ++i) {
47+
for (ui32 i = 0; i < TSizeDetector<TBitsVector>::GetSize(bitsVector); ++i) {
2548
if (i && i % 8 == 0) {
2649
Data[byteIdx] = (char)byteCurrent;
2750
byteCurrent = 0;
@@ -70,26 +93,27 @@ class TBloomFilterChecker: public TSimpleIndexChecker {
7093
// Type name of this checker. The same string is passed to TFactory::TRegistrator when the
// static Registrator member of this class is initialized.
static TString GetClassNameStatic() {
7194
return "BLOOM_FILTER";
7295
}
96+
7397
private:
7498
using TBase = TSimpleIndexChecker;
7599
std::set<ui64> HashValues;
76100
static inline auto Registrator = TFactory::TRegistrator<TBloomFilterChecker>(GetClassNameStatic());
101+
77102
protected:
78103
virtual bool DoDeserializeFromProtoImpl(const NKikimrSSA::TProgram::TOlapIndexChecker& proto) override;
79104
virtual void DoSerializeToProtoImpl(NKikimrSSA::TProgram::TOlapIndexChecker& proto) const override;
80105

81106
virtual bool DoCheckImpl(const std::vector<TString>& blobs) const override;
107+
82108
public:
83109
TBloomFilterChecker() = default;
84110
// Creates a checker for the index `indexId` (forwarded to the TSimpleIndexChecker base)
// and moves the supplied hash set into HashValues, leaving `hashes` moved-from.
// The '-' / '+' lines below differ only in brace placement.
TBloomFilterChecker(const ui32 indexId, std::set<ui64>&& hashes)
85111
: TBase(indexId)
86-
, HashValues(std::move(hashes))
87-
{
88-
112+
, HashValues(std::move(hashes)) {
89113
}
90114
// Virtual accessor for the checker type name; simply forwards to GetClassNameStatic().
virtual TString GetClassName() const override {
91115
return GetClassNameStatic();
92116
}
93117
};
94118

95-
} // namespace NKikimr::NOlap::NIndexes
119+
} // namespace NKikimr::NOlap::NIndexes

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h>
1313
#include <library/cpp/deprecated/atomic/atomic.h>
14+
#include <util/generic/bitmap.h>
1415

1516
namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
1617

@@ -179,31 +180,61 @@ class TNGrammBuilder {
179180

180181
// Hash sink for a bit filter of arbitrary non-zero size: every hash passed to operator()
// sets the bit at index (hash % Size). DoBuildIndexImpl hands an instance of this class to
// TNGrammBuilder::FillNGrammHashes when the requested filter size is not a power of two.
// Lines marked '-' are the pre-commit version (std::vector<bool> storage), lines marked '+'
// are the post-commit version (TDynBitMap storage).
class TVectorInserter {
181182
private:
182-
bool* Values;
183+
// Destination bitmap; held by reference, so it must outlive the inserter.
TDynBitMap& Values;
183184
// Divisor for the bit index; captured once in the constructor and narrowed to ui32.
const ui32 Size;
184185

185186
public:
186-
TVectorInserter(std::vector<bool>& values)
187-
: Values(&values[0])
188-
, Size(values.size()) {
187+
TVectorInserter(TDynBitMap& values)
188+
: Values(values)
189+
, Size(values.Size()) {
190+
// Size is used as a modulo divisor in operator(), so a zero-sized bitmap is rejected here.
AFL_VERIFY(values.Size());
189191
}
190192

191193
// Marks the bit selected by `hash`.
void operator()(const ui64 hash) {
192-
Values[hash % Size] = true;
194+
Values.Set(hash % Size);
195+
}
196+
};
197+
198+
class TVectorInserterPower2 {
199+
private:
200+
TDynBitMap& Values;
201+
const ui32 SizeMask;
202+
203+
public:
204+
TVectorInserterPower2(TDynBitMap& values)
205+
: Values(values)
206+
, SizeMask(values.Size() - 1) {
207+
AFL_VERIFY(values.Size());
208+
}
209+
210+
void operator()(const ui64 hash) {
211+
Values.Set(hash & SizeMask);
193212
}
194213
};
195214

196215
// Builds the serialized bloom filter for a single column.
//
// For every chunk exposed by `reader`, TNGrammBuilder::FillNGrammHashes feeds hashes to an
// inserter that sets one bit per hash in a TDynBitMap; the bitmap is then packed into a string
// by TFixStringBitsStorage and returned. `reader` must expose exactly one column (verified).
// The records-count argument is unused.
TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const {
    AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount());
    TNGrammBuilder builder(HashesCount);

    TDynBitMap bitMap;
    bitMap.Reserve(FilterSizeBytes * 8);

    // Both inserters and TFixStringBitsStorage take the filter size from bitMap.Size(), not from
    // the bit count requested above, so the fast-path decision is made on that same value. When it
    // is a power of two, (hash & (size - 1)) == (hash % size), so either inserter sets the same bits.
    // NOTE(review): Size() appears to report capacity, which may be rounded up past the requested
    // bit count; if so the returned blob can be longer than FilterSizeBytes - confirm against
    // util/generic/bitmap.h and the filter checker.
    const auto bitsCount = bitMap.Size();
    const bool isPowerOf2 = (bitsCount & (bitsCount - 1)) == 0;

    // Generic over the inserter type so that each inserter's operator() can be inlined.
    const auto doFillFilter = [&](auto& inserter) {
        for (reader.Start(); reader.IsCorrect();) {
            builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), inserter);
            reader.ReadNext(reader.begin()->GetCurrentChunk()->length());
        }
    };

    if (isPowerOf2) {
        TVectorInserterPower2 inserter(bitMap);
        doFillFilter(inserter);
    } else {
        TVectorInserter inserter(bitMap);
        doFillFilter(inserter);
    }
    return TFixStringBitsStorage(bitMap).GetData();
}
208239

209240
void TIndexMeta::DoFillIndexCheckers(

0 commit comments

Comments
 (0)