Skip to content

Commit 13813aa

Browse files
Merge d893bd2 into 97167eb
2 parents 97167eb + d893bd2 commit 13813aa

File tree

3 files changed

+104
-59
lines changed

3 files changed

+104
-59
lines changed

ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,28 @@ class TFixStringBitsStorage {
1111
: Data(data)
1212
{}
1313

14+
TFixStringBitsStorage(const std::vector<bool>& bitsVector)
15+
: TFixStringBitsStorage(bitsVector.size()) {
16+
ui32 byteIdx = 0;
17+
ui8 byteCurrent = 0;
18+
ui8 shiftCurrent = 0;
19+
for (ui32 i = 0; i < bitsVector.size(); ++i) {
20+
if (i && i % 8 == 0) {
21+
Data[byteIdx] = (char)byteCurrent;
22+
byteCurrent = 0;
23+
shiftCurrent = 1;
24+
++byteIdx;
25+
}
26+
if (bitsVector[i]) {
27+
byteCurrent += shiftCurrent;
28+
}
29+
shiftCurrent = (shiftCurrent << 1);
30+
}
31+
if (byteCurrent) {
32+
Data[byteIdx] = (char)byteCurrent;
33+
}
34+
}
35+
1436
// Size of the storage in bits: eight per byte held in Data.
ui32 GetSizeBits() const {
    const auto bytesCount = Data.size();
    return bytesCount * 8;
}

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp

Lines changed: 82 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,70 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
1515

1616
class TNGrammBuilder {
1717
private:
18-
NArrow::NHash::NXX64::TStreamStringHashCalcer HashCalcer;
1918
TBuffer Zeros;
19+
const ui32 HashesCount;
20+
21+
static const ui64 HashesConstructorP = 9223372036854775783;
22+
static const ui64 HashesConstructorA = 1;
23+
24+
template <int HashIdx>
25+
class THashesBuilder {
26+
public:
27+
template <class TActor>
28+
static void Build(const ui64 originalHash, const TActor& actor) {
29+
actor((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP);
30+
}
31+
};
32+
33+
template <>
34+
class THashesBuilder<0> {
35+
public:
36+
template <class TActor>
37+
static void Build(const ui64 /*originalHash*/, const TActor& /*actor*/) {
38+
}
39+
};
40+
41+
template <class TActor>
42+
void BuildHashesSet(const ui64 originalHash, const TActor& actor) const {
43+
if (HashesCount == 1) {
44+
THashesBuilder<1>::Build(originalHash, actor);
45+
} else if (HashesCount == 2) {
46+
THashesBuilder<2>::Build(originalHash, actor);
47+
} else if (HashesCount == 3) {
48+
THashesBuilder<3>::Build(originalHash, actor);
49+
} else if (HashesCount == 4) {
50+
THashesBuilder<4>::Build(originalHash, actor);
51+
} else if (HashesCount == 5) {
52+
THashesBuilder<5>::Build(originalHash, actor);
53+
} else if (HashesCount == 6) {
54+
THashesBuilder<6>::Build(originalHash, actor);
55+
} else if (HashesCount == 7) {
56+
THashesBuilder<7>::Build(originalHash, actor);
57+
} else if (HashesCount == 8) {
58+
THashesBuilder<8>::Build(originalHash, actor);
59+
} else {
60+
for (ui32 b = 1; b <= HashesCount; ++b) {
61+
const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
62+
actor(hash);
63+
}
64+
}
65+
}
66+
67+
// Hashes `size` bytes starting at `data`.
//  - size == 3 or size == 4: the bytes are packed into the low bits of the result,
//    data[0] being the least significant byte;
//  - any other size: FNV-1a style xor/multiply fold with the 32-bit FNV offset basis and prime
//    applied to a 64-bit accumulator.
// Every byte is widened through ui8 first, so values >= 0x80 are not sign-extended on
// platforms where `char` is signed and the result does not depend on char signedness.
ui64 CalcHash(const char* data, const ui32 size) const {
    const auto byteAt = [data](const ui32 idx) {
        return static_cast<ui64>(static_cast<ui8>(data[idx]));
    };
    if (size == 3) {
        return byteAt(0) | (byteAt(1) << 8) | (byteAt(2) << 16);
    } else if (size == 4) {
        // Assembled byte by byte rather than read through a casted ui32 pointer:
        // no alignment requirement on `data` and the same value on any byte order.
        return byteAt(0) | (byteAt(1) << 8) | (byteAt(2) << 16) | (byteAt(3) << 24);
    } else {
        ui64 h = 2166136261;
        for (ui32 i = 0; i < size; ++i) {
            h ^= byteAt(i);
            h *= 16777619;
        }
        return h;
    }
}
81+
2082
template <class TAction>
2183
void BuildNGramms(const char* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize,
2284
const TAction& pred) const {
@@ -28,24 +90,26 @@ class TNGrammBuilder {
2890
if (fakeStart.size() < nGrammSize) {
2991
fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size());
3092
}
31-
pred(fakeStart.data());
93+
BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred);
3294
}
3395
}
3496
for (ui32 c = 0; c < dataSize; ++c) {
3597
if (c + nGrammSize <= dataSize) {
36-
pred(data + c);
98+
pred(CalcHash(data + c, nGrammSize));
3799
} else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
38100
TBuffer fakeStart;
39101
fakeStart.Append(data + c, dataSize - c);
40102
fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size());
41-
pred(fakeStart.data());
103+
BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred);
42104
}
43105
}
44106
}
45107

46108
public:
47-
TNGrammBuilder()
48-
: HashCalcer(0) {
109+
TNGrammBuilder(const ui32 hashesCount)
110+
: HashesCount(hashesCount)
111+
{
112+
AFL_VERIFY((ui64)HashesCount < HashesConstructorP);
49113
Zeros.Fill('\0', 1024);
50114
}
51115

@@ -64,15 +128,7 @@ class TNGrammBuilder {
64128
}
65129
if constexpr (arrow::has_string_view<T>()) {
66130
auto value = typedArray.GetView(row);
67-
if (value.size() < nGrammSize) {
68-
continue;
69-
}
70-
const auto pred = [&](const char* data) {
71-
HashCalcer.Start();
72-
HashCalcer.Update((const ui8*)data, nGrammSize);
73-
fillData(HashCalcer.Finish());
74-
};
75-
BuildNGramms(value.data(), value.size(), {}, nGrammSize, pred);
131+
BuildNGramms(value.data(), value.size(), {}, nGrammSize, fillData);
76132
} else {
77133
AFL_VERIFY(false);
78134
}
@@ -83,33 +139,23 @@ class TNGrammBuilder {
83139

84140
template <class TFiller>
85141
void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) {
86-
const auto pred = [&](const char* value) {
87-
HashCalcer.Start();
88-
HashCalcer.Update((const ui8*)value, nGrammSize);
89-
fillData(HashCalcer.Finish());
90-
};
91-
BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, pred);
142+
BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, fillData);
92143
}
93144
};
94145

95146
TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const {
96147
AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount());
97-
TNGrammBuilder builder;
148+
TNGrammBuilder builder(HashesCount);
98149

99-
TFixStringBitsStorage bits(FilterSizeBytes * 8);
100-
101-
const auto pred = [&](const ui64 hash) {
102-
const auto predSet = [&](const ui64 hashSecondary) {
103-
bits.Set(true, hashSecondary % bits.GetSizeBits());
104-
};
105-
BuildHashesSet(hash, predSet);
150+
std::vector<bool> bitsVector(FilterSizeBytes * 8, false);
151+
const auto predSet = [&](const ui64 hashSecondary) {
152+
bitsVector[hashSecondary % (FilterSizeBytes * 8)] = true;
106153
};
107154
for (reader.Start(); reader.IsCorrect();) {
108-
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), pred);
155+
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet);
109156
reader.ReadNext(reader.begin()->GetCurrentChunk()->length());
110157
}
111-
112-
return bits.GetData();
158+
return TFixStringBitsStorage(bitsVector).GetData();
113159
}
114160

115161
void TIndexMeta::DoFillIndexCheckers(
@@ -133,16 +179,13 @@ void TIndexMeta::DoFillIndexCheckers(
133179
}
134180

135181
std::set<ui64> hashes;
136-
const auto pred = [&](const ui64 hash) {
137-
const auto predSet = [&](const ui64 hashSecondary) {
138-
hashes.emplace(hashSecondary);
139-
};
140-
BuildHashesSet(hash, predSet);
182+
const auto predSet = [&](const ui64 hashSecondary) {
183+
hashes.emplace(hashSecondary);
141184
};
142-
TNGrammBuilder builder;
185+
TNGrammBuilder builder(HashesCount);
143186
for (auto&& c : foundColumns) {
144187
for (auto&& ls : c.second.GetLikeSequences()) {
145-
builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), pred);
188+
builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), predSet);
146189
}
147190
}
148191
branch->MutableIndexes().emplace_back(std::make_shared<TFilterChecker>(GetIndexId(), std::move(hashes)));

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,6 @@ class TIndexMeta: public TIndexByColumns {
2323
AFL_VERIFY(NGrammSize > 2);
2424
}
2525

26-
static const ui64 HashesConstructorP = ((ui64)2 << 31) - 1;
27-
static const ui64 HashesConstructorA = (ui64)2 << 16;
28-
29-
template <class TActor>
30-
void BuildHashesSet(const ui64 originalHash, const TActor& actor) const {
31-
AFL_VERIFY(HashesCount < HashesConstructorP);
32-
for (ui32 b = 1; b <= HashesCount; ++b) {
33-
const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
34-
actor(hash);
35-
}
36-
}
37-
38-
template <class TContainer, class TActor>
39-
void BuildHashesSet(const TContainer& originalHashes, const TActor& actor) const {
40-
AFL_VERIFY(HashesCount < HashesConstructorP);
41-
for (auto&& hOriginal : originalHashes) {
42-
BuildHashesSet(hOriginal, actor);
43-
}
44-
}
45-
4626
protected:
4727
virtual TConclusionStatus DoCheckModificationCompatibility(const IIndexMeta& /*newMeta*/) const override {
4828
return TConclusionStatus::Fail("not supported");

0 commit comments

Comments
 (0)