Skip to content

Commit d6cccb1

Browse files
Merge fafb00d into 2edc9b5
2 parents 2edc9b5 + fafb00d commit d6cccb1

File tree

4 files changed

+132
-76
lines changed

4 files changed

+132
-76
lines changed

ydb/core/kqp/ut/olap/indexes_ut.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
482482
}
483483
{
484484
ResetZeroLevel(csController);
485-
ui32 requestsCount = 100;
485+
ui32 requestsCount = 500;
486486
for (ui32 i = 0; i < requestsCount; ++i) {
487487
const ui32 idx = RandomNumber<ui32>(uids.size());
488488
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -494,12 +494,12 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
494494
};
495495
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
496496
}
497-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
497+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
498498
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
499499
}
500500
{
501501
ResetZeroLevel(csController);
502-
ui32 requestsCount = 100;
502+
ui32 requestsCount = 500;
503503
for (ui32 i = 0; i < requestsCount; ++i) {
504504
const ui32 idx = RandomNumber<ui32>(uids.size());
505505
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -511,13 +511,13 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
511511
};
512512
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
513513
}
514-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
514+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)(
515515
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
516516
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
517517
}
518518
{
519519
ResetZeroLevel(csController);
520-
ui32 requestsCount = 100;
520+
ui32 requestsCount = 500;
521521
for (ui32 i = 0; i < requestsCount; ++i) {
522522
const ui32 idx = RandomNumber<ui32>(uids.size());
523523
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -529,7 +529,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
529529
};
530530
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
531531
}
532-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
532+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)(
533533
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
534534
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
535535
}

ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,28 @@ class TFixStringBitsStorage {
1111
: Data(data)
1212
{}
1313

14+
TFixStringBitsStorage(const std::vector<bool>& bitsVector)
15+
: TFixStringBitsStorage(bitsVector.size()) {
16+
ui32 byteIdx = 0;
17+
ui8 byteCurrent = 0;
18+
ui8 shiftCurrent = 0;
19+
for (ui32 i = 0; i < bitsVector.size(); ++i) {
20+
if (i && i % 8 == 0) {
21+
Data[byteIdx] = (char)byteCurrent;
22+
byteCurrent = 0;
23+
shiftCurrent = 1;
24+
++byteIdx;
25+
}
26+
if (bitsVector[i]) {
27+
byteCurrent += shiftCurrent;
28+
}
29+
shiftCurrent = (shiftCurrent << 1);
30+
}
31+
if (byteCurrent) {
32+
Data[byteIdx] = (char)byteCurrent;
33+
}
34+
}
35+
1436
ui32 GetSizeBits() const {
1537
return Data.size() * 8;
1638
}

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp

Lines changed: 104 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,37 +15,111 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
1515

1616
class TNGrammBuilder {
1717
private:
18-
NArrow::NHash::NXX64::TStreamStringHashCalcer HashCalcer;
1918
TBuffer Zeros;
19+
const ui32 HashesCount;
20+
21+
static const ui64 HashesConstructorP = 9223372036854775783;
22+
static const ui64 HashesConstructorA = 1;
23+
24+
template <int HashIdx>
25+
class THashesBuilder {
26+
public:
27+
template <class TActor>
28+
static void Build(const ui64 originalHash, const TActor& actor) {
29+
actor((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP);
30+
}
31+
};
32+
33+
template <>
34+
class THashesBuilder<0> {
35+
public:
36+
template <class TActor>
37+
static void Build(const ui64 /*originalHash*/, const TActor& /*actor*/) {
38+
}
39+
};
40+
41+
template <class TActor>
42+
void BuildHashesSet(const ui64 originalHash, const TActor& actor) const {
43+
if (HashesCount == 1) {
44+
THashesBuilder<1>::Build(originalHash, actor);
45+
} else if (HashesCount == 2) {
46+
THashesBuilder<2>::Build(originalHash, actor);
47+
} else if (HashesCount == 3) {
48+
THashesBuilder<3>::Build(originalHash, actor);
49+
} else if (HashesCount == 4) {
50+
THashesBuilder<4>::Build(originalHash, actor);
51+
} else if (HashesCount == 5) {
52+
THashesBuilder<5>::Build(originalHash, actor);
53+
} else if (HashesCount == 6) {
54+
THashesBuilder<6>::Build(originalHash, actor);
55+
} else if (HashesCount == 7) {
56+
THashesBuilder<7>::Build(originalHash, actor);
57+
} else if (HashesCount == 8) {
58+
THashesBuilder<8>::Build(originalHash, actor);
59+
} else {
60+
for (ui32 b = 1; b <= HashesCount; ++b) {
61+
const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
62+
actor(hash);
63+
}
64+
}
65+
}
66+
67+
ui64 CalcHash(const char* data, const ui32 size) const {
68+
if (size == 3) {
69+
return (*(const ui32*)data) & 0x00FFFFFF;
70+
// TStringBuilder sb;
71+
// sb << res << "/" << (ui32)((ui8*)&res)[0] << "/" << (ui32)((ui8*)&res)[1] << "/" << (ui32)((ui8*)&res)[2] << "/"
72+
// << (ui32)((ui8*)&res)[3] << " vs " << (ui64)data[0] << "/" << (((ui64)data[1])) << "/" << (((ui64)data[2])) << Endl;
73+
// Cerr << sb;
74+
// return (ui64(*(const ui32*)data)) >> 8;
75+
} else if (size == 4) {
76+
return *(const ui32*)data;
77+
} else {
78+
uint64_t h = 2166136261;
79+
for (size_t i = 0; i < size; i++) {
80+
h = h ^ uint64_t(data[i]);
81+
h = h * 16777619;
82+
}
83+
return h;
84+
}
85+
}
86+
2087
template <class TAction>
2188
void BuildNGramms(const char* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize,
2289
const TAction& pred) const {
90+
TBuffer fakeString;
91+
AFL_VERIFY(nGrammSize >= 3)("value", nGrammSize);
2392
if (!op || op == NRequest::TLikePart::EOperation::StartsWith) {
2493
for (ui32 c = 1; c <= nGrammSize; ++c) {
25-
TBuffer fakeStart;
26-
fakeStart.Fill('\0', nGrammSize - c);
27-
fakeStart.Append(data, std::min(c, dataSize));
28-
if (fakeStart.size() < nGrammSize) {
29-
fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size());
94+
fakeString.Clear();
95+
fakeString.Fill('\0', nGrammSize - c);
96+
fakeString.Append(data, std::min(c, dataSize));
97+
if (fakeString.size() < nGrammSize) {
98+
fakeString.Fill('\0', nGrammSize - fakeString.size());
3099
}
31-
pred(fakeStart.data());
100+
BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred);
32101
}
33102
}
34-
for (ui32 c = 0; c < dataSize; ++c) {
35-
if (c + nGrammSize <= dataSize) {
36-
pred(data + c);
37-
} else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
38-
TBuffer fakeStart;
39-
fakeStart.Append(data + c, dataSize - c);
40-
fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size());
41-
pred(fakeStart.data());
103+
ui32 c = 0;
104+
for (; c + nGrammSize <= dataSize; ++c) {
105+
pred(CalcHash(data + c, nGrammSize));
106+
}
107+
108+
if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
109+
for (; c < dataSize; ++c) {
110+
fakeString.Clear();
111+
fakeString.Append(data + c, dataSize - c);
112+
fakeString.Fill('\0', nGrammSize - fakeString.size());
113+
BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred);
42114
}
43115
}
44116
}
45117

46118
public:
47-
TNGrammBuilder()
48-
: HashCalcer(0) {
119+
TNGrammBuilder(const ui32 hashesCount)
120+
: HashesCount(hashesCount)
121+
{
122+
AFL_VERIFY((ui64)HashesCount < HashesConstructorP);
49123
Zeros.Fill('\0', 1024);
50124
}
51125

@@ -64,15 +138,7 @@ class TNGrammBuilder {
64138
}
65139
if constexpr (arrow::has_string_view<T>()) {
66140
auto value = typedArray.GetView(row);
67-
if (value.size() < nGrammSize) {
68-
continue;
69-
}
70-
const auto pred = [&](const char* data) {
71-
HashCalcer.Start();
72-
HashCalcer.Update((const ui8*)data, nGrammSize);
73-
fillData(HashCalcer.Finish());
74-
};
75-
BuildNGramms(value.data(), value.size(), {}, nGrammSize, pred);
141+
BuildNGramms(value.data(), value.size(), {}, nGrammSize, fillData);
76142
} else {
77143
AFL_VERIFY(false);
78144
}
@@ -83,33 +149,24 @@ class TNGrammBuilder {
83149

84150
template <class TFiller>
85151
void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) {
86-
const auto pred = [&](const char* value) {
87-
HashCalcer.Start();
88-
HashCalcer.Update((const ui8*)value, nGrammSize);
89-
fillData(HashCalcer.Finish());
90-
};
91-
BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, pred);
152+
BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, fillData);
92153
}
93154
};
94155

95156
TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const {
96157
AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount());
97-
TNGrammBuilder builder;
98-
99-
TFixStringBitsStorage bits(FilterSizeBytes * 8);
158+
TNGrammBuilder builder(HashesCount);
100159

101-
const auto pred = [&](const ui64 hash) {
102-
const auto predSet = [&](const ui64 hashSecondary) {
103-
bits.Set(true, hashSecondary % bits.GetSizeBits());
104-
};
105-
BuildHashesSet(hash, predSet);
160+
std::vector<bool> bitsVector(FilterSizeBytes * 8, false);
161+
bool* memAccessor = &bitsVector[0];
162+
const auto predSet = [&](const ui64 hashSecondary) {
163+
memAccessor[hashSecondary % (FilterSizeBytes * 8)] = true;
106164
};
107165
for (reader.Start(); reader.IsCorrect();) {
108-
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), pred);
166+
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet);
109167
reader.ReadNext(reader.begin()->GetCurrentChunk()->length());
110168
}
111-
112-
return bits.GetData();
169+
return TFixStringBitsStorage(bitsVector).GetData();
113170
}
114171

115172
void TIndexMeta::DoFillIndexCheckers(
@@ -133,16 +190,13 @@ void TIndexMeta::DoFillIndexCheckers(
133190
}
134191

135192
std::set<ui64> hashes;
136-
const auto pred = [&](const ui64 hash) {
137-
const auto predSet = [&](const ui64 hashSecondary) {
138-
hashes.emplace(hashSecondary);
139-
};
140-
BuildHashesSet(hash, predSet);
193+
const auto predSet = [&](const ui64 hashSecondary) {
194+
hashes.emplace(hashSecondary);
141195
};
142-
TNGrammBuilder builder;
196+
TNGrammBuilder builder(HashesCount);
143197
for (auto&& c : foundColumns) {
144198
for (auto&& ls : c.second.GetLikeSequences()) {
145-
builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), pred);
199+
builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), predSet);
146200
}
147201
}
148202
branch->MutableIndexes().emplace_back(std::make_shared<TFilterChecker>(GetIndexId(), std::move(hashes)));

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,6 @@ class TIndexMeta: public TIndexByColumns {
2323
AFL_VERIFY(NGrammSize > 2);
2424
}
2525

26-
static const ui64 HashesConstructorP = ((ui64)2 << 31) - 1;
27-
static const ui64 HashesConstructorA = (ui64)2 << 16;
28-
29-
template <class TActor>
30-
void BuildHashesSet(const ui64 originalHash, const TActor& actor) const {
31-
AFL_VERIFY(HashesCount < HashesConstructorP);
32-
for (ui32 b = 1; b <= HashesCount; ++b) {
33-
const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
34-
actor(hash);
35-
}
36-
}
37-
38-
template <class TContainer, class TActor>
39-
void BuildHashesSet(const TContainer& originalHashes, const TActor& actor) const {
40-
AFL_VERIFY(HashesCount < HashesConstructorP);
41-
for (auto&& hOriginal : originalHashes) {
42-
BuildHashesSet(hOriginal, actor);
43-
}
44-
}
45-
4626
protected:
4727
virtual TConclusionStatus DoCheckModificationCompatibility(const IIndexMeta& /*newMeta*/) const override {
4828
return TConclusionStatus::Fail("not supported");

0 commit comments

Comments
 (0)