Skip to content

Commit a86f731

Browse files
fixes
1 parent 934720e commit a86f731

File tree

4 files changed

+61
-23
lines changed

4 files changed

+61
-23
lines changed

ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ TString TBloomIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui3
1414
const ui32 bitsCount = TFixStringBitsStorage::GrowBitsCountToByte(HashesCount * recordsCount / std::log(2));
1515
std::vector<bool> filterBits(bitsCount, false);
1616
for (ui32 i = 0; i < HashesCount; ++i) {
17-
NArrow::NHash::NXX64::TStreamStringHashCalcer hashCalcer(i);
17+
NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 hashCalcer(i);
1818
for (reader.Start(); reader.IsCorrect(); reader.ReadNext()) {
1919
hashCalcer.Start();
2020
for (auto&& i : reader) {
@@ -47,7 +47,7 @@ void TBloomIndexMeta::DoFillIndexCheckers(const std::shared_ptr<NRequest::TDataF
4747
}
4848
std::set<ui64> hashes;
4949
for (ui32 i = 0; i < HashesCount; ++i) {
50-
NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(i);
50+
NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 calcer(i);
5151
calcer.Start();
5252
for (auto&& i : foundColumns) {
5353
NArrow::NHash::TXX64::AppendField(i.second, calcer);

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class TNGrammBuilder {
4040
class THashesCountSelector {
4141
public:
4242
template <class TActor>
43-
static void BuildHashes(const ui8* data, const TActor& actor) {
43+
static void BuildHashes(const ui8* data, TActor& actor) {
4444
ui64 hash = (ui64)2166136261 * (ui64)HashIdx;
4545
actor(THashesBuilder<CharsCount>::Build(data, hash));
4646
THashesCountSelector<HashIdx - 1, CharsCount>::BuildHashes(data, actor);
@@ -51,7 +51,7 @@ class TNGrammBuilder {
5151
class THashesCountSelector<0, CharsCount> {
5252
public:
5353
template <class TActor>
54-
static void BuildHashes(const ui8* /*data*/, const TActor& /*actor*/) {
54+
static void BuildHashes(const ui8* /*data*/, TActor& /*actor*/) {
5555
}
5656
};
5757

@@ -60,7 +60,7 @@ class TNGrammBuilder {
6060
private:
6161
template <class TActor>
6262
static void BuildHashesImpl(
63-
const ui8* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const TActor& actor) {
63+
const ui8* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, TActor& actor) {
6464
TBuffer fakeString;
6565
if (!op || op == NRequest::TLikePart::EOperation::StartsWith) {
6666
for (ui32 c = 1; c <= CharsCount; ++c) {
@@ -90,7 +90,7 @@ class TNGrammBuilder {
9090
public:
9191
template <class TActor>
9292
static void BuildHashes(const ui8* data, const ui32 dataSize, const ui32 hashesCount, const ui32 nGrammSize,
93-
const std::optional<NRequest::TLikePart::EOperation> op, const TActor& actor) {
93+
const std::optional<NRequest::TLikePart::EOperation> op, TActor& actor) {
9494
if (HashesCount == hashesCount && CharsCount == nGrammSize) {
9595
BuildHashesImpl(data, dataSize, op, actor);
9696
} else if (HashesCount > hashesCount && CharsCount > nGrammSize) {
@@ -105,13 +105,12 @@ class TNGrammBuilder {
105105
}
106106
};
107107

108-
109108
template <ui32 CharsCount>
110109
class THashesSelector<0, CharsCount> {
111110
public:
112111
template <class TActor>
113112
static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/,
114-
const std::optional<NRequest::TLikePart::EOperation> /*op*/, const TActor& /*actor*/) {
113+
const std::optional<NRequest::TLikePart::EOperation> /*op*/, TActor& /*actor*/) {
115114
AFL_VERIFY(false);
116115
}
117116
};
@@ -121,7 +120,7 @@ class TNGrammBuilder {
121120
public:
122121
template <class TActor>
123122
static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/,
124-
const std::optional<NRequest::TLikePart::EOperation> /*op*/, const TActor& /*actor*/) {
123+
const std::optional<NRequest::TLikePart::EOperation> /*op*/, TActor& /*actor*/) {
125124
AFL_VERIFY(false);
126125
}
127126
};
@@ -131,14 +130,14 @@ class TNGrammBuilder {
131130
public:
132131
template <class TActor>
133132
static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/,
134-
const std::optional<NRequest::TLikePart::EOperation> /*op*/, const TActor& /*actor*/) {
133+
const std::optional<NRequest::TLikePart::EOperation> /*op*/, TActor& /*actor*/) {
135134
AFL_VERIFY(false);
136135
}
137136
};
138137

139138
template <class TAction>
140-
void BuildNGramms(const char* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize,
141-
const TAction& pred) {
139+
void BuildNGramms(
140+
const char* data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize, TAction& pred) {
142141
THashesSelector<TConstants::MaxHashesCount, TConstants::MaxNGrammSize>::BuildHashes(
143142
(const ui8*)data, dataSize, HashesCount, nGrammSize, op, pred);
144143
}
@@ -149,7 +148,7 @@ class TNGrammBuilder {
149148
}
150149

151150
template <class TFiller>
152-
void FillNGrammHashes(const ui32 nGrammSize, const std::shared_ptr<arrow::Array>& array, const TFiller& fillData) {
151+
void FillNGrammHashes(const ui32 nGrammSize, const std::shared_ptr<arrow::Array>& array, TFiller& fillData) {
153152
AFL_VERIFY(array->type_id() == arrow::utf8()->id())("id", array->type()->ToString());
154153
NArrow::SwitchType(array->type_id(), [&](const auto& type) {
155154
using TWrap = std::decay_t<decltype(type)>;
@@ -173,22 +172,35 @@ class TNGrammBuilder {
173172
}
174173

175174
template <class TFiller>
176-
void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) {
175+
void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, TFiller& fillData) {
177176
BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, fillData);
178177
}
179178
};
180179

180+
// Callable sink that records hashes in a bloom-filter bit vector: each call
// sets the bit at index `hash % size-of-vector`.
//
// std::vector<bool> is a bit-packed specialisation whose operator[] returns a
// proxy object rather than bool&, so its elements cannot be reached through a
// plain bool*. The vector is therefore held by reference and written through
// operator[].
//
// The referenced vector must outlive the inserter and must not be resized
// while the inserter is in use: its size is captured once at construction.
class TVectorInserter {
private:
    std::vector<bool>& Values;
    const ui32 Size;

public:
    // Binds the inserter to `values`; no bits are modified here.
    TVectorInserter(std::vector<bool>& values)
        : Values(values)
        , Size(values.size()) {
    }

    // Marks the bit selected by `hash` modulo the filter size.
    // An empty filter has no bit to set, so the call is a no-op instead of a
    // division by zero.
    void operator()(const ui64 hash) {
        if (Size == 0) {
            return;
        }
        Values[hash % Size] = true;
    }
};
195+
181196
TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const {
182197
AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount());
183198
TNGrammBuilder builder(HashesCount);
184199

185200
std::vector<bool> bitsVector(FilterSizeBytes * 8, false);
186-
bool* memAccessor = &bitsVector[0];
187-
const auto predSet = [&](const ui64 hashSecondary) {
188-
memAccessor[hashSecondary % (FilterSizeBytes * 8)] = true;
189-
};
201+
TVectorInserter inserter(bitsVector);
190202
for (reader.Start(); reader.IsCorrect();) {
191-
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet);
203+
builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), inserter);
192204
reader.ReadNext(reader.begin()->GetCurrentChunk()->length());
193205
}
194206
return TFixStringBitsStorage(bitsVector).GetData();

ydb/library/formats/arrow/hash/xx_hash.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,28 @@
22

33
namespace NKikimr::NArrow::NHash::NXX64 {
44

5-
void TStreamStringHashCalcer::Start() {
5+
void TStreamStringHashCalcer_H3::Start() {
66
XXH3_64bits_reset_withSeed(&HashState, Seed);
77
}
88

9-
void TStreamStringHashCalcer::Update(const ui8* data, const ui32 size) {
9+
void TStreamStringHashCalcer_H3::Update(const ui8* data, const ui32 size) {
1010
XXH3_64bits_update(&HashState, data, size);
1111
}
1212

13-
ui64 TStreamStringHashCalcer::Finish() {
13+
ui64 TStreamStringHashCalcer_H3::Finish() {
1414
return XXH3_64bits_digest(&HashState);
1515
}
1616

17+
// Begins a new streaming XXH64 computation: resets HashState via XXH64_reset,
// seeding it with the Seed value stored in this object.
// NOTE(review): the value returned by XXH64_reset is not inspected here.
void TStreamStringHashCalcer::Start() {
18+
XXH64_reset(&HashState, Seed);
19+
}
20+
21+
// Feeds `size` bytes starting at `data` into the running XXH64 state
// (HashState) via XXH64_update.
void TStreamStringHashCalcer::Update(const ui8* data, const ui32 size) {
22+
XXH64_update(&HashState, data, size);
23+
}
24+
25+
// Returns the 64-bit value produced by XXH64_digest for the current HashState.
ui64 TStreamStringHashCalcer::Finish() {
26+
return XXH64_digest(&HashState);
27+
}
28+
1729
}

ydb/library/formats/arrow/hash/xx_hash.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace NKikimr::NArrow::NHash::NXX64 {
1010
class TStreamStringHashCalcer {
1111
private:
1212
const ui64 Seed;
13-
XXH3_state_t HashState;
13+
XXH64_state_t HashState;
1414
public:
1515
TStreamStringHashCalcer(const ui64 seed)
1616
: Seed(seed) {
@@ -21,4 +21,18 @@ class TStreamStringHashCalcer {
2121
ui64 Finish();
2222
};
2323

24+
// Streaming string hasher that keeps an XXH3 state (XXH3_state_t) together
// with a fixed seed. Exposes the same Start / Update / Finish member set as
// TStreamStringHashCalcer.
class TStreamStringHashCalcer_H3 {
25+
private:
26+
// Seed supplied at construction; immutable afterwards.
const ui64 Seed;
27+
// Hash state; the constructor below does not initialise it.
// NOTE(review): presumably Start() must be called before Update()/Finish() — confirm.
XXH3_state_t HashState;
28+
public:
29+
// Stores `seed`; HashState is left untouched here.
TStreamStringHashCalcer_H3(const ui64 seed)
30+
: Seed(seed) {
31+
}
32+
33+
// Defined out of line (xx_hash.cpp).
void Start();
34+
void Update(const ui8* data, const ui32 size);
35+
ui64 Finish();
36+
};
37+
2438
}

0 commit comments

Comments
 (0)