@@ -15,8 +15,70 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
1515
1616class TNGrammBuilder {
1717private:
18- NArrow::NHash::NXX64::TStreamStringHashCalcer HashCalcer;
1918 TBuffer Zeros;
19+ const ui32 HashesCount;
20+
21+ static const ui64 HashesConstructorP = 9223372036854775783 ;
22+ static const ui64 HashesConstructorA = 1 ;
23+
24+ template <int HashIdx>
25+ class THashesBuilder {
26+ public:
27+ template <class TActor >
28+ static void Build (const ui64 originalHash, const TActor& actor) {
29+ actor ((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP);
30+ }
31+ };
32+
33+ template <>
34+ class THashesBuilder <0 > {
35+ public:
36+ template <class TActor >
37+ static void Build (const ui64 /* originalHash*/ , const TActor& /* actor*/ ) {
38+ }
39+ };
40+
41+ template <class TActor >
42+ void BuildHashesSet (const ui64 originalHash, const TActor& actor) const {
43+ if (HashesCount == 1 ) {
44+ THashesBuilder<1 >::Build (originalHash, actor);
45+ } else if (HashesCount == 2 ) {
46+ THashesBuilder<2 >::Build (originalHash, actor);
47+ } else if (HashesCount == 3 ) {
48+ THashesBuilder<3 >::Build (originalHash, actor);
49+ } else if (HashesCount == 4 ) {
50+ THashesBuilder<4 >::Build (originalHash, actor);
51+ } else if (HashesCount == 5 ) {
52+ THashesBuilder<5 >::Build (originalHash, actor);
53+ } else if (HashesCount == 6 ) {
54+ THashesBuilder<6 >::Build (originalHash, actor);
55+ } else if (HashesCount == 7 ) {
56+ THashesBuilder<7 >::Build (originalHash, actor);
57+ } else if (HashesCount == 8 ) {
58+ THashesBuilder<8 >::Build (originalHash, actor);
59+ } else {
60+ for (ui32 b = 1 ; b <= HashesCount; ++b) {
61+ const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
62+ actor (hash);
63+ }
64+ }
65+ }
66+
67+ ui64 CalcHash (const char * data, const ui32 size) const {
68+ if (size == 3 ) {
69+ return ((ui64)data[0 ]) | (((ui64)data[1 ]) << 8 ) | (((ui64)data[2 ]) << 16 );
70+ } else if (size == 4 ) {
71+ return *(ui32*)&data[0 ];
72+ } else {
73+ uint64_t h = 2166136261 ;
74+ for (size_t i = 0 ; i < size; i++) {
75+ h = h ^ uint64_t (data[i]);
76+ h = h * 16777619 ;
77+ }
78+ return h;
79+ }
80+ }
81+
2082 template <class TAction >
2183 void BuildNGramms (const char * data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize,
2284 const TAction& pred) const {
@@ -28,24 +90,26 @@ class TNGrammBuilder {
2890 if (fakeStart.size () < nGrammSize) {
2991 fakeStart.Append (Zeros.data (), nGrammSize - fakeStart.size ());
3092 }
31- pred ( fakeStart.data ());
93+ BuildHashesSet ( CalcHash ( fakeStart.data (), nGrammSize), pred );
3294 }
3395 }
3496 for (ui32 c = 0 ; c < dataSize; ++c) {
3597 if (c + nGrammSize <= dataSize) {
36- pred (data + c);
98+ pred (CalcHash ( data + c, nGrammSize) );
3799 } else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
38100 TBuffer fakeStart;
39101 fakeStart.Append (data + c, dataSize - c);
40102 fakeStart.Append (Zeros.data (), nGrammSize - fakeStart.size ());
41- pred ( fakeStart.data ());
103+ BuildHashesSet ( CalcHash ( fakeStart.data (), nGrammSize), pred );
42104 }
43105 }
44106 }
45107
46108public:
47- TNGrammBuilder ()
48- : HashCalcer(0 ) {
109+ TNGrammBuilder (const ui32 hashesCount)
110+ : HashesCount(hashesCount)
111+ {
112+ AFL_VERIFY ((ui64)HashesCount < HashesConstructorP);
49113 Zeros.Fill (' \0 ' , 1024 );
50114 }
51115
@@ -64,15 +128,7 @@ class TNGrammBuilder {
64128 }
65129 if constexpr (arrow::has_string_view<T>()) {
66130 auto value = typedArray.GetView (row);
67- if (value.size () < nGrammSize) {
68- continue ;
69- }
70- const auto pred = [&](const char * data) {
71- HashCalcer.Start ();
72- HashCalcer.Update ((const ui8*)data, nGrammSize);
73- fillData (HashCalcer.Finish ());
74- };
75- BuildNGramms (value.data (), value.size (), {}, nGrammSize, pred);
131+ BuildNGramms (value.data (), value.size (), {}, nGrammSize, fillData);
76132 } else {
77133 AFL_VERIFY (false );
78134 }
@@ -83,33 +139,23 @@ class TNGrammBuilder {
83139
84140 template <class TFiller >
85141 void FillNGrammHashes (const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) {
86- const auto pred = [&](const char * value) {
87- HashCalcer.Start ();
88- HashCalcer.Update ((const ui8*)value, nGrammSize);
89- fillData (HashCalcer.Finish ());
90- };
91- BuildNGramms (userReq.data (), userReq.size (), op, nGrammSize, pred);
142+ BuildNGramms (userReq.data (), userReq.size (), op, nGrammSize, fillData);
92143 }
93144};
94145
95146TString TIndexMeta::DoBuildIndexImpl (TChunkedBatchReader& reader) const {
96147 AFL_VERIFY (reader.GetColumnsCount () == 1 )(" count" , reader.GetColumnsCount ());
97- TNGrammBuilder builder;
148+ TNGrammBuilder builder (HashesCount) ;
98149
99- TFixStringBitsStorage bits (FilterSizeBytes * 8 );
100-
101- const auto pred = [&](const ui64 hash) {
102- const auto predSet = [&](const ui64 hashSecondary) {
103- bits.Set (true , hashSecondary % bits.GetSizeBits ());
104- };
105- BuildHashesSet (hash, predSet);
150+ std::vector<bool > bitsVector (FilterSizeBytes * 8 , false );
151+ const auto predSet = [&](const ui64 hashSecondary) {
152+ bitsVector[hashSecondary % (FilterSizeBytes * 8 )] = true ;
106153 };
107154 for (reader.Start (); reader.IsCorrect ();) {
108- builder.FillNGrammHashes (NGrammSize, reader.begin ()->GetCurrentChunk (), pred );
155+ builder.FillNGrammHashes (NGrammSize, reader.begin ()->GetCurrentChunk (), predSet );
109156 reader.ReadNext (reader.begin ()->GetCurrentChunk ()->length ());
110157 }
111-
112- return bits.GetData ();
158+ return TFixStringBitsStorage (bitsVector).GetData ();
113159}
114160
115161void TIndexMeta::DoFillIndexCheckers (
@@ -133,16 +179,13 @@ void TIndexMeta::DoFillIndexCheckers(
133179 }
134180
135181 std::set<ui64> hashes;
136- const auto pred = [&](const ui64 hash) {
137- const auto predSet = [&](const ui64 hashSecondary) {
138- hashes.emplace (hashSecondary);
139- };
140- BuildHashesSet (hash, predSet);
182+ const auto predSet = [&](const ui64 hashSecondary) {
183+ hashes.emplace (hashSecondary);
141184 };
142- TNGrammBuilder builder;
185+ TNGrammBuilder builder (HashesCount) ;
143186 for (auto && c : foundColumns) {
144187 for (auto && ls : c.second .GetLikeSequences ()) {
145- builder.FillNGrammHashes (NGrammSize, ls.second .GetOperation (), ls.second .GetValue (), pred );
188+ builder.FillNGrammHashes (NGrammSize, ls.second .GetOperation (), ls.second .GetValue (), predSet );
146189 }
147190 }
148191 branch->MutableIndexes ().emplace_back (std::make_shared<TFilterChecker>(GetIndexId (), std::move (hashes)));
0 commit comments