@@ -15,37 +15,111 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm {
1515
1616class TNGrammBuilder {
1717private:
18- NArrow::NHash::NXX64::TStreamStringHashCalcer HashCalcer;
1918 TBuffer Zeros;
19+ const ui32 HashesCount;
20+
21+ static const ui64 HashesConstructorP = 9223372036854775783 ;
22+ static const ui64 HashesConstructorA = 1 ;
23+
24+ template <int HashIdx>
25+ class THashesBuilder {
26+ public:
27+ template <class TActor >
28+ static void Build (const ui64 originalHash, const TActor& actor) {
29+ actor ((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP);
30+ }
31+ };
32+
33+ template <>
34+ class THashesBuilder <0 > {
35+ public:
36+ template <class TActor >
37+ static void Build (const ui64 /* originalHash*/ , const TActor& /* actor*/ ) {
38+ }
39+ };
40+
41+ template <class TActor >
42+ void BuildHashesSet (const ui64 originalHash, const TActor& actor) const {
43+ if (HashesCount == 1 ) {
44+ THashesBuilder<1 >::Build (originalHash, actor);
45+ } else if (HashesCount == 2 ) {
46+ THashesBuilder<2 >::Build (originalHash, actor);
47+ } else if (HashesCount == 3 ) {
48+ THashesBuilder<3 >::Build (originalHash, actor);
49+ } else if (HashesCount == 4 ) {
50+ THashesBuilder<4 >::Build (originalHash, actor);
51+ } else if (HashesCount == 5 ) {
52+ THashesBuilder<5 >::Build (originalHash, actor);
53+ } else if (HashesCount == 6 ) {
54+ THashesBuilder<6 >::Build (originalHash, actor);
55+ } else if (HashesCount == 7 ) {
56+ THashesBuilder<7 >::Build (originalHash, actor);
57+ } else if (HashesCount == 8 ) {
58+ THashesBuilder<8 >::Build (originalHash, actor);
59+ } else {
60+ for (ui32 b = 1 ; b <= HashesCount; ++b) {
61+ const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP;
62+ actor (hash);
63+ }
64+ }
65+ }
66+
67+ ui64 CalcHash (const char * data, const ui32 size) const {
68+ if (size == 3 ) {
69+ return (*(const ui32*)data) & 0x00FFFFFF ;
70+ // TStringBuilder sb;
71+ // sb << res << "/" << (ui32)((ui8*)&res)[0] << "/" << (ui32)((ui8*)&res)[1] << "/" << (ui32)((ui8*)&res)[2] << "/"
72+ // << (ui32)((ui8*)&res)[3] << " vs " << (ui64)data[0] << "/" << (((ui64)data[1])) << "/" << (((ui64)data[2])) << Endl;
73+ // Cerr << sb;
74+ // return (ui64(*(const ui32*)data)) >> 8;
75+ } else if (size == 4 ) {
76+ return *(const ui32*)data;
77+ } else {
78+ uint64_t h = 2166136261 ;
79+ for (size_t i = 0 ; i < size; i++) {
80+ h = h ^ uint64_t (data[i]);
81+ h = h * 16777619 ;
82+ }
83+ return h;
84+ }
85+ }
86+
2087 template <class TAction >
2188 void BuildNGramms (const char * data, const ui32 dataSize, const std::optional<NRequest::TLikePart::EOperation> op, const ui32 nGrammSize,
2289 const TAction& pred) const {
90+ TBuffer fakeString;
91+ AFL_VERIFY (nGrammSize >= 3 )(" value" , nGrammSize);
2392 if (!op || op == NRequest::TLikePart::EOperation::StartsWith) {
2493 for (ui32 c = 1 ; c <= nGrammSize; ++c) {
25- TBuffer fakeStart ;
26- fakeStart .Fill (' \0 ' , nGrammSize - c);
27- fakeStart .Append (data, std::min (c, dataSize));
28- if (fakeStart .size () < nGrammSize) {
29- fakeStart. Append (Zeros. data () , nGrammSize - fakeStart .size ());
94+ fakeString. Clear () ;
95+ fakeString .Fill (' \0 ' , nGrammSize - c);
96+ fakeString .Append (data, std::min (c, dataSize));
97+ if (fakeString .size () < nGrammSize) {
98+ fakeString. Fill ( ' \0 ' , nGrammSize - fakeString .size ());
3099 }
31- pred (fakeStart .data ());
100+ BuildHashesSet ( CalcHash (fakeString .data (), nGrammSize), pred );
32101 }
33102 }
34- for (ui32 c = 0 ; c < dataSize; ++c) {
35- if (c + nGrammSize <= dataSize) {
36- pred (data + c);
37- } else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
38- TBuffer fakeStart;
39- fakeStart.Append (data + c, dataSize - c);
40- fakeStart.Append (Zeros.data (), nGrammSize - fakeStart.size ());
41- pred (fakeStart.data ());
103+ ui32 c = 0 ;
104+ for (; c + nGrammSize <= dataSize; ++c) {
105+ pred (CalcHash (data + c, nGrammSize));
106+ }
107+
108+ if (!op || op == NRequest::TLikePart::EOperation::EndsWith) {
109+ for (; c < dataSize; ++c) {
110+ fakeString.Clear ();
111+ fakeString.Append (data + c, dataSize - c);
112+ fakeString.Fill (' \0 ' , nGrammSize - fakeString.size ());
113+ BuildHashesSet (CalcHash (fakeString.data (), nGrammSize), pred);
42114 }
43115 }
44116 }
45117
46118public:
47- TNGrammBuilder ()
48- : HashCalcer(0 ) {
119+ TNGrammBuilder (const ui32 hashesCount)
120+ : HashesCount(hashesCount)
121+ {
122+ AFL_VERIFY ((ui64)HashesCount < HashesConstructorP);
49123 Zeros.Fill (' \0 ' , 1024 );
50124 }
51125
@@ -64,15 +138,7 @@ class TNGrammBuilder {
64138 }
65139 if constexpr (arrow::has_string_view<T>()) {
66140 auto value = typedArray.GetView (row);
67- if (value.size () < nGrammSize) {
68- continue ;
69- }
70- const auto pred = [&](const char * data) {
71- HashCalcer.Start ();
72- HashCalcer.Update ((const ui8*)data, nGrammSize);
73- fillData (HashCalcer.Finish ());
74- };
75- BuildNGramms (value.data (), value.size (), {}, nGrammSize, pred);
141+ BuildNGramms (value.data (), value.size (), {}, nGrammSize, fillData);
76142 } else {
77143 AFL_VERIFY (false );
78144 }
@@ -83,33 +149,24 @@ class TNGrammBuilder {
83149
84150 template <class TFiller >
85151 void FillNGrammHashes (const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) {
86- const auto pred = [&](const char * value) {
87- HashCalcer.Start ();
88- HashCalcer.Update ((const ui8*)value, nGrammSize);
89- fillData (HashCalcer.Finish ());
90- };
91- BuildNGramms (userReq.data (), userReq.size (), op, nGrammSize, pred);
152+ BuildNGramms (userReq.data (), userReq.size (), op, nGrammSize, fillData);
92153 }
93154};
94155
95156TString TIndexMeta::DoBuildIndexImpl (TChunkedBatchReader& reader) const {
96157 AFL_VERIFY (reader.GetColumnsCount () == 1 )(" count" , reader.GetColumnsCount ());
97- TNGrammBuilder builder;
98-
99- TFixStringBitsStorage bits (FilterSizeBytes * 8 );
158+ TNGrammBuilder builder (HashesCount);
100159
101- const auto pred = [&](const ui64 hash) {
102- const auto predSet = [&](const ui64 hashSecondary) {
103- bits.Set (true , hashSecondary % bits.GetSizeBits ());
104- };
105- BuildHashesSet (hash, predSet);
160+ std::vector<bool > bitsVector (FilterSizeBytes * 8 , false );
161+ bool * memAccessor = &bitsVector[0 ];
162+ const auto predSet = [&](const ui64 hashSecondary) {
163+ memAccessor[hashSecondary % (FilterSizeBytes * 8 )] = true ;
106164 };
107165 for (reader.Start (); reader.IsCorrect ();) {
108- builder.FillNGrammHashes (NGrammSize, reader.begin ()->GetCurrentChunk (), pred );
166+ builder.FillNGrammHashes (NGrammSize, reader.begin ()->GetCurrentChunk (), predSet );
109167 reader.ReadNext (reader.begin ()->GetCurrentChunk ()->length ());
110168 }
111-
112- return bits.GetData ();
169+ return TFixStringBitsStorage (bitsVector).GetData ();
113170}
114171
115172void TIndexMeta::DoFillIndexCheckers (
@@ -133,16 +190,13 @@ void TIndexMeta::DoFillIndexCheckers(
133190 }
134191
135192 std::set<ui64> hashes;
136- const auto pred = [&](const ui64 hash) {
137- const auto predSet = [&](const ui64 hashSecondary) {
138- hashes.emplace (hashSecondary);
139- };
140- BuildHashesSet (hash, predSet);
193+ const auto predSet = [&](const ui64 hashSecondary) {
194+ hashes.emplace (hashSecondary);
141195 };
142- TNGrammBuilder builder;
196+ TNGrammBuilder builder (HashesCount) ;
143197 for (auto && c : foundColumns) {
144198 for (auto && ls : c.second .GetLikeSequences ()) {
145- builder.FillNGrammHashes (NGrammSize, ls.second .GetOperation (), ls.second .GetValue (), pred );
199+ builder.FillNGrammHashes (NGrammSize, ls.second .GetOperation (), ls.second .GetValue (), predSet );
146200 }
147201 }
148202 branch->MutableIndexes ().emplace_back (std::make_shared<TFilterChecker>(GetIndexId (), std::move (hashes)));
0 commit comments