Skip to content

Commit 3786e14

Browse files
corrections
1 parent df91876 commit 3786e14

File tree

15 files changed

+78
-82
lines changed

15 files changed

+78
-82
lines changed

ydb/core/tx/columnshard/engines/scheme/indexes/abstract/common.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <ydb/library/actors/core/log.h>
44

5+
#include <contrib/libs/xxhash/xxhash.h>
56
#include <util/string/builder.h>
67

78
namespace NKikimr::NOlap::NIndexes::NRequest {
@@ -30,4 +31,8 @@ TString TOriginalDataAddress::DebugString() const {
3031
}
3132
}
3233

34+
ui64 TOriginalDataAddress::CalcSubColumnHash(const std::string_view sv) {
35+
return XXH3_64bits(sv.data(), sv.size());
36+
}
37+
3338
} // namespace NKikimr::NOlap::NIndexes::NRequest

ydb/core/tx/columnshard/engines/scheme/indexes/abstract/common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ class TOriginalDataAddress {
2323
YDB_READONLY_DEF(TString, SubColumnName);
2424

2525
public:
26+
static ui64 CalcSubColumnHash(const std::string_view sv);
27+
28+
static ui64 CalcSubColumnHash(const TString& path) {
29+
return CalcSubColumnHash(std::string_view(path.data(), path.size()));
30+
}
31+
2632
explicit TOriginalDataAddress(const ui32 columnId, const TString& subColumnName = "")
2733
: ColumnId(columnId)
2834
, SubColumnName(subColumnName) {

ydb/core/tx/columnshard/engines/scheme/indexes/abstract/tree.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ bool TOperationNode::DoCollapse() {
213213
bool TKernelNode::DoCollapse() {
214214
if (KernelName == "JsonValue" && Children.size() == 2 && Children[1]->Is<TConstantNode>() && Children[0]->Is<TOriginalColumn>()) {
215215
auto scalar = Children[1]->As<TConstantNode>()->GetConstant();
216-
AFL_VERIFY(scalar->type->id() == arrow::binary()->id());
216+
AFL_VERIFY(scalar->type->id() == arrow::binary()->id() || scalar->type->id() == arrow::utf8()->id())("type", scalar->type->ToString());
217217
auto scalarString = static_pointer_cast<arrow::BinaryScalar>(scalar);
218218
const TString jsonPath((const char*)scalarString->value->data(), scalarString->value->size());
219219
Parent->Exchange(GetNodeId(), std::make_shared<TOriginalColumn>(Children[0]->As<TOriginalColumn>()->GetNodeId().GetColumnId(), jsonPath));

ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -54,30 +54,26 @@ TString TBloomIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui3
5454
void TBloomIndexMeta::DoFillIndexCheckers(
5555
const std::shared_ptr<NRequest::TDataForIndexesCheckers>& info, const NSchemeShard::TOlapSchema& /*schema*/) const {
5656
for (auto&& branch : info->GetBranches()) {
57-
THashMap<NRequest::TOriginalDataAddress, std::shared_ptr<arrow::Scalar>> foundColumns;
58-
auto addresses = GetDataExtractor()->GetOriginalDataAddresses(ColumnIds);
59-
for (auto&& cId : addresses) {
60-
auto itEqual = branch->GetEquals().find(cId);
61-
if (itEqual == branch->GetEquals().end()) {
62-
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD)("warn", "column not found for equal")("id", cId.DebugString());
63-
break;
57+
for (auto&& i : branch->GetEquals()) {
58+
if (i.first.GetColumnId() != GetColumnId()) {
59+
continue;
6460
}
65-
foundColumns.emplace(cId, itEqual->second);
66-
}
67-
if (foundColumns.size() != ColumnIds.size()) {
68-
continue;
69-
}
70-
std::set<ui64> hashes;
71-
for (ui32 i = 0; i < HashesCount; ++i) {
72-
NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 calcer(i);
73-
calcer.Start();
74-
AFL_VERIFY(foundColumns.size() == 1)("reason", "hashmap not sorted");
75-
for (auto&& i : foundColumns) {
61+
ui64 hashBase = 0;
62+
if (!GetDataExtractor()->CheckForIndex(i.first, hashBase)) {
63+
continue;
64+
}
65+
std::set<ui64> hashes;
66+
for (ui32 hashSeed = 0; hashSeed < HashesCount; ++hashSeed) {
67+
NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 calcer(hashSeed);
68+
calcer.Start();
69+
if (hashBase) {
70+
calcer.Update((const ui8*)&hashBase, sizeof(hashBase));
71+
}
7672
NArrow::NHash::TXX64::AppendField(i.second, calcer);
73+
hashes.emplace(calcer.Finish());
7774
}
78-
hashes.emplace(calcer.Finish());
75+
branch->MutableIndexes().emplace_back(std::make_shared<TBloomFilterChecker>(GetIndexId(), std::move(hashes)));
7976
}
80-
branch->MutableIndexes().emplace_back(std::make_shared<TBloomFilterChecker>(GetIndexId(), std::move(hashes)));
8177
}
8278
}
8379

ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class TBloomIndexMeta: public TIndexByColumns {
4040
auto& bFilter = proto.GetBloomFilter();
4141
FalsePositiveProbability = bFilter.GetFalsePositiveProbability();
4242
for (auto&& i : bFilter.GetColumnIds()) {
43-
ColumnIds.emplace(i);
43+
AddColumnId(i);
4444
}
4545
if (!MutableDataExtractor().DeserializeFromProto(bFilter.GetDataExtractor())) {
4646
return false;
@@ -51,7 +51,7 @@ class TBloomIndexMeta: public TIndexByColumns {
5151
virtual void DoSerializeToProto(NKikimrSchemeOp::TOlapIndexDescription& proto) const override {
5252
auto* filterProto = proto.MutableBloomFilter();
5353
filterProto->SetFalsePositiveProbability(FalsePositiveProbability);
54-
for (auto&& i : ColumnIds) {
54+
for (auto&& i : GetColumnIds()) {
5555
filterProto->AddColumnIds(i);
5656
}
5757
*filterProto->MutableDataExtractor() = GetDataExtractor().SerializeToProto();

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -258,32 +258,24 @@ TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 rec
258258
void TIndexMeta::DoFillIndexCheckers(
259259
const std::shared_ptr<NRequest::TDataForIndexesCheckers>& info, const NSchemeShard::TOlapSchema& /*schema*/) const {
260260
for (auto&& branch : info->GetBranches()) {
261-
THashMap<NRequest::TOriginalDataAddress, NRequest::TLikeDescription> foundColumns;
262-
auto addresses = GetDataExtractor()->GetOriginalDataAddresses(ColumnIds);
263-
for (auto&& cId : addresses) {
264-
auto it = branch->GetLikes().find(cId);
265-
if (it == branch->GetLikes().end()) {
266-
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD)("warn", "not found like for column")("id", cId.DebugString());
267-
break;
261+
for (auto&& i : branch->GetLikes()) {
262+
if (i.first.GetColumnId() != GetColumnId()) {
263+
continue;
268264
}
269-
foundColumns.emplace(cId, it->second);
270-
}
271-
if (foundColumns.size() != ColumnIds.size()) {
272-
continue;
273-
}
274-
275-
std::set<ui64> hashes;
276-
const auto predSet = [&](const ui64 hashSecondary) {
277-
hashes.emplace(hashSecondary);
278-
};
279-
TNGrammBuilder builder(HashesCount);
280-
AFL_VERIFY(foundColumns.size() == 1);
281-
for (auto&& [_, c] : foundColumns) {
282-
for (auto&& ls : c.GetLikeSequences()) {
265+
ui64 hashBase;
266+
if (!GetDataExtractor()->CheckForIndex(i.first, hashBase)) {
267+
continue;
268+
}
269+
std::set<ui64> hashes;
270+
const auto predSet = [&](const ui64 hashSecondary) {
271+
hashes.emplace(hashSecondary);
272+
};
273+
TNGrammBuilder builder(HashesCount);
274+
for (auto&& ls : i.second.GetLikeSequences()) {
283275
builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), predSet);
284276
}
277+
branch->MutableIndexes().emplace_back(std::make_shared<TFilterChecker>(GetIndexId(), std::move(hashes)));
285278
}
286-
branch->MutableIndexes().emplace_back(std::make_shared<TFilterChecker>(GetIndexId(), std::move(hashes)));
287279
}
288280
}
289281

ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class TIndexMeta: public TIndexByColumns {
7272
if (!bFilter.HasColumnId() || !bFilter.GetColumnId()) {
7373
return false;
7474
}
75-
ColumnIds.emplace(bFilter.GetColumnId());
75+
AddColumnId(bFilter.GetColumnId());
7676
Initialize();
7777
return true;
7878
}
@@ -82,12 +82,11 @@ class TIndexMeta: public TIndexByColumns {
8282
AFL_VERIFY(TConstants::CheckFilterSizeBytes(FilterSizeBytes));
8383
AFL_VERIFY(TConstants::CheckHashesCount(HashesCount));
8484
AFL_VERIFY(TConstants::CheckRecordsCount(RecordsCount));
85-
AFL_VERIFY(ColumnIds.size() == 1);
8685
filterProto->SetRecordsCount(RecordsCount);
8786
filterProto->SetNGrammSize(NGrammSize);
8887
filterProto->SetFilterSizeBytes(FilterSizeBytes);
8988
filterProto->SetHashesCount(HashesCount);
90-
filterProto->SetColumnId(*ColumnIds.begin());
89+
filterProto->SetColumnId(GetColumnId());
9190
*filterProto->MutableDataExtractor() = GetDataExtractor().SerializeToProto();
9291
}
9392

ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@ class TIndexMeta: public TIndexByColumns {
3535
AFL_VERIFY(proto.HasCountMinSketch());
3636
auto& sketch = proto.GetCountMinSketch();
3737
for (auto&& i : sketch.GetColumnIds()) {
38-
ColumnIds.emplace(i);
38+
AddColumnId(i);
3939
}
4040
return true;
4141
}
4242

4343
virtual void DoSerializeToProto(NKikimrSchemeOp::TOlapIndexDescription& proto) const override {
4444
auto* sketchProto = proto.MutableCountMinSketch();
45-
for (auto&& i : ColumnIds) {
45+
for (auto&& i : GetColumnIds()) {
4646
sketchProto->AddColumnIds(i);
4747
}
4848
}
@@ -56,10 +56,6 @@ class TIndexMeta: public TIndexByColumns {
5656
virtual TString GetClassName() const override {
5757
return GetClassNameStatic();
5858
}
59-
60-
const std::set<ui32>& GetColumnIds() const {
61-
return ColumnIds;
62-
}
6359
};
6460

6561
} // namespace NKikimr::NOlap::NIndexes::NCountMinSketch

ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ std::shared_ptr<arrow::Scalar> TIndexMeta::GetMaxScalarVerified(
4444
}
4545

4646
NJson::TJsonValue TIndexMeta::DoSerializeDataToJson(const TString& data, const TIndexInfo& indexInfo) const {
47-
AFL_VERIFY(ColumnIds.size() == 1);
48-
auto scalar = GetMaxScalarVerified({ data }, indexInfo.GetColumnFeaturesVerified(*ColumnIds.begin()).GetArrowField()->type());
47+
auto scalar = GetMaxScalarVerified({ data }, indexInfo.GetColumnFeaturesVerified(GetColumnId()).GetArrowField()->type());
4948
return scalar->ToString();
5049
}
5150

ydb/core/tx/columnshard/engines/storage/indexes/max/meta.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,15 @@ class TIndexMeta: public TIndexByColumns {
3535
AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("problem", "incorrect column id");
3636
return false;
3737
};
38-
ColumnIds.emplace(bFilter.GetColumnId());
38+
AddColumnId(bFilter.GetColumnId());
3939
return true;
4040
}
4141

4242
virtual NJson::TJsonValue DoSerializeDataToJson(const TString& data, const TIndexInfo& indexInfo) const override;
4343

4444
virtual void DoSerializeToProto(NKikimrSchemeOp::TOlapIndexDescription& proto) const override {
45-
AFL_VERIFY(ColumnIds.size() == 1);
4645
auto* filterProto = proto.MutableMaxIndex();
47-
filterProto->SetColumnId(*ColumnIds.begin());
46+
filterProto->SetColumnId(GetColumnId());
4847
}
4948

5049
public:
@@ -53,11 +52,6 @@ class TIndexMeta: public TIndexByColumns {
5352
: TBase(indexId, indexName, columnId, storageId, std::make_shared<TDefaultDataExtractor>()) {
5453
}
5554

56-
ui32 GetColumnId() const {
57-
AFL_VERIFY(ColumnIds.size() == 1);
58-
return *ColumnIds.begin();
59-
}
60-
6155
static bool IsAvailableType(const NScheme::TTypeInfo type) {
6256
auto dataTypeResult = NArrow::GetArrowType(type);
6357
if (!dataTypeResult.ok()) {

0 commit comments

Comments
 (0)