Skip to content

Commit 9a05b04

Browse files
case insensitive index (#16642)
1 parent 107ab50 commit 9a05b04

File tree

32 files changed

+559
-243
lines changed

32 files changed

+559
-243
lines changed

ydb/core/formats/arrow/program/abstract.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,44 @@ class TAccessorsCollection;
1313

1414
namespace NKikimr::NArrow::NSSA {
1515

16+
class TIndexCheckOperation {
17+
public:
18+
enum class EOperation : ui32 {
19+
Equals,
20+
StartsWith,
21+
EndsWith,
22+
Contains
23+
};
24+
25+
private:
26+
const EOperation Operation;
27+
YDB_READONLY(bool, CaseSensitive, true);
28+
29+
public:
30+
TString GetSignalId() const {
31+
return TStringBuilder() << Operation << "::" << (CaseSensitive ? 1 : 0);
32+
}
33+
34+
TString DebugString() const {
35+
return TStringBuilder() << "{" << Operation << "," << CaseSensitive << "}";
36+
}
37+
38+
EOperation GetOperation() const {
39+
return Operation;
40+
}
41+
42+
TIndexCheckOperation(const EOperation op, const bool caseSensitive)
43+
: Operation(op)
44+
, CaseSensitive(caseSensitive) {
45+
}
46+
47+
explicit operator size_t() const {
48+
return (size_t)Operation;
49+
}
50+
51+
bool operator==(const TIndexCheckOperation& op) const = default;
52+
};
53+
1654
using IChunkedArray = NAccessor::IChunkedArray;
1755
using TAccessorsCollection = NAccessor::TAccessorsCollection;
1856

ydb/core/formats/arrow/program/assign_internal.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,19 +51,12 @@ NJson::TJsonValue TCalculationProcessor::DoDebugJson() const {
5151

5252
ui64 TCalculationProcessor::DoGetWeight() const {
5353
if (KernelLogic) {
54-
return 0;
54+
return (ui64)KernelLogic->GetWeight();
5555
}
5656
if (!YqlOperationId) {
57-
return 10;
58-
} else if ((NYql::TKernelRequestBuilder::EBinaryOp)*YqlOperationId == NYql::TKernelRequestBuilder::EBinaryOp::StartsWith ||
59-
(NYql::TKernelRequestBuilder::EBinaryOp)*YqlOperationId == NYql::TKernelRequestBuilder::EBinaryOp::EndsWith) {
60-
return 7;
61-
} else if ((NYql::TKernelRequestBuilder::EBinaryOp)*YqlOperationId == NYql::TKernelRequestBuilder::EBinaryOp::StringContains) {
62-
return 10;
63-
} else if ((NYql::TKernelRequestBuilder::EBinaryOp)*YqlOperationId == NYql::TKernelRequestBuilder::EBinaryOp::Equals) {
64-
return 5;
57+
return (ui64)ECalculationHardness::Unknown;
6558
}
66-
return 0;
59+
return (ui64)ECalculationHardness::NotSpecified;
6760
}
6861

6962
TString TCalculationProcessor::DoGetSignalCategoryName() const {

ydb/core/formats/arrow/program/execution.h

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,6 @@
1010

1111
namespace NKikimr::NArrow::NSSA {
1212

13-
enum class EIndexCheckOperation {
14-
Equals,
15-
StartsWith,
16-
EndsWith,
17-
Contains
18-
};
19-
2013
class TProcessorContext;
2114

2215
class IFetchLogic {
@@ -153,15 +146,15 @@ class IDataSource {
153146

154147
class TFetchIndexContext {
155148
public:
156-
using EOperation = EIndexCheckOperation;
149+
using TOperation = TIndexCheckOperation;
157150

158151
class TOperationsBySubColumn {
159152
private:
160153
std::optional<bool> FullColumnOperations;
161-
THashMap<TString, THashSet<EOperation>> Data;
154+
THashMap<TString, THashSet<TOperation>> Data;
162155

163156
public:
164-
const THashMap<TString, THashSet<EOperation>>& GetData() const {
157+
const THashMap<TString, THashSet<TOperation>>& GetData() const {
165158
return Data;
166159
}
167160

@@ -170,7 +163,7 @@ class IDataSource {
170163
return !*FullColumnOperations;
171164
}
172165

173-
TOperationsBySubColumn& Add(const TString& subColumn, const EOperation operation, const bool strict = true) {
166+
TOperationsBySubColumn& Add(const TString& subColumn, const TOperation operation, const bool strict = true) {
174167
if (FullColumnOperations) {
175168
AFL_VERIFY(*FullColumnOperations == !subColumn);
176169
} else {
@@ -196,7 +189,7 @@ class IDataSource {
196189
for (auto&& i : OperationsBySubColumn.GetData()) {
197190
auto& subColumnJson = result.InsertValue(i.first, NJson::JSON_ARRAY);
198191
for (auto&& op : i.second) {
199-
subColumnJson.AppendValue(::ToString(op));
192+
subColumnJson.AppendValue(op.DebugString());
200193
}
201194
}
202195
return result;
@@ -231,15 +224,19 @@ class IDataSource {
231224
private:
232225
YDB_READONLY(ui32, ColumnId, 0);
233226
YDB_READONLY_DEF(TString, SubColumnName);
234-
YDB_READONLY(EIndexCheckOperation, Operation, EIndexCheckOperation::Equals);
227+
TIndexCheckOperation Operation;
235228

236229
public:
237-
TCheckIndexContext(const ui32 columnId, const TString& subColumnName, const EIndexCheckOperation operation)
230+
TCheckIndexContext(const ui32 columnId, const TString& subColumnName, const TIndexCheckOperation& operation)
238231
: ColumnId(columnId)
239232
, SubColumnName(subColumnName)
240233
, Operation(operation) {
241234
}
242235

236+
// Returns a const reference to the stored Operation member.
const TIndexCheckOperation& GetOperation() const {
237+
return Operation;
238+
}
239+
243240
// Two contexts are equal when the column id, the sub-column name and the
// operation all match; fields are compared in that order.
bool operator==(const TCheckIndexContext& item) const {
    return ColumnId == item.ColumnId
        && SubColumnName == item.SubColumnName
        && Operation == item.Operation;
}

ydb/core/formats/arrow/program/graph_optimization.cpp

Lines changed: 60 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
#include <ydb/library/arrow_kernels/operations.h>
1111
#include <ydb/library/formats/arrow/switch/switch_type.h>
1212

13+
#include <library/cpp/string_utils/quote/quote.h>
1314
#include <util/string/builder.h>
15+
#include <util/string/escape.h>
1416
#include <yql/essentials/core/arrow_kernels/request/request.h>
1517

1618
namespace NKikimr::NArrow::NSSA::NGraph::NOptimization {
@@ -199,9 +201,10 @@ TConclusion<bool> TGraph::OptimizeMergeFetching(TGraphNode* baseNode) {
199201
if (!i.second->Is(EProcessorType::FetchOriginalData)) {
200202
continue;
201203
}
202-
if (i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetDataAddresses().size() +
203-
i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetIndexContext().size() +
204-
i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetHeaderContext().size() > 1) {
204+
if (i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetDataAddresses().size() +
205+
i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetIndexContext().size() +
206+
i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetHeaderContext().size() >
207+
1) {
205208
continue;
206209
}
207210
if (i.second->GetProcessorAs<TOriginalColumnDataProcessor>()->GetDataAddresses().size()) {
@@ -220,8 +223,7 @@ TConclusion<bool> TGraph::OptimizeMergeFetching(TGraphNode* baseNode) {
220223
for (auto&& i : dataAddresses) {
221224
columnIds.emplace(i->GetProcessorAs<TOriginalColumnDataProcessor>()->GetOutputColumnIdOnce());
222225
}
223-
auto proc =
224-
std::make_shared<TOriginalColumnDataProcessor>(std::vector<ui32>(columnIds.begin(), columnIds.end()));
226+
auto proc = std::make_shared<TOriginalColumnDataProcessor>(std::vector<ui32>(columnIds.begin(), columnIds.end()));
225227
for (auto&& i : dataAddresses) {
226228
for (auto&& addr : i->GetProcessorAs<TOriginalColumnDataProcessor>()->GetDataAddresses()) {
227229
proc->Add(addr.second);
@@ -230,7 +232,7 @@ TConclusion<bool> TGraph::OptimizeMergeFetching(TGraphNode* baseNode) {
230232
auto nodeFetch = AddNode(proc);
231233
FetchersMerged.emplace(nodeFetch->GetIdentifier());
232234
for (auto&& i : dataAddresses) {
233-
for (auto&& to: i->GetOutputEdges()) {
235+
for (auto&& to : i->GetOutputEdges()) {
234236
AddEdge(nodeFetch.get(), to.second, to.first.GetResourceId());
235237
}
236238
RemoveNode(i->GetIdentifier());
@@ -245,8 +247,7 @@ TConclusion<bool> TGraph::OptimizeMergeFetching(TGraphNode* baseNode) {
245247
for (auto&& i : headers) {
246248
columnIds.emplace(i->GetProcessorAs<TOriginalColumnDataProcessor>()->GetOutputColumnIdOnce());
247249
}
248-
auto proc =
249-
std::make_shared<TOriginalColumnDataProcessor>(std::vector<ui32>(columnIds.begin(), columnIds.end()));
250+
auto proc = std::make_shared<TOriginalColumnDataProcessor>(std::vector<ui32>(columnIds.begin(), columnIds.end()));
250251
for (auto&& i : indexes) {
251252
for (auto&& addr : i->GetProcessorAs<TOriginalColumnDataProcessor>()->GetIndexContext()) {
252253
proc->Add(addr.second);
@@ -361,11 +362,11 @@ TConclusion<bool> TGraph::OptimizeConditionsForIndexes(TGraphNode* condNode) {
361362
if (condNode->GetProcessor()->GetProcessorType() != EProcessorType::Calculation) {
362363
return false;
363364
}
364-
if (condNode->GetProcessor()->GetInput().size() != 2) {
365+
auto calc = condNode->GetProcessorAs<TCalculationProcessor>();
366+
if (!calc->GetKernelLogic()) {
365367
return false;
366368
}
367-
auto calc = condNode->GetProcessorAs<TCalculationProcessor>();
368-
if (!calc->GetYqlOperationId()) {
369+
if (condNode->GetProcessor()->GetInput().size() != 2) {
369370
return false;
370371
}
371372
if (condNode->GetOutputEdges().size() != 1) {
@@ -376,17 +377,7 @@ TConclusion<bool> TGraph::OptimizeConditionsForIndexes(TGraphNode* condNode) {
376377
if (constNode->GetProcessor()->GetProcessorType() != EProcessorType::Const) {
377378
return false;
378379
}
379-
if (!!calc->GetKernelLogic()) {
380-
if (!calc->GetKernelLogic()->IsBoolInResult()) {
381-
return false;
382-
}
383-
}
384-
if (calc->GetYqlOperationId()) {
385-
if (!IsBoolResultYqlOperator((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId())) {
386-
return false;
387-
}
388-
}
389-
if (!calc->GetYqlOperationId() && !calc->GetKernelLogic()) {
380+
if (!calc->GetKernelLogic()->IsBoolInResult()) {
390381
return false;
391382
}
392383
std::optional<TResourceAddress> dataAddr = GetOriginalAddress(dataNode);
@@ -395,63 +386,44 @@ TConclusion<bool> TGraph::OptimizeConditionsForIndexes(TGraphNode* condNode) {
395386
}
396387
auto* dest = condNode->GetOutputEdges().begin()->second;
397388
const ui32 destResourceId = condNode->GetOutputEdges().begin()->first.GetResourceId();
398-
if ((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::Equals ||
399-
(NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::StartsWith ||
400-
(NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::EndsWith ||
401-
(NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::StringContains) {
402-
if (!IndexesConstructed.emplace(condNode->GetIdentifier()).second) {
403-
return false;
404-
}
405-
RemoveEdge(condNode, dest, destResourceId);
406-
407-
const EIndexCheckOperation indexOperation = [&]() {
408-
if ((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::Equals) {
409-
return EIndexCheckOperation::Equals;
410-
}
411-
if ((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::StartsWith) {
412-
return EIndexCheckOperation::StartsWith;
413-
}
414-
if ((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::EndsWith) {
415-
return EIndexCheckOperation::EndsWith;
416-
}
417-
if ((NYql::TKernelRequestBuilder::EBinaryOp)*calc->GetYqlOperationId() == NYql::TKernelRequestBuilder::EBinaryOp::StringContains) {
418-
return EIndexCheckOperation::Contains;
419-
}
420-
return EIndexCheckOperation::Contains;
421-
AFL_VERIFY(false);
422-
}();
423-
424-
const ui32 resourceIdxFetch = BuildNextResourceId();
425-
IDataSource::TFetchIndexContext indexContext(dataAddr->GetColumnId(),
426-
IDataSource::TFetchIndexContext::TOperationsBySubColumn().Add(dataAddr->GetSubColumnName(), indexOperation));
427-
auto indexFetchProc = std::make_shared<TOriginalColumnDataProcessor>(resourceIdxFetch, indexContext);
428-
auto indexFetchNode = AddNode(indexFetchProc);
429-
RegisterProducer(resourceIdxFetch, indexFetchNode.get());
430-
431-
const ui32 resourceIdIndexToAnd = BuildNextResourceId();
432-
IDataSource::TCheckIndexContext checkIndexContext(dataAddr->GetColumnId(), dataAddr->GetSubColumnName(), indexOperation);
433-
auto indexCheckProc = std::make_shared<TIndexCheckerProcessor>(
434-
resourceIdxFetch, constNode->GetProcessor()->GetOutputColumnIdOnce(), checkIndexContext, resourceIdIndexToAnd);
435-
auto indexProcNode = AddNode(indexCheckProc);
436-
RegisterProducer(resourceIdIndexToAnd, indexProcNode.get());
437-
AddEdge(indexFetchNode.get(), indexProcNode.get(), resourceIdxFetch);
438-
AddEdge(constNode, indexProcNode.get(), constNode->GetProcessor()->GetOutputColumnIdOnce());
439-
440-
const ui32 resourceIdEqToAnd = BuildNextResourceId();
441-
RegisterProducer(resourceIdEqToAnd, condNode);
442-
calc->SetOutputResourceIdOnce(resourceIdEqToAnd);
443-
444-
auto andProcessor = std::make_shared<TStreamLogicProcessor>(TColumnChainInfo::BuildVector({ resourceIdEqToAnd, resourceIdIndexToAnd }),
445-
TColumnChainInfo(destResourceId), NKernels::EOperation::And);
446-
auto andNode = AddNode(andProcessor);
447-
AddEdge(andNode.get(), dest, destResourceId);
448-
449-
AddEdge(indexProcNode.get(), andNode.get(), resourceIdIndexToAnd);
450-
AddEdge(condNode, andNode.get(), resourceIdEqToAnd);
451-
ResetProducer(destResourceId, andNode.get());
452-
return true;
389+
auto indexChecker = calc->GetKernelLogic()->GetIndexCheckerOperation();
390+
if (!indexChecker) {
391+
return false;
453392
}
454-
return false;
393+
if (!IndexesConstructed.emplace(condNode->GetIdentifier()).second) {
394+
return false;
395+
}
396+
RemoveEdge(condNode, dest, destResourceId);
397+
398+
const ui32 resourceIdxFetch = BuildNextResourceId();
399+
IDataSource::TFetchIndexContext indexContext(
400+
dataAddr->GetColumnId(), IDataSource::TFetchIndexContext::TOperationsBySubColumn().Add(dataAddr->GetSubColumnName(), *indexChecker));
401+
auto indexFetchProc = std::make_shared<TOriginalColumnDataProcessor>(resourceIdxFetch, indexContext);
402+
auto indexFetchNode = AddNode(indexFetchProc);
403+
RegisterProducer(resourceIdxFetch, indexFetchNode.get());
404+
405+
const ui32 resourceIdIndexToAnd = BuildNextResourceId();
406+
IDataSource::TCheckIndexContext checkIndexContext(dataAddr->GetColumnId(), dataAddr->GetSubColumnName(), *indexChecker);
407+
auto indexCheckProc = std::make_shared<TIndexCheckerProcessor>(
408+
resourceIdxFetch, constNode->GetProcessor()->GetOutputColumnIdOnce(), checkIndexContext, resourceIdIndexToAnd);
409+
auto indexProcNode = AddNode(indexCheckProc);
410+
RegisterProducer(resourceIdIndexToAnd, indexProcNode.get());
411+
AddEdge(indexFetchNode.get(), indexProcNode.get(), resourceIdxFetch);
412+
AddEdge(constNode, indexProcNode.get(), constNode->GetProcessor()->GetOutputColumnIdOnce());
413+
414+
const ui32 resourceIdEqToAnd = BuildNextResourceId();
415+
RegisterProducer(resourceIdEqToAnd, condNode);
416+
calc->SetOutputResourceIdOnce(resourceIdEqToAnd);
417+
418+
auto andProcessor = std::make_shared<TStreamLogicProcessor>(
419+
TColumnChainInfo::BuildVector({ resourceIdEqToAnd, resourceIdIndexToAnd }), TColumnChainInfo(destResourceId), NKernels::EOperation::And);
420+
auto andNode = AddNode(andProcessor);
421+
AddEdge(andNode.get(), dest, destResourceId);
422+
423+
AddEdge(indexProcNode.get(), andNode.get(), resourceIdIndexToAnd);
424+
AddEdge(condNode, andNode.get(), resourceIdEqToAnd);
425+
ResetProducer(destResourceId, andNode.get());
426+
return true;
455427
}
456428

457429
bool TGraph::IsBoolResultYqlOperator(const NYql::TKernelRequestBuilder::EBinaryOp op) const {
@@ -687,16 +659,16 @@ TConclusionStatus TGraph::Collapse() {
687659
}
688660
}
689661

690-
// {
691-
// auto conclusion = OptimizeConditionsForHeadersCheck(n.get());
692-
// if (conclusion.IsFail()) {
693-
// return conclusion;
694-
// }
695-
// if (*conclusion) {
696-
// hasChanges = true;
697-
// break;
698-
// }
699-
// }
662+
// {
663+
// auto conclusion = OptimizeConditionsForHeadersCheck(n.get());
664+
// if (conclusion.IsFail()) {
665+
// return conclusion;
666+
// }
667+
// if (*conclusion) {
668+
// hasChanges = true;
669+
// break;
670+
// }
671+
// }
700672

701673
{
702674
auto conclusion = OptimizeConditionsForStream(n.get());

ydb/core/formats/arrow/program/index.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class TIndexCheckerProcessor: public IResourceProcessor {
2828
bool ApplyToFilterFlag = false;
2929

3030
virtual TString DoGetSignalCategoryName() const override {
31-
return ::ToString(GetProcessorType()) + "::" + ::ToString(IndexContext.GetOperation());
31+
return ::ToString(GetProcessorType()) + "::" + IndexContext.GetOperation().GetSignalId();
3232
}
3333

3434
public:

0 commit comments

Comments
 (0)