Skip to content

Commit fc12c0e

Browse files
dyemanov, actions-user, asfernandes, Noremos, Artyom Abakumov
authored
Unnest IN/ANY/EXISTS subqueries and optimize them using semi-join algorithm (#8061)
* Added support for semi/anti and outer joins to hash join algorithm. Reimplemented support for semi/anti joins inside the nested loop algorithm. Slightly changed implementation of full outer joins. Added transformation of IN/EXISTS subqueries into lateral semi-joins. Basic optimizer support for semi-joins. More efficient optimization for cross joins. Added some debug info (hash table statistics) for hash joins. * Sync with v5. Better dependency tracking for independently generated RSBs, add a primitive estimation for hash vs loop semi-joins. Remove the outdated dependency tracking. Exclude parent conjuncts from the invariant checks. Postfix for EXISTS/IN with procedures inside --------- Co-authored-by: GitHub Action <action@github.com> Co-authored-by: Adriano dos Santos Fernandes <529415+asfernandes@users.noreply.github.com> Co-authored-by: Artyom Abakumov <artem210966@gmail.com> Co-authored-by: Adriano dos Santos Fernandes <adrianosf@gmail.com> Co-authored-by: Artyom Abakumov <artyom.abakumov@red-soft.ru> Co-authored-by: Vlad Khorsun <hvlad@users.sourceforge.net> Co-authored-by: AlexPeshkoff <alexander.peshkoff@gmail.com> Co-authored-by: Dimitry Sibiryakov <sd@ibphoenix.com> Co-authored-by: Dmitry Kovalenko <dmitry.lipetsk@gmail.com> Co-authored-by: Dmitry Starodubov <dmitry.starodubov@red-soft.ru> Co-authored-by: TreeHunter <60896014+TreeHunter9@users.noreply.github.com> Co-authored-by: Artyom Ivanov <artyom.ivanov@red-soft.ru> Co-authored-by: Paul Reeves <preeves@ibphoenix.com> Co-authored-by: Alexander <116901579+Zhdanov0@users.noreply.github.com> Co-authored-by: Alexander Zhdanov <alexander.zhdanov@red-soft.ru> Co-authored-by: Marat Iskakov R. <44665373+marat-iskakov@users.noreply.github.com> Co-authored-by: MaratIskakov <maratiskakov@10.0.2.15>
1 parent 71bf66a commit fc12c0e

25 files changed

+935
-195
lines changed

src/jrd/RecordSourceNodes.cpp

Lines changed: 253 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,165 @@ static ValueExprNode* resolveUsingField(DsqlCompilerScratch* dsqlScratch, const
5454

5555
namespace
5656
{
57+
// Search through the list of ANDed booleans to find comparisons
58+
// referring streams of parent select expressions.
59+
// Extract those booleans and return them to the caller.
60+
61+
bool findDependentBooleans(CompilerScratch* csb,
62+
const StreamList& rseStreams,
63+
BoolExprNode** parentBoolean,
64+
BoolExprNodeStack& booleanStack)
65+
{
66+
const auto boolean = *parentBoolean;
67+
68+
const auto binaryNode = nodeAs<BinaryBoolNode>(boolean);
69+
if (binaryNode && binaryNode->blrOp == blr_and)
70+
{
71+
const bool found1 = findDependentBooleans(csb, rseStreams,
72+
binaryNode->arg1.getAddress(), booleanStack);
73+
const bool found2 = findDependentBooleans(csb, rseStreams,
74+
binaryNode->arg2.getAddress(), booleanStack);
75+
76+
if (!binaryNode->arg1 && !binaryNode->arg2)
77+
*parentBoolean = nullptr;
78+
else if (!binaryNode->arg1)
79+
*parentBoolean = binaryNode->arg2;
80+
else if (!binaryNode->arg2)
81+
*parentBoolean = binaryNode->arg1;
82+
83+
return (found1 || found2);
84+
}
85+
86+
if (const auto cmpNode = nodeAs<ComparativeBoolNode>(boolean))
87+
{
88+
if (cmpNode->blrOp == blr_eql || cmpNode->blrOp == blr_equiv)
89+
{
90+
SortedStreamList streams;
91+
cmpNode->collectStreams(streams);
92+
93+
for (const auto stream : streams)
94+
{
95+
if (rseStreams.exist(stream))
96+
{
97+
booleanStack.push(boolean);
98+
*parentBoolean = nullptr;
99+
return true;
100+
}
101+
}
102+
}
103+
}
104+
105+
return false;
106+
}
107+
108+
// Search through the list of ANDed booleans to find correlated EXISTS/IN sub-queries.
109+
// They are candidates to be converted into semi- or anti-joins.
110+
111+
bool findPossibleJoins(CompilerScratch* csb,
112+
const StreamList& rseStreams,
113+
BoolExprNode** parentBoolean,
114+
RecordSourceNodeStack& rseStack,
115+
BoolExprNodeStack& booleanStack)
116+
{
117+
auto boolNode = *parentBoolean;
118+
119+
const auto binaryNode = nodeAs<BinaryBoolNode>(boolNode);
120+
if (binaryNode && binaryNode->blrOp == blr_and)
121+
{
122+
const bool found1 = findPossibleJoins(csb, rseStreams,
123+
binaryNode->arg1.getAddress(), rseStack, booleanStack);
124+
const bool found2 = findPossibleJoins(csb, rseStreams,
125+
binaryNode->arg2.getAddress(), rseStack, booleanStack);
126+
127+
if (!binaryNode->arg1 && !binaryNode->arg2)
128+
*parentBoolean = nullptr;
129+
else if (!binaryNode->arg1)
130+
*parentBoolean = binaryNode->arg2;
131+
else if (!binaryNode->arg2)
132+
*parentBoolean = binaryNode->arg1;
133+
134+
return (found1 || found2);
135+
}
136+
137+
const auto rseNode = nodeAs<RseBoolNode>(boolNode);
138+
// Both EXISTS (blr_any) and IN (blr_ansi_any) sub-queries are handled
139+
if (rseNode && (rseNode->blrOp == blr_any || rseNode->blrOp == blr_ansi_any))
140+
{
141+
auto rse = rseNode->rse;
142+
fb_assert(rse && (rse->flags & RseNode::FLAG_SUB_QUERY));
143+
144+
if (rse->rse_boolean && rse->rse_jointype == blr_inner &&
145+
!rse->rse_first && !rse->rse_skip && !rse->rse_plan)
146+
{
147+
// Find booleans convertable into semi-joins
148+
149+
StreamList streams;
150+
rse->computeRseStreams(streams);
151+
152+
BoolExprNodeStack booleans;
153+
if (findDependentBooleans(csb, rseStreams,
154+
rse->rse_boolean.getAddress(),
155+
booleans))
156+
{
157+
// Compose the conjunct boolean
158+
159+
fb_assert(booleans.hasData());
160+
auto boolean = booleans.pop();
161+
while (booleans.hasData())
162+
{
163+
const auto andNode = FB_NEW_POOL(csb->csb_pool)
164+
BinaryBoolNode(csb->csb_pool, blr_and);
165+
andNode->arg1 = boolean;
166+
andNode->arg2 = booleans.pop();
167+
boolean = andNode;
168+
}
169+
170+
// Ensure that no external references are left inside the subquery.
171+
// If so, mark the RSE as joined and add it to the stack.
172+
173+
SortedStreamList streams;
174+
rse->collectStreams(streams);
175+
176+
bool dependent = false;
177+
for (const auto stream : streams)
178+
{
179+
if (rseStreams.exist(stream))
180+
{
181+
dependent = true;
182+
break;
183+
}
184+
}
185+
186+
if (!dependent)
187+
{
188+
rse->flags &= ~RseNode::FLAG_SUB_QUERY;
189+
rse->flags |= RseNode::FLAG_SEMI_JOINED;
190+
rseStack.push(rse);
191+
booleanStack.push(boolean);
192+
*parentBoolean = nullptr;
193+
return true;
194+
}
195+
196+
// Otherwise, restore the original sub-query by adding
197+
// the collected booleans back to the RSE.
198+
199+
if (rse->rse_boolean)
200+
{
201+
const auto andNode = FB_NEW_POOL(csb->csb_pool)
202+
BinaryBoolNode(csb->csb_pool, blr_and);
203+
andNode->arg1 = boolean;
204+
andNode->arg2 = rse->rse_boolean;
205+
boolean = andNode;
206+
}
207+
208+
rse->rse_boolean = boolean;
209+
}
210+
}
211+
}
212+
213+
return false;
214+
}
215+
57216
class AutoActivateResetStreams : public AutoStorage
58217
{
59218
public:
@@ -3025,6 +3184,9 @@ RseNode* RseNode::pass1(thread_db* tdbb, CompilerScratch* csb)
30253184
{
30263185
SET_TDBB(tdbb);
30273186

3187+
if (const auto newRse = processPossibleJoins(tdbb, csb))
3188+
return newRse->pass1(tdbb, csb);
3189+
30283190
// for scoping purposes, maintain a stack of RseNode's which are
30293191
// currently being parsed; if there are none on the stack as
30303192
// yet, mark the RseNode as variant to make sure that statement-
@@ -3130,6 +3292,12 @@ RseNode* RseNode::pass1(thread_db* tdbb, CompilerScratch* csb)
31303292
void RseNode::pass1Source(thread_db* tdbb, CompilerScratch* csb, RseNode* rse,
31313293
BoolExprNode** boolean, RecordSourceNodeStack& stack)
31323294
{
3295+
if (const auto newRse = processPossibleJoins(tdbb, csb))
3296+
{
3297+
newRse->pass1Source(tdbb, csb, rse, boolean, stack);
3298+
return;
3299+
}
3300+
31333301
if (rse_jointype != blr_inner)
31343302
{
31353303
// Check whether any of the upper level booleans (those belonging to the WHERE clause)
@@ -3183,15 +3351,15 @@ void RseNode::pass1Source(thread_db* tdbb, CompilerScratch* csb, RseNode* rse,
31833351
}
31843352
}
31853353

3186-
// in the case of an RseNode, it is possible that a new RseNode will be generated,
3354+
// In the case of an RseNode, it is possible that a new RseNode will be generated,
31873355
// so wait to process the source before we push it on the stack (bug 8039)
31883356

31893357
// The addition of the JOIN syntax for specifying inner joins causes an
31903358
// RseNode tree to be generated, which is undesirable in the simplest case
31913359
// where we are just trying to inner join more than 2 streams. If possible,
31923360
// try to flatten the tree out before we go any further.
31933361

3194-
if (!isLateral() &&
3362+
if (!isLateral() && !isSemiJoined() &&
31953363
rse->rse_jointype == blr_inner &&
31963364
rse_jointype == blr_inner &&
31973365
!rse_sorted && !rse_projection &&
@@ -3296,11 +3464,11 @@ RecordSource* RseNode::compile(thread_db* tdbb, Optimizer* opt, bool innerSubStr
32963464

32973465
StreamStateHolder stateHolder(csb, opt->getOuterStreams());
32983466

3299-
if (opt->isLeftJoin() || isLateral())
3467+
if (opt->isLeftJoin() || isLateral() || isSemiJoined())
33003468
{
33013469
stateHolder.activate();
33023470

3303-
if (opt->isLeftJoin())
3471+
if (opt->isLeftJoin() || isSemiJoined())
33043472
{
33053473
// Push all conjuncts except "missing" ones (e.g. IS NULL)
33063474
for (auto iter = opt->getConjuncts(false, true); iter.hasData(); ++iter)
@@ -3323,6 +3491,87 @@ RecordSource* RseNode::compile(thread_db* tdbb, Optimizer* opt, bool innerSubStr
33233491
return opt->compile(this, &conjunctStack);
33243492
}
33253493

3494+
// Try to convert the EXISTS/IN sub-queries found in this RSE's boolean into joins.
//
// On success the detected sub-query RSEs are detached from rse_boolean and a chain
// of new inner-join RSEs is built: each new node joins the node built so far
// (initially this RSE) with one detected RSE, using the booleans extracted from
// that sub-query as its rse_boolean. FIRST/SKIP are moved from this RSE to an extra
// wrapping RSE, and the original flags are transferred to the outermost new node.
//
// Returns the outermost new RSE, or nullptr if nothing was converted.
RseNode* RseNode::processPossibleJoins(thread_db* tdbb, CompilerScratch* csb)
{
	// Only inner joins having a boolean and no explicit plan are considered
	if (!rse_boolean || rse_plan || rse_jointype != blr_inner)
		return nullptr;

	// If the sub-query is nested inside the other sub-query which wasn't converted into semi-join,
	// it makes no sense to apply a semi-join at the deeper levels, as a sub-query is expected
	// to be executed repeatedly.
	// This is a temporary fix until nested loop semi-joins are allowed by the optimizer.

	if (flags & FLAG_SUB_QUERY)
		return nullptr;

	for (const auto node : csb->csb_current_nodes)
	{
		const auto outerRse = nodeAs<RseNode>(node);

		if (outerRse && (outerRse->flags & FLAG_SUB_QUERY))
			return nullptr;
	}

	// Find possibly joinable sub-queries

	StreamList ownStreams;
	computeRseStreams(ownStreams);

	RecordSourceNodeStack joinedRses;
	BoolExprNodeStack joinBooleans;

	if (!findPossibleJoins(csb, ownStreams, rse_boolean.getAddress(), joinedRses, joinBooleans))
		return nullptr;

	fb_assert(joinedRses.hasData() && joinBooleans.hasData());
	fb_assert(joinedRses.getCount() == joinBooleans.getCount());

	// Create joins between the original node and detected joinable nodes.
	// Preserve FIRST/SKIP nodes at their original position, i.e. outside semi-joins.

	const auto savedFirst = rse_first;
	const auto savedSkip = rse_skip;
	const auto savedFlags = flags;

	rse_first = nullptr;
	rse_skip = nullptr;
	flags = 0;

	// Allocates an empty inner-join RSE in the default pool of the thread
	const auto makeInnerRse = [tdbb]()
	{
		const auto node = FB_NEW_POOL(*tdbb->getDefaultPool())
			RseNode(*tdbb->getDefaultPool());

		node->rse_jointype = blr_inner;
		return node;
	};

	RseNode* result = this;

	while (joinedRses.hasData())
	{
		// The node built so far goes first, the former sub-query second
		const auto joinRse = makeInnerRse();

		joinRse->rse_relations.add(result);
		joinRse->rse_relations.add(joinedRses.pop());
		joinRse->rse_boolean = joinBooleans.pop();

		result = joinRse;
	}

	if (savedFirst || savedSkip)
	{
		// FIRST/SKIP are applied on top of the whole join chain
		const auto limitRse = makeInnerRse();

		limitRse->rse_relations.add(result);
		limitRse->rse_first = savedFirst;
		limitRse->rse_skip = savedSkip;

		result = limitRse;
	}

	result->flags = savedFlags;

	return result;
}
3574+
33263575
// Check that all streams in the RseNode have a plan specified for them.
33273576
// If they are not, there are streams in the RseNode which were not mentioned in the plan.
33283577
void RseNode::planCheck(const CompilerScratch* csb) const

src/jrd/RecordSourceNodes.h

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -718,14 +718,15 @@ class RseNode final : public TypedNode<RecordSourceNode, RecordSourceNode::TYPE_
718718
public:
719719
enum : USHORT
720720
{
721-
FLAG_VARIANT = 0x01, // variant (not invariant?)
722-
FLAG_SINGULAR = 0x02, // singleton select
723-
FLAG_WRITELOCK = 0x04, // locked for write
724-
FLAG_SCROLLABLE = 0x08, // scrollable cursor
725-
FLAG_DSQL_COMPARATIVE = 0x10, // transformed from DSQL ComparativeBoolNode
726-
FLAG_LATERAL = 0x20, // lateral derived table
727-
FLAG_SKIP_LOCKED = 0x40, // skip locked
728-
FLAG_SUB_QUERY = 0x80 // sub-query
721+
FLAG_VARIANT = 0x01, // variant (not invariant?)
722+
FLAG_SINGULAR = 0x02, // singleton select
723+
FLAG_WRITELOCK = 0x04, // locked for write
724+
FLAG_SCROLLABLE = 0x08, // scrollable cursor
725+
FLAG_DSQL_COMPARATIVE = 0x10, // transformed from DSQL ComparativeBoolNode
726+
FLAG_LATERAL = 0x20, // lateral derived table
727+
FLAG_SKIP_LOCKED = 0x40, // skip locked
728+
FLAG_SUB_QUERY = 0x80, // sub-query
729+
FLAG_SEMI_JOINED = 0x100 // participates in semi-join
729730
};
730731

731732
bool isInvariant() const
@@ -753,6 +754,11 @@ class RseNode final : public TypedNode<RecordSourceNode, RecordSourceNode::TYPE_
753754
return (flags & FLAG_SUB_QUERY) != 0;
754755
}
755756

757+
bool isSemiJoined() const
758+
{
759+
return (flags & FLAG_SEMI_JOINED) != 0;
760+
}
761+
756762
bool hasWriteLock() const
757763
{
758764
return (flags & FLAG_WRITELOCK) != 0;
@@ -857,6 +863,7 @@ class RseNode final : public TypedNode<RecordSourceNode, RecordSourceNode::TYPE_
857863
private:
858864
void planCheck(const CompilerScratch* csb) const;
859865
static void planSet(CompilerScratch* csb, PlanNode* plan);
866+
RseNode* processPossibleJoins(thread_db* tdbb, CompilerScratch* csb);
860867

861868
public:
862869
NestConst<ValueExprNode> dsqlFirst;

src/jrd/optimizer/InnerJoin.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ void InnerJoin::calculateStreamInfo()
108108
innerStream->baseIndexes = candidate->indexes;
109109
innerStream->baseUnique = candidate->unique;
110110
innerStream->baseNavigated = candidate->navigated;
111+
innerStream->baseConjuncts = candidate->conjuncts;
111112

112113
csb->csb_rpt[innerStream->number].deactivate();
113114
}
@@ -579,13 +580,15 @@ River* InnerJoin::formRiver()
579580

580581
// Create a hash join
581582
rsb = FB_NEW_POOL(getPool())
582-
HashJoin(tdbb, csb, 2, hashJoinRsbs, keys.begin(), stream.selectivity);
583+
HashJoin(tdbb, csb, INNER_JOIN, 2, hashJoinRsbs, keys.begin(), stream.selectivity);
583584

584585
// Clear priorly processed rsb's, as they're already incorporated into a hash join
585586
rsbs.clear();
586587
}
587588
else
589+
{
588590
rsb = optimizer->generateRetrieval(stream.number, sortPtr, false, false);
591+
}
589592

590593
rsbs.add(rsb);
591594
streams.add(stream.number);

0 commit comments

Comments
 (0)