Skip to content

Commit

Permalink
Fix orca preprocess step for query with Select-Project-NaryJoin patte…
Browse files Browse the repository at this point in the history
…rn (#17423)

Issue:
A crash occurred when executing a query with the pattern
Select-Project-NaryJoin whose Select predicate also contains the pattern
ScalarSubquery-Project-Select-Project-NaryJoin.

RCA:
During the preprocessing step `PexprTransposeSelectAndProject`,
`CollapseSelectAndReplaceColref` is called on the expr matching the pattern.
Because that function recurses into the Select's predicate, the Project expr
nested inside the predicate is dropped as well, but its column references are
not replaced with the equivalent dropped project expr: the function only
takes care of removing the Project and never re-adds a Project on top of the
collapsed Select.

Fix:
Now function `CollapseSelectAndReplaceColref` is removed.

To create the collapsed Select:
* First, the colrefs of the columns projected in the project list are replaced
 with their equivalent Project exprs in the Select's predicate expr.
 This happens in the refactored function `CUtils::ReplaceColrefWithProjectExpr`.
* Then the new Select predicate (with replaced colrefs) and the NaryJoin expr
 are each transposed recursively with `PexprTransposeSelectAndProject`.
* The collapsed Select is created from the transposed NaryJoin and the transposed Select predicate.

Collapsed Select is then added as a child to the CLogicalProject creating the new transposed expr.

Input:
+--CLogicalSelect
   |--CLogicalProject
   |  +--CLogicalNAryJoin
   +--...
      +--CLogicalSelect
         +--CLogicalProject
            +--CLogicalNAryJoin

Old Output:
+--CLogicalProject
   |--CLogicalSelect
   |  +--CLogicalNAryJoin
   +--...
         +--CLogicalSelect
            +--CLogicalNAryJoin

Fixed output:
+--CLogicalProject
   |--CLogicalSelect
   |  +--CLogicalNAryJoin
   +--...
      +--CLogicalProject
         +--CLogicalSelect
            +--CLogicalNAryJoin


* New regression test cases are added that test queries containing the above pattern.
* A new mdp test case is added that tests a query with the above pattern.
  • Loading branch information
hpbee authored May 15, 2024
1 parent d41557a commit d126b4e
Show file tree
Hide file tree
Showing 9 changed files with 1,353 additions and 79 deletions.

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/backend/gporca/libgpopt/include/gpopt/base/CUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,12 @@ class CUtils

static CTableDescriptorHashSet *RemoveDuplicateMdids(
CMemoryPool *mp, CTableDescriptorHashSet *tabdescs);

// Rebuild pexpr, replacing every scalar identifier that references pcolref
// with pprojExpr; all other nodes are re-created over their processed children
static CExpression *ReplaceColrefWithProjectExpr(CMemoryPool *mp,
CExpression *pexpr,
CColRef *pcolref,
CExpression *pprojExpr);

}; // class CUtils

// hash set from expressions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,11 +222,6 @@ class CExpressionPreprocessor
static CExpression *ConvertSplitUpdateToInPlaceUpdate(CMemoryPool *mp,
CExpression *expr);

static CExpression *CollapseSelectAndReplaceColref(CMemoryPool *mp,
CExpression *expr,
CColRef *pcolref,
CExpression *pprojExpr);

public:
CExpressionPreprocessor() = delete;

Expand Down
25 changes: 25 additions & 0 deletions src/backend/gporca/libgpopt/src/base/CUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5052,4 +5052,29 @@ CUtils::RemoveDuplicateMdids(CMemoryPool *mp, CTableDescriptorHashSet *tabdescs)
mdids->Release();
return result;
}

// Rebuild the expression tree rooted at pexpr, substituting pprojExpr for
// every scalar identifier whose column reference equals pcolref.
//
//   mp        - memory pool used to allocate the rebuilt nodes
//   pexpr     - root of the expression tree to walk
//   pcolref   - column reference to search for
//   pprojExpr - expression handed back (after taking an extra reference on
//               it) in place of each matching identifier
//
// Every non-matching node is re-created from its own operator (an extra
// reference is taken on the operator) and its recursively processed children.
CExpression *
CUtils::ReplaceColrefWithProjectExpr(CMemoryPool *mp, CExpression *pexpr,
                                     CColRef *pcolref, CExpression *pprojExpr)
{
    COperator *popNode = pexpr->Pop();

    // an identifier bound to pcolref is swapped for the project expression
    if (COperator::EopScalarIdent == popNode->Eopid())
    {
        if (CColRef::Equals(CScalarIdent::PopConvert(popNode)->Pcr(), pcolref))
        {
            pprojExpr->AddRef();
            return pprojExpr;
        }
    }

    // any other node: process each child in order, then rebuild the node
    CExpressionArray *pdrgpexprRebuilt = GPOS_NEW(mp) CExpressionArray(mp);
    const ULONG ulChildren = pexpr->Arity();
    ULONG ulChild = 0;
    while (ulChild < ulChildren)
    {
        CExpression *pexprChild = ReplaceColrefWithProjectExpr(
            mp, (*pexpr)[ulChild], pcolref, pprojExpr);
        pdrgpexprRebuilt->Append(pexprChild);
        ++ulChild;
    }

    popNode->AddRef();
    return GPOS_NEW(mp) CExpression(mp, popNode, pdrgpexprRebuilt);
}
// EOF
133 changes: 60 additions & 73 deletions src/backend/gporca/libgpopt/src/operators/CExpressionPreprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3007,59 +3007,6 @@ CExpressionPreprocessor::PcnstrFromChildPartition(
return cnstr;
}

// Collapse a select over a project and update column reference.
//
// Walks pexpr recursively and returns a rebuilt expression in which:
//  * every Select-over-Project-over-NAryJoin is turned into a Select placed
//    directly over the NAryJoin (the Project is dropped and NOT re-added),
//    and the result is passed through PexprTransposeSelectAndProject;
//  * every scalar identifier referencing pcolref is replaced by pprojExpr;
//  * every other node is re-created with its operator and processed children.
//
// NOTE(review): the collapse check is applied at every level of the
// recursion, including inside a Select's predicate. A nested
// Select-Project-NAryJoin found there loses its Project while only pcolref
// (not the nested Project's own columns) is substituted. The accompanying
// commit message names this as the cause of a crash.
CExpression *
CExpressionPreprocessor::CollapseSelectAndReplaceColref(CMemoryPool *mp,
CExpression *pexpr,
CColRef *pcolref,
CExpression *pprojExpr)
{
// remove the logical project
//
// Input:
// +--CLogicalSelect (x = 'meh')
// +--CLogicalProject (col1...n, expr as x)
// +-- CLogicalNAryJoin
// Output:
// +--CLogicalSelect (expr = 'meh')
// +-- CLogicalNAryJoin
if (pexpr->Pop()->Eopid() == COperator::EopLogicalSelect &&
(*pexpr)[0]->Pop()->Eopid() == COperator::EopLogicalProject &&
(*(*pexpr)[0])[0]->Pop()->Eopid() == COperator::EopLogicalNAryJoin)
{
// the NAryJoin becomes a child of the new select, so take a reference on it
(*(*pexpr)[0])[0]->AddRef();
// new select: NAryJoin as relational child, recursively processed
// original predicate as scalar child
CExpression *pexprCollapsedSelect = GPOS_NEW(mp)
CExpression(mp, GPOS_NEW(mp) CLogicalSelect(mp), (*(*pexpr)[0])[0],
CollapseSelectAndReplaceColref(mp, (*pexpr)[1], pcolref,
pprojExpr));

// hand the collapsed select to the transpose step; the intermediate
// expression is released once the transposed result has been obtained
CExpression *pexprTransposed =
PexprTransposeSelectAndProject(mp, pexprCollapsedSelect);
pexprCollapsedSelect->Release();
return pexprTransposed;
}

// replace reference
// (an identifier bound to pcolref yields pprojExpr with an added reference)
if (pexpr->Pop()->Eopid() == COperator::EopScalarIdent &&
CColRef::Equals(CScalarIdent::PopConvert(pexpr->Pop())->Pcr(), pcolref))
{
pprojExpr->AddRef();
return pprojExpr;
}

// recurse to children
CExpressionArray *pdrgpexprChildren = GPOS_NEW(mp) CExpressionArray(mp);
for (ULONG ul = 0; ul < pexpr->Arity(); ul++)
{
pdrgpexprChildren->Append(CollapseSelectAndReplaceColref(
mp, (*pexpr)[ul], pcolref, pprojExpr));
}

// rebuild the current node: same operator (extra reference taken) over the
// processed children
COperator *pop = pexpr->Pop();
pop->AddRef();
return GPOS_NEW(mp) CExpression(mp, pop, pdrgpexprChildren);
}

// Transpose a select over a project
//
// This preprocessing step enables additional opportunities for predicate push
Expand Down Expand Up @@ -3139,9 +3086,8 @@ CExpressionPreprocessor::PexprTransposeSelectAndProject(CMemoryPool *mp,
{
CExpression *pproject = (*pexpr)[0];
CExpression *pprojectList = (*pproject)[1];
CExpression *pselectNew = pexpr;

CExpressionArray *pdrgpexpr = GPOS_NEW(mp) CExpressionArray(mp);
// Return same pexpr if transpose is not possible
for (ULONG ul = 0; ul < pprojectList->Arity(); ul++)
{
CExpression *pprojexpr =
Expand All @@ -3168,25 +3114,66 @@ CExpressionPreprocessor::PexprTransposeSelectAndProject(CMemoryPool *mp,
pexpr->AddRef();
return pexpr;
}

// TODO: In order to support mixed pushable and non-pushable
// predicates we need to be able to deconstruct a select
// conjunction constraint into pushable and non-pushable
// parts.
//
// NB: JoinOnViewWithMixOfPushableAndNonpushablePredicates.mdp
CExpression *prevpselectNew = pselectNew;
pselectNew = CollapseSelectAndReplaceColref(
mp, prevpselectNew,
CUtils::PNthProjectElement(pproject, ul)->Pcr(),
CUtils::PNthProjectElementExpr(pproject, ul));
if (pexpr != prevpselectNew)
{
prevpselectNew->Release();
}
}
pdrgpexpr->Append(pselectNew);

// TODO: In order to support mixed pushable and non-pushable
// predicates we need to be able to deconstruct a select
// conjunction constraint into pushable and non-pushable
// parts.
//
// NB: JoinOnViewWithMixOfPushableAndNonpushablePredicates.mdp


// Transpose new select containing predicate with updated colrefs and NaryJoin recursively
// remove the logical project
//
// Input of step:
// +--CLogicalSelect (x = 'meh')
// +--CLogicalProject (col1...n, expr as x)
// +-- CLogicalNAryJoin
// Output of step:
// +--CLogicalSelect (expr = 'meh')
// +-- CLogicalNAryJoin

// Replace colref's of the columns projected in the project list in the Select's predicate expr.
CExpression *pexprSelectPred = (*pexpr)[1];
pexprSelectPred->AddRef();
for (ULONG ul = 0; ul < pprojectList->Arity(); ul++)
{
CExpression *pprojexpr =
CUtils::PNthProjectElementExpr(pproject, ul);
gpopt::CColRef *pcolref =
CUtils::PNthProjectElement(pproject, ul)->Pcr();
CExpression *pexprSelectPredTemp =
CUtils::ReplaceColrefWithProjectExpr(mp, pexprSelectPred,
pcolref, pprojexpr);
pexprSelectPred->Release();
pexprSelectPred = pexprSelectPredTemp;
}

// Transpose Select's Relation expr so it can be used to create collapsed select
CExpression *pexprSelectRelNew =
PexprTransposeSelectAndProject(mp, (*(*pexpr)[0])[0]);

// Transpose Select's Predicate expr so it can be used to create collapsed select
CExpression *pexprSelectPredNew =
PexprTransposeSelectAndProject(mp, pexprSelectPred);
pexprSelectPred->Release();

// Create collapsed select from updated Select children
CExpression *pexprCollapsedSelect =
GPOS_NEW(mp) CExpression(mp, GPOS_NEW(mp) CLogicalSelect(mp),
pexprSelectRelNew, pexprSelectPredNew);

// Re-add the CLogicalProject on top of collapsed select
// Input of step:
// +--CLogicalSelect (x = 'meh')
// +--CLogicalProject (col1...n, expr as x)
// +-- CLogicalNAryJoin
// Output of step:
// +--CLogicalProject (col1...n, expr as x)
// +--CLogicalSelect (expr = 'meh')
// +-- CLogicalNAryJoin
pdrgpexpr->Append(pexprCollapsedSelect);
CExpressionArray *pdrgpprojelems = GPOS_NEW(mp) CExpressionArray(mp);
for (ULONG ul = 0; ul < pprojectList->Arity(); ul++)
{
Expand Down
2 changes: 1 addition & 1 deletion src/backend/gporca/server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ MultipleDampedPredJoinCardinality MultipleIndependentPredJoinCardinality MultiDi
MultiDistKeyWithOtherPredsJoinCardinality NoDistKeyMultiPredJoinCardinality OneDistKeyMultiPredJoinCardinality
JoinOnViewWithCastedColumn JoinOnViewWithCastedColumnAndSubqueryInPredicate JoinOnViewWithVolatileColumn
JoinOnViewWithMixOfPushableAndNonpushablePredicates JoinOnViewWithSetReturningColumn OuterJoinOnViewWithCastedColumn JoinWithSubqueryProjectColumn
AggSubqCollapseFalseFilter NestedJoinWithCastedColumn PushConstantSelectPredicateThruJoin-1 PushConstantSelectPredicateThruJoin-2
AggSubqCollapseFalseFilter NestedJoinWithCastedColumn JoinOnViewWithCastedColumnAndJoinWithCastedColumnInPredicate PushConstantSelectPredicateThruJoin-1 PushConstantSelectPredicateThruJoin-2
PushConstantSelectPredicateThruJoin-3 PushConstantSelectPredicateThruJoin-4 PushConstantSelectPredicateThruJoin-5 PushConstantSelectPredicateThruJoin-6
PushConstantSelectPredicateThruJoin-7 PushConstantSelectPredicateThruJoin-8 PushConstantSelectPredicateThruJoin-9 PushConstantSelectPredicateThruJoin-10
PushConstantSelectPredicateThruJoin-11 PushConstantSelectPredicateThruJoin-12 PushConstantSelectPredicateThruJoin-13
Expand Down
108 changes: 108 additions & 0 deletions src/test/regress/expected/qp_subquery.out
Original file line number Diff line number Diff line change
Expand Up @@ -1918,6 +1918,114 @@ select * from v where not exists (select b from v where b<>2);
---+---
(0 rows)

-- Check that a query having pattern of Select-Project-NaryJoin,
-- also containing a Select predicate condition with the same pattern nested in a subquery runs
CREATE TABLE tab1(a TEXT, b TEXT) DISTRIBUTED RANDOMLY;
INSERT INTO tab1 SELECT i,i FROM GENERATE_SERIES(1,3)i;
SELECT * FROM (SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t1
WHERE EXISTS
(SELECT 1 FROM
(SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t2
WHERE t2.param = t1.param);
param
-------
1
2
3
(3 rows)

EXPLAIN (COSTS OFF) SELECT * FROM (SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t1
WHERE EXISTS
(SELECT 1 FROM
(SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t2
WHERE t2.param = t1.param);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (p2.a = p1.a)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: p2.a
-> Seq Scan on tab1 p2
-> Hash
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: p1.a
-> Hash Join
Hash Cond: (btrim(p1.b) = btrim(p1_1.b))
-> Seq Scan on tab1 p1
-> Hash
-> Broadcast Motion 3:3 (slice4; segments: 3)
-> HashAggregate
Group Key: btrim(p1_1.b)
-> Result
-> Redistribute Motion 3:3 (slice5; segments: 3)
Hash Key: btrim(p1_1.b)
-> Hash Join
Hash Cond: (p1_1.a = p2_1.a)
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: p1_1.a
-> Seq Scan on tab1 p1_1
-> Hash
-> Redistribute Motion 3:3 (slice7; segments: 3)
Hash Key: p2_1.a
-> Seq Scan on tab1 p2_1
Optimizer: Postgres-based planner
(29 rows)

-- Check that a query having pattern of Select-Project-NaryJoin,
-- also containing a Select predicate condition with the same pattern nested in a subquery runs when subplan is enforced
SET optimizer_enforce_subplans TO on;
SELECT * FROM (SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t1
WHERE EXISTS
(SELECT 1 FROM
(SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t2
WHERE t2.param = t1.param);
param
-------
1
2
3
(3 rows)

EXPLAIN (COSTS OFF) SELECT * FROM (SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t1
WHERE EXISTS
(SELECT 1 FROM
(SELECT BTRIM(p1.b) AS param FROM tab1 p1 JOIN tab1 p2 USING(a)) t2
WHERE t2.param = t1.param);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (p2.a = p1.a)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: p2.a
-> Seq Scan on tab1 p2
-> Hash
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: p1.a
-> Hash Join
Hash Cond: (btrim(p1.b) = btrim(p1_1.b))
-> Seq Scan on tab1 p1
-> Hash
-> Broadcast Motion 3:3 (slice4; segments: 3)
-> HashAggregate
Group Key: btrim(p1_1.b)
-> Result
-> Redistribute Motion 3:3 (slice5; segments: 3)
Hash Key: btrim(p1_1.b)
-> Hash Join
Hash Cond: (p1_1.a = p2_1.a)
-> Redistribute Motion 3:3 (slice6; segments: 3)
Hash Key: p1_1.a
-> Seq Scan on tab1 p1_1
-> Hash
-> Redistribute Motion 3:3 (slice7; segments: 3)
Hash Key: p2_1.a
-> Seq Scan on tab1 p2_1
Optimizer: Postgres-based planner
(29 rows)

reset optimizer_enforce_subplans;
set client_min_messages='warning';
drop schema qp_subquery cascade;
reset optimizer_trace_fallback;
Loading

0 comments on commit d126b4e

Please sign in to comment.