Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ public static class JoinHintOptions {
* BREAK: Break right table build process, continue to perform JOIN operation, results might be partial.
*/
public static final String JOIN_OVERFLOW_MODE = "join_overflow_mode";
/**
* Indicates that the join operator(s) within a certain selection scope are colocated.
*/
public static final String IS_COLOCATED_BY_JOIN_KEYS = "is_colocated_by_join_keys";
}

public static class TableHintOptions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,15 @@ public void onMatch(RelOptRuleCall call) {
PinotLogicalExchange right = (PinotLogicalExchange) (join.getRight() instanceof HepRelVertex
? ((HepRelVertex) join.getRight()).getCurrentRel() : join.getRight());

PinotLogicalExchange dynamicBroadcastExchange =
PinotLogicalExchange.create(right.getInput(), RelDistributions.BROADCAST_DISTRIBUTED,
// when colocated join hint is given, dynamic broadcast exchange can be hash-distributed b/c
// 1. currently, dynamic broadcast only works against main table off leaf-stage; (e.g. receive node on leaf)
// 2. when hash keys are the same but hash functions are different, it can be done via a normal hash shuffle.
boolean isColocatedJoin = PinotHintStrategyTable.isHintOptionTrue(join.getHints(),
PinotHintOptions.JOIN_HINT_OPTIONS, PinotHintOptions.JoinHintOptions.IS_COLOCATED_BY_JOIN_KEYS);
PinotLogicalExchange dynamicBroadcastExchange = isColocatedJoin
? PinotLogicalExchange.create(right.getInput(), RelDistributions.hash(join.analyzeCondition().rightKeys),
PinotRelExchangeType.PIPELINE_BREAKER)
: PinotLogicalExchange.create(right.getInput(), RelDistributions.BROADCAST_DISTRIBUTED,
PinotRelExchangeType.PIPELINE_BREAKER);
Join dynamicFilterJoin =
new LogicalJoin(join.getCluster(), join.getTraitSet(), left.getInput(), dynamicBroadcastExchange,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.calcite.rel.rules;

import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
import java.util.List;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.plan.RelTrait;
import org.apache.calcite.plan.RelTraitSet;
import org.apache.calcite.rel.RelDistribution;
import org.apache.calcite.rel.RelDistributions;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Exchange;
import org.apache.calcite.rel.hint.PinotHintOptions;
import org.apache.calcite.rel.hint.PinotHintStrategyTable;
import org.apache.calcite.rel.logical.LogicalAggregate;
import org.apache.calcite.rel.logical.LogicalFilter;
import org.apache.calcite.rel.logical.LogicalJoin;
import org.apache.calcite.rel.logical.LogicalProject;
import org.apache.calcite.rel.logical.LogicalTableScan;
import org.apache.calcite.rel.logical.PinotLogicalExchange;
import org.apache.calcite.tools.RelBuilderFactory;
import org.apache.calcite.util.mapping.Mappings;
import org.apache.pinot.query.planner.plannode.AggregateNode;


/**
* Special rule for Pinot, this rule populates {@link RelDistribution} across the entire relational tree.
*
* We implement this rule as a workaround because {@link org.apache.calcite.plan.RelTraitPropagationVisitor} is
* deprecated. The idea is to associate every node with a RelDistribution derived from {@link RelNode#getInputs()}
* or from the node itself (via hints, or special handling of the type of node in question).
*/
public class PinotRelDistributionTraitRule extends RelOptRule {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had worked on something similar earlier this year, and had come to the conclusion that Calcite's RelDistribution and Exchange are not good enough for some of our use-cases because of the following reasons:

  • A given RelNode can practically have multiple RelDistribution. e.g. in a join node where both inputs are table-scan and partitioned by the join-key, the join node can be said to be distributed on both LeftTable.key and RightTable.key. But given how RelDistribution is, we can only keep information about one of the keys. This is in spite of the fact that RelDistribution is a RelMultipleTrait. For some reason the TraitSet only keeps one RelDistribution (I forgot the reasoning for this)
  • I had to also build my own Exchange nodes, because iirc I wanted to have "Identity" exchange support (i.e. the scenario where shuffle is not needed and partitions in input can be mapped 1:1 to partitions in output).

In the linked PR above you can refer to the JSON Test File changes to see how the plan changes after my changes. I remember that I had gotten all the UTs working. I had abandoned this at the time because some of the other component design was not finalized so it was hard to get consensus on this big a change.

We can discuss this in a call perhaps.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a good point. i think the entire way of how we handle distribution and exchanges needs to be revisited

Status quo

we currently

  1. explicitly insert logical exchanges where we might require data shuffling; and
  2. then determine whether those exchanges are real data shuffle or possible passthrough;
  3. then we determine whether to assign more/less servers to run either side of the exchanges

Current solution

This PR only addresses step-2: to give it a better idea on whether the RelDistribution is the same before & after an exchange. for this purpose, it is OK (at this point) to only keep one of the 2 sides for JOIN Rel.

Needs revisit

there are several problems

  1. should we explicitly insert exchange or should we use other abstractions?
    • there are other ways to add exchange nodes that are more "Calcite-suggested" when managing the Exchange insert instead of applying them during optimization IMO.
  2. should the exchange insert be RelDistribution-based or physical-based?
    • we are mixing the concept of Exchange usage: it can mean (1) altering logical RelDistribution, (2) indicating there potentially could be physical layout differences, (3) whether we can apply leaf-stage optimization
    • although (3) will be addressed by [multistage][feature] leaf planning with multi-semi join support #11937, we should consider whether to still use ExchangeNode as our abstraction or create our own to avoid confusion
  3. should we apply RelDistribution trait before or after Exchange insert
  • currently we have to do this after insert, but technically if we address question (2) we can potentially apply that beforehand

ultimately utilizing Exchange was a quick decision during early stage multi-stage engine development and it might not have been the best option. it is worth taking some time to revisit

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a newbie here, but IMHO the reason RelDistribution seems to be not enough for our use case is because we are not actually using Calcite as it is designed to be used. Specifically, we are not correctly using conventions to color the AST in order to decide which part is going to be executed on each node. In this talk, @devozerov indicates that Drill or Flink use Calcite in that way.

See
image

What I think we should be doing is to color the nodes we are sure how to color and then use the optimizer to decide what to do in joins. Given we don't inject metrics into Calcite, this is not a short term solution, but that should be the long term solution.

Copy link
Contributor Author

@walterddr walterddr Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 your analysis is absolutely correct. following up on this we will continue the discussion in

ultimately the way we use trait is kind of a workaround shortcut. should really do this properly

Copy link
Contributor

@ankitsultana ankitsultana Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the paper I thought that the convention is supposed to determine the engine. https://www.osti.gov/servlets/purl/1474637

In addition to these properties, one of the main features of Calcite
is the calling convention trait. Essentially, the trait represents the
data processing system where the expression will be executed.
Including the calling convention as a trait allows Calcite to meet
its goal of optimizing transparently queries whose execution might
span over different engines i.e., the convention will be treated as
any other physical property

public static final PinotRelDistributionTraitRule INSTANCE =
new PinotRelDistributionTraitRule(PinotRuleUtils.PINOT_REL_FACTORY);

/**
 * Creates the rule. Matches every {@link RelNode} so that {@code onMatch} can attach a
 * {@link RelDistribution} trait to each node in the tree.
 *
 * @param factory the RelBuilder factory to use for rule-produced rels
 */
public PinotRelDistributionTraitRule(RelBuilderFactory factory) {
  // Bug fix: the factory was previously ignored (super(operand(...)) only). Pass it through
  // so the rule honors the Pinot-specific RelBuilder factory supplied by the caller.
  super(operand(RelNode.class, any()), factory, null);
}

@Override
public boolean matches(RelOptRuleCall call) {
  // Fire whenever the call bound at least one relational node.
  return call.rels.length > 0;
}

/**
 * Attaches a {@link RelDistribution} trait to the matched node: leaf nodes compute their own
 * distribution (from hints or the node type), inner nodes derive it from their inputs.
 */
@Override
public void onMatch(RelOptRuleCall call) {
  RelNode current = call.rel(0);
  List<RelNode> inputs = current.getInputs();
  // Defensive null check kept; use isEmpty() instead of size() == 0 for clarity.
  RelDistribution relDistribution = (inputs == null || inputs.isEmpty())
      ? computeCurrentDistribution(current)
      // if there's input to the current node, attempt to derive the RelDistribution.
      : deriveDistribution(current);
  call.transformTo(attachTrait(current, relDistribution));
}

/**
 * Returns a copy of {@code relNode} with {@code trait} added to its trait set.
 *
 * <p>Currently, Pinot has {@link RelTraitSet} default set to empty and thus we directly pull the
 * cluster trait set, then plus the {@link RelDistribution} trait.
 *
 * @param relNode node to copy
 * @param trait   trait (a RelDistribution) to attach
 * @return a copy of {@code relNode} carrying the extra trait
 */
private static RelNode attachTrait(RelNode relNode, RelTrait trait) {
// Start from the cluster-level trait set, not the node's own (which Pinot leaves empty).
RelTraitSet clusterTraitSet = relNode.getCluster().traitSet();
if (relNode instanceof LogicalJoin) {
// work around {@link LogicalJoin#copy(RelTraitSet, RexNode, RelNode, RelNode, JoinRelType, boolean)} not copying
// properly: rebuild the join directly so the new trait set is actually used.
LogicalJoin join = (LogicalJoin) relNode;
return new LogicalJoin(join.getCluster(), clusterTraitSet.plus(trait), join.getLeft(),
join.getRight(), join.getCondition(), join.getVariablesSet(), join.getJoinType(), join.isSemiJoinDone(),
ImmutableList.copyOf(join.getSystemFieldList()));
} else if (relNode instanceof LogicalTableScan) {
// LogicalTableScan#copy also ignores the trait set; rebuild it with the desired traits.
LogicalTableScan tableScan = (LogicalTableScan) relNode;
return new LogicalTableScan(tableScan.getCluster(), clusterTraitSet.plus(trait), tableScan.getTable());
} else {
// All other node types honor copy(traitSet, inputs).
return relNode.copy(clusterTraitSet.plus(trait), relNode.getInputs());
}
}

/**
 * Derives the {@link RelDistribution} of {@code node} from its input(s).
 *
 * <p>Callers guarantee {@code node} has at least one input. The first input is unboxed from its
 * {@link org.apache.calcite.plan.hep.HepRelVertex} wrapper so trait lookups see the actual rel.
 *
 * @param node node whose distribution is being derived
 * @return the derived distribution, or the node's own computed distribution as a fallback
 */
private static RelDistribution deriveDistribution(RelNode node) {
  List<RelNode> inputs = node.getInputs();
  RelNode input = PinotRuleUtils.unboxRel(inputs.get(0));
  if (node instanceof PinotLogicalExchange) {
    // TODO: derive from input first, only if the result is ANY we change it to current
    return computeCurrentDistribution(node);
  } else if (node instanceof LogicalProject) {
    assert inputs.size() == 1;
    RelDistribution inputRelDistribution = input.getTraitSet().getDistribution();
    LogicalProject project = (LogicalProject) node;
    try {
      if (inputRelDistribution != null) {
        // Re-map the distribution keys through the projection's column mapping.
        return inputRelDistribution.apply(project.getMapping());
      }
    } catch (Exception e) {
      // best-effort: a non-trivial projection (no clean mapping) falls through to the default.
    }
  } else if (node instanceof LogicalFilter) {
    assert inputs.size() == 1;
    // Filters don't change column positions; pass the input distribution through unchanged.
    RelDistribution inputRelDistribution = input.getTraitSet().getDistribution();
    if (inputRelDistribution != null) {
      return inputRelDistribution;
    }
  } else if (node instanceof LogicalAggregate) {
    assert inputs.size() == 1;
    // Consistency fix: read the trait from the unboxed input (previously inputs.get(0) was used
    // here, which may be a HepRelVertex without the derived trait attached).
    RelDistribution inputRelDistribution = input.getTraitSet().getDistribution();
    if (inputRelDistribution != null) {
      // create a mapping that only contains the group set
      LogicalAggregate agg = (LogicalAggregate) node;
      List<Integer> groupSetIndices = new ArrayList<>();
      agg.getGroupSet().forEach(groupSetIndices::add);
      return inputRelDistribution.apply(Mappings.target(groupSetIndices, input.getRowType().getFieldCount()));
    }
  } else if (node instanceof LogicalJoin) {
    // TODO: we only map a single RelTrait from the LEFT table, later we should support RIGHT table as well
    assert inputs.size() == 2;
    // Consistency fix: use the unboxed left input, matching the other branches.
    RelDistribution inputRelDistribution = input.getTraitSet().getDistribution();
    if (inputRelDistribution != null) {
      // Since we only support LEFT RelTrait propagation, the inputRelDistribution can directly be applied
      // b/c the Join node always puts left relation RowTypes then right relation RowTypes sequentially.
      return inputRelDistribution;
    }
  }
  // TODO: add the rest of the nodes.
  return computeCurrentDistribution(node);
}

/**
 * Computes a node's own {@link RelDistribution} without consulting its inputs: exchanges carry an
 * explicit distribution, table scans derive one from the partition-key hint, and final/direct
 * aggregates are hash-distributed on their group set. Everything else defaults to RANDOM.
 *
 * @param node node to inspect
 * @return the node's intrinsic distribution (never null)
 */
private static RelDistribution computeCurrentDistribution(RelNode node) {
  if (node instanceof Exchange) {
    return ((Exchange) node).getDistribution();
  } else if (node instanceof LogicalTableScan) {
    LogicalTableScan tableScan = (LogicalTableScan) node;
    // convert table scan hints into rel trait
    String partitionKey =
        PinotHintStrategyTable.getHintOption(tableScan.getHints(), PinotHintOptions.TABLE_HINT_OPTIONS,
            PinotHintOptions.TableHintOptions.PARTITION_KEY);
    if (partitionKey != null) {
      // Robustness fix: getField returns null when the hinted column doesn't exist; previously
      // this threw an NPE. Fall back to RANDOM instead of failing planning.
      RelDataTypeField field = tableScan.getRowType().getField(partitionKey, true, true);
      if (field != null) {
        return RelDistributions.hash(ImmutableList.of(field.getIndex()));
      }
    }
    return RelDistributions.of(RelDistribution.Type.RANDOM_DISTRIBUTED, RelDistributions.EMPTY);
  } else if (node instanceof LogicalAggregate) {
    LogicalAggregate agg = (LogicalAggregate) node;
    // Robustness fix: the internal agg-type hint may be absent, and AggType.valueOf(null) throws;
    // treat a missing hint the same as a non-final aggregate (RANDOM).
    String aggTypeHint = PinotHintStrategyTable.getHintOption(agg.getHints(),
        PinotHintOptions.INTERNAL_AGG_OPTIONS, PinotHintOptions.InternalAggregateOptions.AGG_TYPE);
    if (aggTypeHint != null) {
      AggregateNode.AggType aggType = AggregateNode.AggType.valueOf(aggTypeHint);
      if (aggType == AggregateNode.AggType.FINAL || aggType == AggregateNode.AggType.DIRECT) {
        // Final/direct aggregates emit rows hash-distributed on the group-by keys.
        List<Integer> groupSetIndices = new ArrayList<>();
        agg.getGroupSet().forEach(groupSetIndices::add);
        return RelDistributions.hash(groupSetIndices);
      }
    }
    return RelDistributions.of(RelDistribution.Type.RANDOM_DISTRIBUTED, RelDistributions.EMPTY);
  }
  return RelDistributions.of(RelDistribution.Type.RANDOM_DISTRIBUTED, RelDistributions.EMPTY);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.calcite.config.CalciteConnectionProperty;
import org.apache.calcite.jdbc.CalciteSchema;
import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.plan.RelOptPlanner;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptUtil;
import org.apache.calcite.plan.hep.HepMatchOrder;
Expand All @@ -42,6 +43,7 @@
import org.apache.calcite.rel.hint.PinotHintStrategyTable;
import org.apache.calcite.rel.logical.LogicalCorrelate;
import org.apache.calcite.rel.rules.PinotQueryRuleSets;
import org.apache.calcite.rel.rules.PinotRelDistributionTraitRule;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.runtime.CalciteContextException;
Expand Down Expand Up @@ -89,7 +91,8 @@ public class QueryEnvironment {
private final Prepare.CatalogReader _catalogReader;
private final RelDataTypeFactory _typeFactory;

private final HepProgram _hepProgram;
private final HepProgram _optProgram;
private final HepProgram _traitProgram;

// Pinot extensions
private final WorkerManager _workerManager;
Expand Down Expand Up @@ -121,38 +124,8 @@ public QueryEnvironment(TypeFactory typeFactory, CalciteSchema rootSchema, Worke
.addRelBuilderConfigTransform(c -> c.withPushJoinCondition(true))
.addRelBuilderConfigTransform(c -> c.withAggregateUnique(true)))
.build();

HepProgramBuilder hepProgramBuilder = new HepProgramBuilder();
// Set the match order as DEPTH_FIRST. The default is arbitrary which works the same as DEPTH_FIRST, but it's
// best to be explicit.
hepProgramBuilder.addMatchOrder(HepMatchOrder.DEPTH_FIRST);

// ----
// Run the Calcite CORE rules using 1 HepInstruction per rule. We use 1 HepInstruction per rule for simplicity:
// the rules used here can rest assured that they are the only ones evaluated in a dedicated graph-traversal.
for (RelOptRule relOptRule : PinotQueryRuleSets.BASIC_RULES) {
hepProgramBuilder.addRuleInstance(relOptRule);
}

// ----
// Run Pinot rule to attach aggregation auxiliary info
hepProgramBuilder.addRuleCollection(PinotQueryRuleSets.PINOT_AGG_PROCESS_RULES);

// ----
// Pushdown filters using a single HepInstruction.
hepProgramBuilder.addRuleCollection(PinotQueryRuleSets.FILTER_PUSHDOWN_RULES);

// ----
// Prune duplicate/unnecessary nodes using a single HepInstruction.
// TODO: We can consider using HepMatchOrder.TOP_DOWN if we find cases where it would help.
hepProgramBuilder.addRuleCollection(PinotQueryRuleSets.PRUNE_RULES);

// ----
// Run pinot specific rules that should run after all other rules, using 1 HepInstruction per rule.
for (RelOptRule relOptRule : PinotQueryRuleSets.PINOT_POST_RULES) {
hepProgramBuilder.addRuleInstance(relOptRule);
}
_hepProgram = hepProgramBuilder.build();
_optProgram = getOptProgram();
_traitProgram = getTraitProgram();
}

/**
Expand All @@ -168,7 +141,8 @@ public QueryEnvironment(TypeFactory typeFactory, CalciteSchema rootSchema, Worke
* @return QueryPlannerResult containing the dispatchable query plan and the relRoot.
*/
public QueryPlannerResult planQuery(String sqlQuery, SqlNodeAndOptions sqlNodeAndOptions, long requestId) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _hepProgram)) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _optProgram,
_traitProgram)) {
plannerContext.setOptions(sqlNodeAndOptions.getOptions());
RelRoot relRoot = compileQuery(sqlNodeAndOptions.getSqlNode(), plannerContext);
// TODO: current code only assume one SubPlan per query, but we should support multiple SubPlans per query.
Expand Down Expand Up @@ -196,7 +170,8 @@ public QueryPlannerResult planQuery(String sqlQuery, SqlNodeAndOptions sqlNodeAn
* @return QueryPlannerResult containing the explained query plan and the relRoot.
*/
public QueryPlannerResult explainQuery(String sqlQuery, SqlNodeAndOptions sqlNodeAndOptions, long requestId) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _hepProgram)) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _optProgram,
_traitProgram)) {
SqlExplain explain = (SqlExplain) sqlNodeAndOptions.getSqlNode();
plannerContext.setOptions(sqlNodeAndOptions.getOptions());
RelRoot relRoot = compileQuery(explain.getExplicandum(), plannerContext);
Expand Down Expand Up @@ -229,7 +204,8 @@ public String explainQuery(String sqlQuery, long requestId) {
}

public List<String> getTableNamesForQuery(String sqlQuery) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _hepProgram)) {
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _optProgram,
_traitProgram)) {
SqlNode sqlNode = CalciteSqlParser.compileToSqlNodeAndOptions(sqlQuery).getSqlNode();
if (sqlNode.getKind().equals(SqlKind.EXPLAIN)) {
sqlNode = ((SqlExplain) sqlNode).getExplicandum();
Expand Down Expand Up @@ -335,8 +311,12 @@ private RelNode optimize(RelRoot relRoot, PlannerContext plannerContext) {
// 4. optimize relNode
// TODO: add support for traits, cost factory.
try {
plannerContext.getRelOptPlanner().setRoot(relRoot.rel);
return plannerContext.getRelOptPlanner().findBestExp();
RelOptPlanner optPlanner = plannerContext.getRelOptPlanner();
optPlanner.setRoot(relRoot.rel);
RelNode optimized = optPlanner.findBestExp();
RelOptPlanner traitPlanner = plannerContext.getRelTraitPlanner();
traitPlanner.setRoot(optimized);
return traitPlanner.findBestExp();
} catch (Exception e) {
throw new UnsupportedOperationException(
"Cannot generate a valid execution plan for the given query: " + RelOptUtil.toString(relRoot.rel), e);
Expand Down Expand Up @@ -364,4 +344,50 @@ private DispatchableSubPlan toDispatchableSubPlan(RelRoot relRoot, PlannerContex
private HintStrategyTable getHintStrategyTable() {
return PinotHintStrategyTable.PINOT_HINT_STRATEGY_TABLE;
}

/**
 * Builds the logical-optimization {@link HepProgram}: Calcite core rules, Pinot aggregation
 * processing, filter pushdown, and node pruning.
 */
private static HepProgram getOptProgram() {
  HepProgramBuilder builder = new HepProgramBuilder();
  // DEPTH_FIRST matches the arbitrary default order in practice, but it's best to be explicit.
  builder.addMatchOrder(HepMatchOrder.DEPTH_FIRST);

  // Calcite CORE rules run as one HepInstruction per rule for simplicity: each rule can rest
  // assured it is the only one evaluated in a dedicated graph-traversal.
  PinotQueryRuleSets.BASIC_RULES.forEach(builder::addRuleInstance);

  // Pinot rule to attach aggregation auxiliary info.
  builder.addRuleCollection(PinotQueryRuleSets.PINOT_AGG_PROCESS_RULES);

  // Pushdown filters using a single HepInstruction.
  builder.addRuleCollection(PinotQueryRuleSets.FILTER_PUSHDOWN_RULES);

  // Prune duplicate/unnecessary nodes using a single HepInstruction.
  // TODO: consider HepMatchOrder.TOP_DOWN if we find cases where it would help.
  builder.addRuleCollection(PinotQueryRuleSets.PRUNE_RULES);

  return builder.build();
}

/**
 * Builds the trait-assignment {@link HepProgram}: Pinot post-processing rules followed by
 * bottom-up {@link RelDistribution} trait propagation across the whole tree.
 */
private static HepProgram getTraitProgram() {
  HepProgramBuilder builder = new HepProgramBuilder();

  // Traits must be derived from inputs first, hence BOTTOM_UP.
  builder.addMatchOrder(HepMatchOrder.BOTTOM_UP);

  // Pinot-specific rules that should run after all other rules, one HepInstruction per rule.
  PinotQueryRuleSets.PINOT_POST_RULES.forEach(builder::addRuleInstance);

  // Finally, apply the RelDistribution trait to all nodes.
  builder.addRuleInstance(PinotRelDistributionTraitRule.INSTANCE);

  return builder.build();
}
}
Loading