apache · Jackie-Jiang · Jun 10, 2022 · Jun 9, 2022
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/distinct/DistinctExecutorFactory.java b/pinot-core/src/main/java/org/apache/pinot/core/query/distinct/DistinctExecutorFactory.java
@@ -18,7 +18,6 @@
  */
 package org.apache.pinot.core.query.distinct;
 
-import com.google.common.base.Preconditions;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.pinot.common.request.context.ExpressionContext;
@@ -106,12 +105,13 @@ private static DistinctExecutor getDistinctOnlyExecutor(List<ExpressionContext>
       }
     } else {
       // Multiple columns
+      boolean hasMVExpression = false;
       List<DataType> dataTypes = new ArrayList<>(numExpressions);
       for (ExpressionContext expression : expressions) {
         TransformResultMetadata expressionMetadata = transformOperator.getResultMetadata(expression);
-        // TODO: Support MV expression
-        Preconditions.checkArgument(expressionMetadata.isSingleValue(),
-            "DISTINCT cannot be applied to multiple expressions with MV expression: %s", expression);
+        if (!expressionMetadata.isSingleValue()) {
+          hasMVExpression = true;
+        }
         dataTypes.add(expressionMetadata.getDataType());
       }
       List<Dictionary> dictionaries = new ArrayList<>(numExpressions);
@@ -127,10 +127,11 @@ private static DistinctExecutor getDistinctOnlyExecutor(List<ExpressionContext>
       }
       if (dictionaryBased) {
         // Dictionary based
-        return new DictionaryBasedMultiColumnDistinctOnlyExecutor(expressions, dictionaries, dataTypes, limit);
+        return new DictionaryBasedMultiColumnDistinctOnlyExecutor(expressions, hasMVExpression, dictionaries, dataTypes,
+            limit);
       } else {
         // Raw value based
-        return new RawMultiColumnDistinctExecutor(expressions, dataTypes, null, limit);
+        return new RawMultiColumnDistinctExecutor(expressions, hasMVExpression, dataTypes, null, limit);
       }
     }
   }
@@ -174,12 +175,13 @@ private static DistinctExecutor getDistinctOrderByExecutor(List<ExpressionContex
       }
     } else {
       // Multiple columns
+      boolean hasMVExpression = false;
       List<DataType> dataTypes = new ArrayList<>(numExpressions);
       for (ExpressionContext expression : expressions) {
         TransformResultMetadata expressionMetadata = transformOperator.getResultMetadata(expression);
-        // TODO: Support MV expression
-        Preconditions.checkArgument(expressionMetadata.isSingleValue(),
-            "DISTINCT cannot be applied to multiple expressions with MV expression: %s", expression);
+        if (!expressionMetadata.isSingleValue()) {
+          hasMVExpression = true;
+        }
         dataTypes.add(expressionMetadata.getDataType());
       }
       List<Dictionary> dictionaries = new ArrayList<>(numExpressions);
@@ -196,11 +198,11 @@ private static DistinctExecutor getDistinctOrderByExecutor(List<ExpressionContex
       }
       if (dictionaryBased) {
         // Dictionary based
-        return new DictionaryBasedMultiColumnDistinctOrderByExecutor(expressions, dictionaries, dataTypes,
-            orderByExpressions, limit);
+        return new DictionaryBasedMultiColumnDistinctOrderByExecutor(expressions, hasMVExpression, dictionaries,
+            dataTypes, orderByExpressions, limit);
       } else {
         // Raw value based
-        return new RawMultiColumnDistinctExecutor(expressions, dataTypes, orderByExpressions, limit);
+        return new RawMultiColumnDistinctExecutor(expressions, hasMVExpression, dataTypes, orderByExpressions, limit);
       }
     }
   }

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/distinct/DistinctExecutorUtils.java b/pinot-core/src/main/java/org/apache/pinot/core/query/distinct/DistinctExecutorUtils.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.query.distinct;
+
+/**
+ * Utility methods for the distinct executors to expand a document on multiple expressions (SV or MV) into the
+ * combinations of its values.
+ */
+public class DistinctExecutorUtils {
+  private DistinctExecutorUtils() {
+  }
+
+  /**
+   * Returns an array of dictionary ids for a given document on multiple expressions (SV or MV).
+   * <p>Every returned row holds one dictionary id per expression. An MV expression contributes each of its values,
+   * so the rows are the cross product of the values of all the expressions. If an MV expression has no value for
+   * the document, the cross product is empty and an empty array is returned.
+   *
+   * @param svDictIds dictionary ids per expression, indexed by document; {@code null} entry for an MV expression
+   * @param mvDictIds dictionary ids per expression, indexed by document; only read where svDictIds is {@code null}
+   * @param docId index of the document to read from the per-expression arrays
+   * @return one array of dictionary ids (one id per expression) for each combination of values
+   */
+  public static int[][] getDictIds(int[][] svDictIds, int[][][] mvDictIds, int docId) {
+    int[][] dictIdsArray = null;
+
+    // Before converting to array, keep single dict ids for better performance
+    int numExpressions = svDictIds.length;
+    int[] singleDictIds = new int[numExpressions];
+
+    for (int i = 0; i < numExpressions; i++) {
+      if (svDictIds[i] != null) {
+        // SV expression: exactly one value for the document
+        setDictId(singleDictIds, dictIdsArray, i, svDictIds[i][docId]);
+        continue;
+      }
+      int[] dictIdsForMV = mvDictIds[i][docId];
+      int numValues = dictIdsForMV.length;
+      if (numValues == 0) {
+        // No value to combine with: the cross product is empty, whatever the other expressions hold
+        return new int[0][];
+      }
+      if (numValues == 1) {
+        // Specialize multi-value column with only one value inside
+        setDictId(singleDictIds, dictIdsArray, i, dictIdsForMV[0]);
+        continue;
+      }
+      // Repeat the rows built so far once per value: the j-th group of rows takes the j-th value
+      int[][] currentRows = dictIdsArray != null ? dictIdsArray : new int[][]{singleDictIds};
+      int currentLength = currentRows.length;
+      int[][] newDictIdsArray = new int[currentLength * numValues][];
+      for (int j = 0; j < numValues; j++) {
+        int offset = j * currentLength;
+        for (int k = 0; k < currentLength; k++) {
+          // The first group reuses the existing rows, the other groups work on copies
+          int[] dictIds = j == 0 ? currentRows[k] : currentRows[k].clone();
+          dictIds[i] = dictIdsForMV[j];
+          newDictIdsArray[offset + k] = dictIds;
+        }
+      }
+      dictIdsArray = newDictIdsArray;
+    }
+
+    return dictIdsArray == null ? new int[][]{singleDictIds} : dictIdsArray;
+  }
+
+  /**
+   * Stores the dictionary id of an expression with a single value: into the single dict ids while no MV
+   * expression has been expanded yet ({@code dictIdsArray} is {@code null}), into every row otherwise.
+   */
+  private static void setDictId(int[] singleDictIds, int[][] dictIdsArray, int index, int dictId) {
+    if (dictIdsArray == null) {
+      singleDictIds[index] = dictId;
+    } else {
+      for (int[] dictIds : dictIdsArray) {
+        dictIds[index] = dictId;
+      }
+    }
+  }
+
+  /**
+   * Returns an array of records for a given document on multiple expressions (SV or MV).
+   * <p>Every returned record holds one value per expression. An MV expression contributes each of its values, so
+   * the records are the cross product of the values of all the expressions. If an MV expression has no value for
+   * the document, the cross product is empty and an empty array is returned.
+   *
+   * @param svValues values per expression, indexed by document; {@code null} entry for an MV expression
+   * @param mvValues values per expression, indexed by document; only read where svValues is {@code null}
+   * @param docId index of the document to read from the per-expression arrays
+   * @return one record (one value per expression) for each combination of values
+   */
+  public static Object[][] getRecords(Object[][] svValues, Object[][][] mvValues, int docId) {
+    Object[][] records = null;
+
+    // Before converting to array, keep single record for better performance
+    int numExpressions = svValues.length;
+    Object[] singleRecord = new Object[numExpressions];
+
+    for (int i = 0; i < numExpressions; i++) {
+      if (svValues[i] != null) {
+        // SV expression: exactly one value for the document
+        setValue(singleRecord, records, i, svValues[i][docId]);
+        continue;
+      }
+      Object[] valuesForMV = mvValues[i][docId];
+      int numValues = valuesForMV.length;
+      if (numValues == 0) {
+        // No value to combine with: the cross product is empty, whatever the other expressions hold
+        return new Object[0][];
+      }
+      if (numValues == 1) {
+        // Specialize multi-value column with only one value inside
+        setValue(singleRecord, records, i, valuesForMV[0]);
+        continue;
+      }
+      // Repeat the records built so far once per value: the j-th group of records takes the j-th value
+      Object[][] currentRecords = records != null ? records : new Object[][]{singleRecord};
+      int currentLength = currentRecords.length;
+      Object[][] newRecords = new Object[currentLength * numValues][];
+      for (int j = 0; j < numValues; j++) {
+        int offset = j * currentLength;
+        for (int k = 0; k < currentLength; k++) {
+          // The first group reuses the existing records, the other groups work on copies
+          Object[] record = j == 0 ? currentRecords[k] : currentRecords[k].clone();
+          record[i] = valuesForMV[j];
+          newRecords[offset + k] = record;
+        }
+      }
+      records = newRecords;
+    }
+
+    return records == null ? new Object[][]{singleRecord} : records;
+  }
+
+  /**
+   * Stores the value of an expression with a single value: into the single record while no MV expression has
+   * been expanded yet ({@code records} is {@code null}), into every record otherwise.
+   */
+  private static void setValue(Object[] singleRecord, Object[][] records, int index, Object value) {
+    if (records == null) {
+      singleRecord[index] = value;
+    } else {
+      for (Object[] record : records) {
+        record[index] = value;
+      }
+    }
+  }
+}
diff --git a/.../pinot/core/query/distinct/dictionary/DictionaryBasedMultiColumnDistinctOnlyExecutor.java b/.../pinot/core/query/distinct/dictionary/DictionaryBasedMultiColumnDistinctOnlyExecutor.java
@@ -23,6 +23,7 @@
 import org.apache.pinot.core.common.BlockValSet;
 import org.apache.pinot.core.operator.blocks.TransformBlock;
 import org.apache.pinot.core.query.distinct.DistinctExecutor;
+import org.apache.pinot.core.query.distinct.DistinctExecutorUtils;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 
@@ -31,28 +32,52 @@
  * {@link DistinctExecutor} for distinct only queries with multiple dictionary-encoded columns.
  */
 public class DictionaryBasedMultiColumnDistinctOnlyExecutor extends BaseDictionaryBasedMultiColumnDistinctExecutor {
+  private final boolean _hasMVExpression;
 
-  public DictionaryBasedMultiColumnDistinctOnlyExecutor(List<ExpressionContext> expressions,
+  public DictionaryBasedMultiColumnDistinctOnlyExecutor(List<ExpressionContext> expressions, boolean hasMVExpression,
       List<Dictionary> dictionaries, List<DataType> dataTypes, int limit) {
     super(expressions, dictionaries, dataTypes, limit);
+    _hasMVExpression = hasMVExpression;
   }
 
   @Override
   public boolean process(TransformBlock transformBlock) {
     int numDocs = transformBlock.getNumDocs();
     int numExpressions = _expressions.size();
-    int[][] dictIdsArray = new int[numDocs][numExpressions];
-    for (int i = 0; i < numExpressions; i++) {
-      BlockValSet blockValueSet = transformBlock.getBlockValueSet(_expressions.get(i));
-      int[] dictIdsForExpression = blockValueSet.getDictionaryIdsSV();
-      for (int j = 0; j < numDocs; j++) {
-        dictIdsArray[j][i] = dictIdsForExpression[j];
+    if (!_hasMVExpression) {
+      int[][] dictIdsArray = new int[numDocs][numExpressions];
+      for (int i = 0; i < numExpressions; i++) {
+        BlockValSet blockValueSet = transformBlock.getBlockValueSet(_expressions.get(i));
+        int[] dictIdsForExpression = blockValueSet.getDictionaryIdsSV();
+        for (int j = 0; j < numDocs; j++) {
+          dictIdsArray[j][i] = dictIdsForExpression[j];
+        }
       }
-    }
-    for (int i = 0; i < numDocs; i++) {
-      _dictIdsSet.add(new DictIds(dictIdsArray[i]));
-      if (_dictIdsSet.size() >= _limit) {
-        return true;
+      for (int i = 0; i < numDocs; i++) {
+        _dictIdsSet.add(new DictIds(dictIdsArray[i]));
+        if (_dictIdsSet.size() >= _limit) {
+          return true;
+        }
+      }
+    } else {
+      int[][] svDictIds = new int[numExpressions][];
+      int[][][] mvDictIds = new int[numExpressions][][];
+      for (int i = 0; i < numExpressions; i++) {
+        BlockValSet blockValueSet = transformBlock.getBlockValueSet(_expressions.get(i));
+        if (blockValueSet.isSingleValue()) {
+          svDictIds[i] = blockValueSet.getDictionaryIdsSV();
+        } else {
+          mvDictIds[i] = blockValueSet.getDictionaryIdsMV();
+        }
+      }
+      for (int i = 0; i < numDocs; i++) {
+        int[][] dictIdsArray = DistinctExecutorUtils.getDictIds(svDictIds, mvDictIds, i);
+        for (int[] dictIds : dictIdsArray) {
+          _dictIdsSet.add(new DictIds(dictIds));
+          if (_dictIdsSet.size() >= _limit) {
+            return true;
+          }
+        }
       }
     }
     return false;