Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ public abstract class BaseBrokerRequestHandler implements BrokerRequestHandler {
private static final String IN_SUBQUERY = "inSubquery";
private static final Expression FALSE = RequestUtils.getLiteralExpression(false);
private static final Expression TRUE = RequestUtils.getLiteralExpression(true);
private static final Expression STAR = RequestUtils.getIdentifierExpression("*");

protected final PinotConfiguration _config;
protected final RoutingManager _routingManager;
Expand Down Expand Up @@ -1459,8 +1460,17 @@ static void updateColumnNames(String rawTableName, PinotQuery pinotQuery, boolea
Map<String, String> columnNameMap) {
Map<String, String> aliasMap = new HashMap<>();
if (pinotQuery != null) {
boolean hasStar = false;
for (Expression expression : pinotQuery.getSelectList()) {
fixColumnName(rawTableName, expression, columnNameMap, aliasMap, isCaseInsensitive);
//check if the select expression is '*'
if (!hasStar && expression.equals(STAR)) {
hasStar = true;
}
}
//if query has a '*' selection along with other columns
if (hasStar) {
expandStarExpressionsToActualColumns(pinotQuery, columnNameMap);
}
Expression filterExpression = pinotQuery.getFilterExpression();
if (filterExpression != null) {
Expand All @@ -1487,6 +1497,30 @@ static void updateColumnNames(String rawTableName, PinotQuery pinotQuery, boolea
}
}

private static void expandStarExpressionsToActualColumns(PinotQuery pinotQuery, Map<String, String> columnNameMap) {
List<Expression> originalSelections = pinotQuery.getSelectList();
//expand '*'
List<Expression> expandedSelections = new ArrayList<>();
for (String tableCol : columnNameMap.values()) {
Expression newSelection = RequestUtils.createIdentifierExpression(tableCol);
//we exclude default virtual columns
if (tableCol.charAt(0) != '$') {
expandedSelections.add(newSelection);
}
}
//sort naturally
expandedSelections.sort(null);
List<Expression> newSelections = new ArrayList<>();
for (Expression originalSelection : originalSelections) {
if (originalSelection.equals(STAR)) {
newSelections.addAll(expandedSelections);
} else {
newSelections.add(originalSelection);
}
}
pinotQuery.setSelectList(newSelections);
}

/**
* Fixes the column names to the actual column names in the given broker request.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.pinot.broker.requesthandler;

import com.google.common.collect.ImmutableMap;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pinot.common.request.Expression;
import org.apache.pinot.common.request.PinotQuery;
import org.apache.pinot.sql.parsers.CalciteSqlParser;
import org.testng.Assert;
import org.testng.annotations.Test;


public class SelectStarWithOtherColsRewriteTest {
Copy link
Contributor

@amrishlal amrishlal Dec 27, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also add test cases for:

  • SELECT sqrt(homeRuns), * FROM baseballStats,
  • Use of more than one unqualified * in the select statement should result in syntax error (calcite is probably doing this already).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use of more than one unqualified * in the select statement should result in syntax error

Tried this with MySQL, it throws an error. With Postgres, it works. IMO if the context is clear enough (in this case we know the table name on which * is being selected), then we should return all the columns without the qualifier like how Postgres is doing it? Calcite doesn't throw a syntax error in this case btw.


private static final Map<String, String> COL_MAP;

static {
//build table schema
ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
builder.put("playerID", "playerID");
builder.put("homeRuns", "homeRuns");
builder.put("playerStint", "playerStint");
builder.put("groundedIntoDoublePlays", "groundedIntoDoublePlays");
builder.put("G_old", "G_old");
builder.put("$segmentName", "$segmentName");
builder.put("$docId", "$docId");
builder.put("$hostName", "$hostName");
COL_MAP = builder.build();
}

/**
* When the query contains only '*', it should be expanded into columns.
*/
@Test
public void testShouldExpandWhenOnlyStarIsSelected() {
String sql = "SELECT * FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Map<String, Integer> countMap = new HashMap<>();
for (Expression selection : newSelections) {
String col = selection.getIdentifier().getName();
countMap.put(col, countMap.getOrDefault(col, 0) + 1);
}
Assert.assertEquals(countMap.size(), 5, "More new selections than expected");
Assert.assertTrue(countMap.keySet().containsAll(COL_MAP.keySet().stream().filter(a -> !a.startsWith("$")).collect(
Collectors.toList())), "New selections contain virtual columns");
countMap.forEach(
(key, value) -> Assert.assertEquals((int) value, 1, key + " has more than one occurrences in new selection"));
}

/**
* Expansion should not contain any virtual columns
*/
@Test
public void testShouldNotReturnExtraDefaultColumns() {
String sql = "SELECT $docId,*,$segmentName FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
int docIdCnt = 0;
int segmentNameCnt = 0;
for (Expression newSelection : newSelections) {
String colName = newSelection.getIdentifier().getName();
switch (colName) {
case "$docId":
docIdCnt++;
break;
case "$segmentName":
segmentNameCnt++;
break;
case "$hostName":
throw new RuntimeException("Extra default column returned");
default:
}
}
Assert.assertEquals(docIdCnt, 1);
Assert.assertEquals(segmentNameCnt, 1);
}

/**
* Columns should not be deduped
*/
@Test
public void testShouldNotDedupMultipleRequestedColumns() {
String sql = "SELECT playerID,*,G_old FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
int playerIdCnt = 0;
int goldCount = 0;
for (Expression newSelection : newSelections) {
String colName = newSelection.getIdentifier().getName();
switch (colName) {
case "playerID":
playerIdCnt++;
break;
case "G_old":
goldCount++;
break;
default:
}
}
Assert.assertEquals(playerIdCnt, 2, "playerID does not occur once");
Assert.assertEquals(goldCount, 2, "G_old occurs does not occur once");
}

/**
* Selections should be returned in the requested order
*/
@Test
public void testSelectionOrder() {
String sql = "SELECT playerID,*,G_old FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct / compliant with Standard SQL as far as I can tell. The column list in the select expressions should be deduplicated. So, playerID and homeRuns should be returned only once.

Even generally for non star queries, using the same column name twice in the SELECT clause should be deduplicated.

However, the scenario of aliasing must be handled.

For example, the query SELECT CustomerID AS C1, * FROM Customers must return CustomerID column in the result twice (with result column name as C1 and CustomerID respectively).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@siddharthteotia thanks for the review. Actually I wasn't aware that this is dictated by the SQL standard. Had asked both of these questions (deduplication and returning internal columns) in the original issue for clarification. Will address these.

Copy link
Contributor

@amrishlal amrishlal Dec 27, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I am seeing on mysql (no dedup when * is combined with column names in select list), but not sure how standards complaint mysql is. In either case we should check against at least two other databases before setting behavior here since it will be difficult to change once set.

mysql> select *, cases, abs(cases) from covid_cases;
+------------+-------+-------+------------+
| date       | cases | cases | abs(cases) |
+------------+-------+-------+------------+
| 2021-01-01 |    30 |    30 |         30 |
| 2021-01-02 |    60 |    60 |         60 |
| 2021-01-03 |   120 |   120 |        120 |
| 2021-01-04 |   100 |   100 |        100 |
| 2021-01-05 |    80 |    80 |         80 |
| 2021-01-06 |    70 |    70 |         70 |
| 2021-01-07 |    85 |    85 |         85 |
| 2021-01-08 |    65 |    65 |         65 |
| 2021-01-01 |    70 |    70 |         70 |
| 2021-01-02 |   100 |   100 |        100 |
+------------+-------+-------+------------+
10 rows in set (0.00 sec)

mysql> select * from covid_cases;
+------------+-------+
| date       | cases |
+------------+-------+
| 2021-01-01 |    30 |
| 2021-01-02 |    60 |
| 2021-01-03 |   120 |
| 2021-01-04 |   100 |
| 2021-01-05 |    80 |
| 2021-01-06 |    70 |
| 2021-01-07 |    85 |
| 2021-01-08 |    65 |
| 2021-01-01 |    70 |
| 2021-01-02 |   100 |
+------------+-------+
10 rows in set (0.00 sec)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried this with Postgres, same behaviour (column is returned twice). The ANSI SQL standard doesn't allow for extra columns to be added but DBs have their own extensions to it. I think we should go with how MySQL and Postgres are doing it, that is, returning the columns twice unless there are some implications.

@siddharthteotia

Copy link
Contributor

@siddharthteotia siddharthteotia Jan 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the above query mysql> select *, cases, abs(cases) from covid_cases, cases and abs(cases) are different since the latter is using function. By deduplication, I mean to do only for the identifiers (simple column names) in the SELECT list.

My preference would be to deduplicate and not return the column twice as the output is cleaner that way. So, in the above query cases column should be returned exactly once.

If the user wants to get the same column twice, they should use aliasing (like I mentioned earlier)

SELECT CustomerID AS C1, * FROM Customers

CustomerID should be returned twice -- once as C1 and next as CustomerID

But if we want to stick to how MySQL or Postgres is doing and that is the general ANSI SQL / user expectation, then let's not deduplicate. I am ok with it. Just does not look clean

Trying the query here seems to deduplicate -- https://www.w3schools.com/sql/trysql.asp?filename=trysql_select_all

Not sure which database it is using underneath

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assert.assertEquals(newSelections.get(0).getIdentifier().getName(), "playerID");
Assert.assertEquals(newSelections.get(newSelections.size() - 1).getIdentifier().getName(), "G_old");
//the expanded list should be alphabetically sorted
List<Expression> expandedSelections = newSelections.subList(1, newSelections.size() - 1);
List<Expression> assumedUnsortedSelections = new ArrayList<>(expandedSelections);
//sort alphabetically
assumedUnsortedSelections.sort(null);
Assert.assertEquals(expandedSelections, assumedUnsortedSelections, "Expanded selections not sorted alphabetically");
}

/**
* When the same column is requested twice, once with an alias and once as part of *, then it should be returned twice
*/
@Test
public void testAliasing() {
String sql = "SELECT playerID as pid,* FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Assert.assertTrue(newSelections.get(0).isSetFunctionCall());
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperator(), "AS");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"playerID");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(1).getIdentifier().getName(), "pid");
boolean playerIdPresent = false;
for (int i = 1; i < newSelections.size(); i++) {
if (newSelections.get(i).getIdentifier().getName().equals("playerID")) {
playerIdPresent = true;
break;
}
}
Assert.assertTrue(playerIdPresent, "playerID col is missing");
}

/**
* When a function is applied to a column, then that col is returned along with the original column
*/
@Test
public void testFuncOnColumns1() {
String sql = "SELECT sqrt(homeRuns),* FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Assert.assertTrue(newSelections.get(0).isSetFunctionCall());
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperator(), "SQRT");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"homeRuns");
//homeRuns is returned as well
int homeRunsCnt = 0;
for (Expression selection : newSelections) {
if (selection.isSetIdentifier() && selection.getIdentifier().getName().equals("homeRuns")) {
homeRunsCnt++;
}
}
Assert.assertEquals(homeRunsCnt, 1);
}

/**
* When a function is applied to a column, then that col is returned along with the original column
*/
@Test
public void testFuncOnColumns2() {
String sql = "SELECT add(homeRuns,groundedIntoDoublePlays),* FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Assert.assertTrue(newSelections.get(0).isSetFunctionCall());
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperator(), "ADD");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"homeRuns");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(1).getIdentifier().getName(),
"groundedIntoDoublePlays");
//homeRuns is returned as well
int homeRunsCnt = 0;
int groundedIntoDoublePlaysCnt = 0;
for (Expression selection : newSelections) {
if (selection.isSetIdentifier() && selection.getIdentifier().getName().equals("homeRuns")) {
homeRunsCnt++;
} else if (selection.isSetIdentifier() && selection.getIdentifier().getName().equals("groundedIntoDoublePlays")) {
groundedIntoDoublePlaysCnt++;
}
}
Assert.assertEquals(homeRunsCnt, 1);
Assert.assertEquals(groundedIntoDoublePlaysCnt, 1);
}

/**
* When 'n' no. of unqualified * are present, then each column is returned 'n' times.
*/
@Test
public void testMultipleUnqualifiedStars() {
String sql = "SELECT *,* FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Assert.assertEquals(newSelections.get(0).getIdentifier().getName(), "G_old");
Assert.assertEquals(newSelections.get(1).getIdentifier().getName(), "groundedIntoDoublePlays");
Assert.assertEquals(newSelections.get(2).getIdentifier().getName(), "homeRuns");
Assert.assertEquals(newSelections.get(3).getIdentifier().getName(), "playerID");
Assert.assertEquals(newSelections.get(4).getIdentifier().getName(), "playerStint");
Assert.assertEquals(newSelections.get(5).getIdentifier().getName(), "G_old");
Assert.assertEquals(newSelections.get(6).getIdentifier().getName(), "groundedIntoDoublePlays");
Assert.assertEquals(newSelections.get(7).getIdentifier().getName(), "homeRuns");
Assert.assertEquals(newSelections.get(8).getIdentifier().getName(), "playerID");
Assert.assertEquals(newSelections.get(9).getIdentifier().getName(), "playerStint");
}

@Test
public void testAll() {
String sql =
"SELECT abs(homeRuns),sqrt(groundedIntoDoublePlays),*,$segmentName,$hostName,playerStint as pstint,playerID "
+ "FROM baseballStats";
PinotQuery pinotQuery = CalciteSqlParser.compileToPinotQuery(sql);
BaseBrokerRequestHandler.updateColumnNames("baseballStats", pinotQuery, false, COL_MAP);
List<Expression> newSelections = pinotQuery.getSelectList();
Assert.assertTrue(newSelections.get(0).isSetFunctionCall());
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperator(), "ABS");
Assert.assertEquals(newSelections.get(0).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"homeRuns");
Assert.assertTrue(newSelections.get(1).isSetFunctionCall());
Assert.assertEquals(newSelections.get(1).getFunctionCall().getOperator(), "SQRT");
Assert.assertEquals(newSelections.get(1).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"groundedIntoDoublePlays");
Assert.assertEquals(newSelections.get(2).getIdentifier().getName(), "G_old");
Assert.assertEquals(newSelections.get(3).getIdentifier().getName(), "groundedIntoDoublePlays");
Assert.assertEquals(newSelections.get(4).getIdentifier().getName(), "homeRuns");
Assert.assertEquals(newSelections.get(5).getIdentifier().getName(), "playerID");
Assert.assertEquals(newSelections.get(6).getIdentifier().getName(), "playerStint");
Assert.assertEquals(newSelections.get(7).getIdentifier().getName(), "$segmentName");
Assert.assertEquals(newSelections.get(8).getIdentifier().getName(), "$hostName");
Assert.assertEquals(newSelections.get(9).getFunctionCall().getOperator(), "AS");
Assert.assertEquals(newSelections.get(9).getFunctionCall().getOperands().get(0).getIdentifier().getName(),
"playerStint");
Assert.assertEquals(newSelections.get(9).getFunctionCall().getOperands().get(1).getIdentifier().getName(),
"pstint");
Assert.assertEquals(newSelections.get(10).getIdentifier().getName(), "playerID");
}
}