-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for configuring Theta and Tuple aggregation functions #14167
Changes from 1 commit
736c493
20f21a9
5671705
a598538
b32b9e0
9a64fdd
ed0133b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
…ms in the ST index This patch introduces a mechanism to allow configuring the aggregation function parameters for a star-tree index for Tuple and Theta sketches. Any existing aggregation that has a precision greater or equal to that of the query precision is selected as a candidate. The behaviour of the CPC aggregator has been changed accordingly.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -50,7 +50,9 @@ | |||||
import org.apache.pinot.core.query.aggregation.groupby.ObjectGroupByResultHolder; | ||||||
import org.apache.pinot.segment.local.customobject.ThetaSketchAccumulator; | ||||||
import org.apache.pinot.segment.spi.AggregationFunctionType; | ||||||
import org.apache.pinot.segment.spi.Constants; | ||||||
import org.apache.pinot.spi.data.FieldSpec.DataType; | ||||||
import org.apache.pinot.spi.utils.CommonConstants; | ||||||
import org.apache.pinot.sql.parsers.CalciteSqlParser; | ||||||
|
||||||
|
||||||
|
@@ -92,6 +94,7 @@ public class DistinctCountThetaSketchAggregationFunction | |||||
private final List<FilterEvaluator> _filterEvaluators; | ||||||
private final ExpressionContext _postAggregationExpression; | ||||||
private final UpdateSketchBuilder _updateSketchBuilder = new UpdateSketchBuilder(); | ||||||
private int _nominalEntries = ThetaUtil.DEFAULT_NOMINAL_ENTRIES; | ||||||
protected final SetOperationBuilder _setOperationBuilder = new SetOperationBuilder(); | ||||||
protected int _accumulatorThreshold = DEFAULT_ACCUMULATOR_THRESHOLD; | ||||||
|
||||||
|
@@ -108,9 +111,9 @@ public DistinctCountThetaSketchAggregationFunction(List<ExpressionContext> argum | |||||
// Allows the user to trade-off memory usage for merge CPU; higher values use more memory | ||||||
_accumulatorThreshold = parameters.getAccumulatorThreshold(); | ||||||
// Nominal entries controls sketch accuracy and size | ||||||
int nominalEntries = parameters.getNominalEntries(); | ||||||
_updateSketchBuilder.setNominalEntries(nominalEntries); | ||||||
_setOperationBuilder.setNominalEntries(nominalEntries); | ||||||
_nominalEntries = parameters.getNominalEntries(); | ||||||
_updateSketchBuilder.setNominalEntries(_nominalEntries); | ||||||
_setOperationBuilder.setNominalEntries(_nominalEntries); | ||||||
// Sampling probability sets the initial value of Theta, defaults to 1.0 | ||||||
float p = parameters.getSamplingProbability(); | ||||||
_setOperationBuilder.setP(p); | ||||||
|
@@ -1035,6 +1038,24 @@ public Comparable mergeFinalResult(Comparable finalResult1, Comparable finalResu | |||||
return (Long) finalResult1 + (Long) finalResult2; | ||||||
} | ||||||
|
||||||
@Override | ||||||
public boolean canUseStarTree(Map<String, Object> functionParameters) { | ||||||
Object nominalEntriesParam = functionParameters.get(Constants.THETASKETCH_NOMINAL_ENTRIES); | ||||||
int starTreeNominalEntries; | ||||||
|
||||||
// Check if nominal entries values match | ||||||
if (nominalEntriesParam != null) { | ||||||
starTreeNominalEntries = Integer.parseInt(String.valueOf(nominalEntriesParam)); | ||||||
} else { | ||||||
// If the functionParameters don't have an explicit nominal entries value set, it means that the star-tree | ||||||
// index was built with | ||||||
// the default value for nominal entries | ||||||
starTreeNominalEntries = CommonConstants.Helix.DEFAULT_THETA_SKETCH_NOMINAL_ENTRIES; | ||||||
} | ||||||
// Check if the query lgK param is less than or equal to that of the StarTree aggregation | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nit: looks like this aggregation function directly takes the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Although I suppose it doesn't matter either way since the comparisons are equivalent. |
||||||
return _nominalEntries <= starTreeNominalEntries; | ||||||
} | ||||||
|
||||||
// This ensures backward compatibility with servers that still return sketches directly. | ||||||
// The AggregationDataTableReducer casts intermediate results to Objects and although the code compiles, | ||||||
// types might still be incompatible at runtime due to type erasure. | ||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -38,6 +38,7 @@ | |||||||
import org.apache.pinot.core.query.aggregation.groupby.ObjectGroupByResultHolder; | ||||||||
import org.apache.pinot.segment.local.customobject.TupleIntSketchAccumulator; | ||||||||
import org.apache.pinot.segment.spi.AggregationFunctionType; | ||||||||
import org.apache.pinot.segment.spi.Constants; | ||||||||
import org.apache.pinot.spi.data.FieldSpec; | ||||||||
import org.apache.pinot.spi.utils.CommonConstants; | ||||||||
|
||||||||
|
@@ -275,6 +276,24 @@ public Comparable extractFinalResult(TupleIntSketchAccumulator accumulator) { | |||||||
return Base64.getEncoder().encodeToString(accumulator.getResult().toByteArray()); | ||||||||
} | ||||||||
|
||||||||
@Override | ||||||||
public boolean canUseStarTree(Map<String, Object> functionParameters) { | ||||||||
Object nominalEntriesParam = functionParameters.get(Constants.TUPLESKETCH_NOMINAL_ENTRIES); | ||||||||
int starTreeNominalEntries; | ||||||||
|
||||||||
// Check if nominal entries values match | ||||||||
if (nominalEntriesParam != null) { | ||||||||
starTreeNominalEntries = Integer.parseInt(String.valueOf(nominalEntriesParam)); | ||||||||
} else { | ||||||||
// If the functionParameters don't have an explicit nominal entries value set, it means that the star-tree | ||||||||
// index was built with | ||||||||
// the default value for nominal entries | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nit: formatting |
||||||||
starTreeNominalEntries = (int) Math.pow(2, CommonConstants.Helix.DEFAULT_TUPLE_SKETCH_LGK); | ||||||||
} | ||||||||
// Check if the query lgK param is less than or equal to that of the StarTree aggregation | ||||||||
return _nominalEntries <= starTreeNominalEntries; | ||||||||
} | ||||||||
|
||||||||
/** | ||||||||
* Returns the accumulator from the result holder or creates a new one if it does not exist. | ||||||||
*/ | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.pinot.core.query.aggregation.function; | ||
|
||
import java.util.List; | ||
import java.util.Map; | ||
import org.apache.pinot.common.request.Literal; | ||
import org.apache.pinot.common.request.context.ExpressionContext; | ||
import org.apache.pinot.segment.spi.Constants; | ||
import org.testng.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
|
||
public class DistinctCountThetaSketchAggregationFunctionTest { | ||
|
||
@Test | ||
public void testCanUseStarTreeDefaultK() { | ||
// Default aggregation function lgK = 12 / K=4096 | ||
DistinctCountThetaSketchAggregationFunction function = | ||
new DistinctCountThetaSketchAggregationFunction(List.of(ExpressionContext.forIdentifier("col"))); | ||
|
||
Assert.assertTrue(function.canUseStarTree(Map.of())); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, "4096"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, 4096))); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, 2048))); | ||
|
||
function = new DistinctCountThetaSketchAggregationFunction(List.of(ExpressionContext.forIdentifier("col"), | ||
ExpressionContext.forLiteral(Literal.stringValue("nominalEntries=16384")))); | ||
|
||
Assert.assertTrue(function.canUseStarTree(Map.of())); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, "16384"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, 16384))); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, 8192))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or else There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are correct and I have removed these test assertions. |
||
} | ||
|
||
@Test | ||
public void testCanUseCustomK() { | ||
DistinctCountThetaSketchAggregationFunction function = new DistinctCountThetaSketchAggregationFunction( | ||
List.of(ExpressionContext.forIdentifier("col"), | ||
ExpressionContext.forLiteral(Literal.stringValue("nominalEntries=32768")))); | ||
|
||
// Default StarTree lgK = 14 / K=16384 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious - do you know why we chose There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this is not the case for the tuple sketch where the same default value is used across the star-tree index value aggregator and the query aggregation function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good observation. The ThetaSketch query time aggregation function was the earliest implementation of a Datasketches sketch and was used at Linkedin according to this presentation. This implementation provided a default of The StarTree index value aggregator for ThetaSketch was introduced later in this PR. The default chosen for the StarTree was of higher precision to allow the user greater accuracy should this be desired by overriding the default query parameter value. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense, thanks for the history! |
||
Assert.assertFalse(function.canUseStarTree(Map.of())); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, "16384"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, "65536"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, 32768))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.THETASKETCH_NOMINAL_ENTRIES, "32768"))); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.pinot.core.query.aggregation.function; | ||
|
||
import java.util.List; | ||
import java.util.Map; | ||
import org.apache.datasketches.tuple.aninteger.IntegerSummary; | ||
import org.apache.pinot.common.request.Literal; | ||
import org.apache.pinot.common.request.context.ExpressionContext; | ||
import org.apache.pinot.segment.spi.Constants; | ||
import org.testng.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
|
||
public class IntegerTupleSketchAggregationFunctionTest { | ||
|
||
@Test | ||
public void testCanUseStarTreeDefaultK() { | ||
IntegerTupleSketchAggregationFunction function = | ||
new IntegerTupleSketchAggregationFunction(List.of(ExpressionContext.forIdentifier("col")), | ||
IntegerSummary.Mode.Sum); | ||
|
||
Assert.assertTrue(function.canUseStarTree(Map.of())); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, "16384"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, 16384))); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, 8192))); | ||
|
||
function = new IntegerTupleSketchAggregationFunction( | ||
List.of(ExpressionContext.forIdentifier("col"), ExpressionContext.forLiteral(Literal.intValue(16384))), | ||
IntegerSummary.Mode.Sum); | ||
|
||
Assert.assertTrue(function.canUseStarTree(Map.of())); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, "16384"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, 16384))); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, 8192))); | ||
} | ||
|
||
@Test | ||
public void testCanUseCustomK() { | ||
IntegerTupleSketchAggregationFunction function = new IntegerTupleSketchAggregationFunction( | ||
List.of(ExpressionContext.forIdentifier("col"), ExpressionContext.forLiteral(Literal.intValue(32768))), | ||
IntegerSummary.Mode.Sum); | ||
|
||
// Default StarTree lgK = 14 / K=16384 | ||
Assert.assertFalse(function.canUseStarTree(Map.of())); | ||
Assert.assertFalse(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, "16384"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, "65536"))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, 32768))); | ||
Assert.assertTrue(function.canUseStarTree(Map.of(Constants.TUPLESKETCH_NOMINAL_ENTRIES, "32768"))); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: a one-liner explanation on why we're doing a <= check (#13835 (comment)) might be useful for future readers here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.