From 540b38146d13f8776273aa93da5061008bfd707d Mon Sep 17 00:00:00 2001 From: Xiaobing <61892277+klsince@users.noreply.github.com> Date: Mon, 25 Mar 2024 19:54:44 -0700 Subject: [PATCH 01/50] refine when to registerSegment while doing addSegment and replaceSegment for upsert tables for better data consistency (#12709) --- .../realtime/RealtimeTableDataManager.java | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/RealtimeTableDataManager.java b/pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/RealtimeTableDataManager.java index 9e4ae84dba1..0c62ab9b4d7 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/RealtimeTableDataManager.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/data/manager/realtime/RealtimeTableDataManager.java @@ -548,10 +548,9 @@ private void handleUpsert(ImmutableSegment immutableSegment) { immutableSegment.getSegmentMetadata().getTotalDocs()); _serverMetrics.addValueToTableGauge(_tableNameWithType, ServerGauge.SEGMENT_COUNT, 1L); ImmutableSegmentDataManager newSegmentManager = new ImmutableSegmentDataManager(immutableSegment); - // Register the new segment after it is fully initialized by partitionUpsertMetadataManager, e.g. to fill up its - // validDocId bitmap. Otherwise, the query can return wrong results, if accessing the premature segment. if (partitionUpsertMetadataManager.isPreloading()) { - // Preloading segment is ensured to be handled by a single thread, so no need to take a lock. + // Preloading segment is ensured to be handled by a single thread, so no need to take the segment upsert lock. + // Besides, preloading happens before the table partition is made ready for any queries. partitionUpsertMetadataManager.preloadSegment(immutableSegment); registerSegment(segmentName, newSegmentManager); _logger.info("Preloaded immutable segment: {} to upsert-enabled table: {}", segmentName, _tableNameWithType); @@ -574,10 +573,21 @@ private void handleUpsert(ImmutableSegment immutableSegment) { try { SegmentDataManager oldSegmentManager = _segmentDataManagerMap.get(segmentName); if (oldSegmentManager == null) { - partitionUpsertMetadataManager.addSegment(immutableSegment); + // When adding a new segment, we should register it 'before' it is fully initialized by + // partitionUpsertMetadataManager. Because when processing docs in the new segment, the docs in the other + // segments may be invalidated, making the queries see less valid docs than expected. We should let query + // access the new segment asap even though its validDocId bitmap is still being filled by + // partitionUpsertMetadataManager. registerSegment(segmentName, newSegmentManager); + partitionUpsertMetadataManager.addSegment(immutableSegment); _logger.info("Added new immutable segment: {} to upsert-enabled table: {}", segmentName, _tableNameWithType); } else { + // When replacing a segment, we should register the new segment 'after' it is fully initialized by + // partitionUpsertMetadataManager to fill up its validDocId bitmap. Otherwise, the queries will lose the access + // to the valid docs in the old segment immediately, but the validDocId bitmap of the new segment is still + // being filled by partitionUpsertMetadataManager, making the queries see less valid docs than expected. 
+ // When replacing a segment, the new and old segments are assumed to have same set of valid docs for data + // consistency, otherwise the new segment should be named differently to go through the addSegment flow above. IndexSegment oldSegment = oldSegmentManager.getSegment(); partitionUpsertMetadataManager.replaceSegment(immutableSegment, oldSegment); registerSegment(segmentName, newSegmentManager); From 57f50d3dec2a2023effa67ca044f61dfd6561dad Mon Sep 17 00:00:00 2001 From: Saurabh Dubey Date: Tue, 26 Mar 2024 09:22:29 +0530 Subject: [PATCH 02/50] Json extract index filter support (#12683) --------- Co-authored-by: Saurabh Dubey Co-authored-by: Saurabh Dubey --- .../function/TransformFunctionType.java | 3 +- .../JsonExtractIndexTransformFunction.java | 28 +++++---- ...JsonExtractIndexTransformFunctionTest.java | 17 ++++++ .../impl/json/MutableJsonIndexImpl.java | 28 ++++++++- .../json/ImmutableJsonIndexReader.java | 46 ++++++++++++++- .../local/segment/index/JsonIndexTest.java | 59 ++++++++++++------- .../spi/index/reader/JsonIndexReader.java | 3 +- 7 files changed, 147 insertions(+), 37 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/TransformFunctionType.java b/pinot-common/src/main/java/org/apache/pinot/common/function/TransformFunctionType.java index 20bc26854cf..88c269c6aad 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/function/TransformFunctionType.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/TransformFunctionType.java @@ -117,7 +117,8 @@ public enum TransformFunctionType { ReturnTypes.cascade(opBinding -> positionalReturnTypeInferenceFromStringLiteral(opBinding, 2, SqlTypeName.VARCHAR), SqlTypeTransforms.FORCE_NULLABLE), OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER, - SqlTypeFamily.CHARACTER), ordinal -> ordinal > 2), "json_extract_index"), + SqlTypeFamily.CHARACTER, SqlTypeFamily.CHARACTER), ordinal -> ordinal > 2), "json_extract_index"), + JSON_EXTRACT_KEY("jsonExtractKey", ReturnTypes.TO_ARRAY, OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.CHARACTER)), "json_extract_key"), diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java index 160ed36b0ff..12e38ea5d60 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java @@ -53,11 +53,11 @@ public String getName() { @Override public void init(List arguments, Map columnContextMap) { - // Check that there are exactly 3 or 4 arguments - if (arguments.size() < 3 || arguments.size() > 4) { + // Check that there are exactly 3 or 4 or 5 arguments + if (arguments.size() < 3 || arguments.size() > 5) { throw new IllegalArgumentException( - "Expected 3/4 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," - + " ['defaultValue'])"); + "Expected 3/4/5 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'], ['jsonFilterExpression'])"); } TransformFunction firstArgument = arguments.get(0); @@ -76,13 +76,12 @@ public void init(List arguments, Map c if (!(secondArgument instanceof LiteralTransformFunction)) { throw 
new IllegalArgumentException("JSON path argument must be a literal"); } - String inputJsonPath = ((LiteralTransformFunction) secondArgument).getStringLiteral(); + _jsonPathString = ((LiteralTransformFunction) secondArgument).getStringLiteral(); try { - JsonPathCache.INSTANCE.getOrCompute(inputJsonPath); + JsonPathCache.INSTANCE.getOrCompute(_jsonPathString); } catch (Exception e) { throw new IllegalArgumentException("JSON path argument is not a valid JSON path"); } - _jsonPathString = inputJsonPath.substring(1); // remove $ prefix TransformFunction thirdArgument = arguments.get(2); if (!(thirdArgument instanceof LiteralTransformFunction)) { @@ -90,14 +89,14 @@ public void init(List arguments, Map c } String resultsType = ((LiteralTransformFunction) thirdArgument).getStringLiteral().toUpperCase(); boolean isSingleValue = !resultsType.endsWith("_ARRAY"); - if (isSingleValue && inputJsonPath.contains("[*]")) { + if (isSingleValue && _jsonPathString.contains("[*]")) { throw new IllegalArgumentException( "[*] syntax in json path is unsupported for singleValue field json_extract_index"); } DataType dataType = isSingleValue ? DataType.valueOf(resultsType) : DataType.valueOf(resultsType.substring(0, resultsType.length() - 6)); - if (arguments.size() == 4) { + if (arguments.size() >= 4) { TransformFunction fourthArgument = arguments.get(3); if (!(fourthArgument instanceof LiteralTransformFunction)) { throw new IllegalArgumentException("Default value must be a literal"); @@ -105,8 +104,17 @@ public void init(List arguments, Map c _defaultValue = dataType.convert(((LiteralTransformFunction) fourthArgument).getStringLiteral()); } + String filterJsonPath = null; + if (arguments.size() == 5) { + TransformFunction fifthArgument = arguments.get(4); + if (!(fifthArgument instanceof LiteralTransformFunction)) { + throw new IllegalArgumentException("JSON path filter argument must be a literal"); + } + filterJsonPath = ((LiteralTransformFunction) fifthArgument).getStringLiteral(); + } + _resultMetadata = new TransformResultMetadata(dataType, isSingleValue, false); - _valueToMatchingDocsMap = _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString); + _valueToMatchingDocsMap = _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString, filterJsonPath); if (isSingleValue) { // For single value result type, it's more efficient to use original docIDs map _jsonIndexReader.convertFlattenedDocIdsToDocIds(_valueToMatchingDocsMap); diff --git a/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java index c61084c430a..d2cd7921077 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java @@ -247,6 +247,23 @@ private void addMvTests(List testArguments) { String.format("jsonExtractIndex(%s,'%s','STRING_ARRAY')", JSON_STRING_SV_COLUMN, "$.arrayField[*].arrStringField"), "$.arrayField[*].arrStringField", DataType.STRING, false }); + + // MV with filters + testArguments.add(new Object[]{ + String.format( + "jsonExtractIndex(%s,'%s','INT_ARRAY', '0', 'REGEXP_LIKE(\"$.arrayField[*].arrStringField\", ''.*y.*'')')", + JSON_STRING_SV_COLUMN, + "$.arrayField[*].arrIntField"), "$.arrayField[?(@.arrStringField =~ /.*y.*/)].arrIntField", DataType.INT, + 
false + }); + + testArguments.add(new Object[]{ + String.format( + "jsonExtractIndex(%s,'%s','STRING_ARRAY', '0', '\"$.arrayField[*].arrIntField\" > 2')", + JSON_STRING_SV_COLUMN, + "$.arrayField[*].arrStringField"), "$.arrayField[?(@.arrIntField > 2)].arrStringField", DataType.STRING, + false + }); } @Test(dataProvider = "testJsonExtractIndexDefaultValue") diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/json/MutableJsonIndexImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/json/MutableJsonIndexImpl.java index 8a5ca799cd3..2f8cff69da2 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/json/MutableJsonIndexImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/json/MutableJsonIndexImpl.java @@ -33,6 +33,7 @@ import java.util.TreeMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.regex.Pattern; +import javax.annotation.Nullable; import org.apache.commons.lang3.tuple.Pair; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.common.request.context.FilterContext; @@ -367,10 +368,32 @@ public void convertFlattenedDocIdsToDocIds(Map valueToFla } @Override - public Map getMatchingFlattenedDocsMap(String jsonPathKey) { + public Map getMatchingFlattenedDocsMap(String jsonPathKey, @Nullable String filterString) { Map valueToMatchingFlattenedDocIdsMap = new HashMap<>(); _readLock.lock(); try { + RoaringBitmap filteredFlattenedDocIds = null; + FilterContext filter; + if (filterString != null) { + filter = RequestContextUtils.getFilter(CalciteSqlParser.compileToExpression(filterString)); + Preconditions.checkArgument(!filter.isConstant(), "Invalid json match filter: " + filterString); + if (filter.getType() == FilterContext.Type.PREDICATE && isExclusive(filter.getPredicate().getType())) { + // Handle exclusive predicate separately because the flip can only be applied to the + // unflattened doc ids in order to get the correct result, and it cannot be nested + filteredFlattenedDocIds = getMatchingFlattenedDocIds(filter.getPredicate()); + filteredFlattenedDocIds.flip(0, (long) _nextFlattenedDocId); + } else { + filteredFlattenedDocIds = getMatchingFlattenedDocIds(filter); + } + } + // Support 2 formats: + // - JSONPath format (e.g. "$.a[1].b"='abc', "$[0]"=1, "$"='abc') + // - Legacy format (e.g. 
"a[1].b"='abc') + if (jsonPathKey.startsWith("$")) { + jsonPathKey = jsonPathKey.substring(1); + } else { + jsonPathKey = JsonUtils.KEY_SEPARATOR + jsonPathKey; + } Pair result = getKeyAndFlattenedDocIds(jsonPathKey); jsonPathKey = result.getLeft(); RoaringBitmap arrayIndexFlattenDocIds = result.getRight(); @@ -380,6 +403,9 @@ public Map getMatchingFlattenedDocsMap(String jsonPathKey Map subMap = getMatchingKeysMap(jsonPathKey); for (Map.Entry entry : subMap.entrySet()) { RoaringBitmap flattenedDocIds = entry.getValue().clone(); + if (filteredFlattenedDocIds != null) { + flattenedDocIds.and(filteredFlattenedDocIds); + } if (arrayIndexFlattenDocIds != null) { flattenedDocIds.and(arrayIndexFlattenDocIds); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java index 9af37b50fbe..2d67309a380 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.PriorityQueue; import java.util.regex.Pattern; +import javax.annotation.Nullable; import org.apache.commons.lang3.tuple.Pair; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.common.request.context.FilterContext; @@ -134,7 +135,8 @@ private MutableRoaringBitmap getMatchingFlattenedDocIds(FilterContext filter) { case AND: { List children = filter.getChildren(); int numChildren = children.size(); - MutableRoaringBitmap matchingDocIds = getMatchingFlattenedDocIds(children.get(0)); + MutableRoaringBitmap matchingDocIds = + getMatchingFlattenedDocIds(children.get(0)); for (int i = 1; i < numChildren; i++) { matchingDocIds.and(getMatchingFlattenedDocIds(children.get(i))); } @@ -143,7 +145,8 @@ private MutableRoaringBitmap getMatchingFlattenedDocIds(FilterContext filter) { case OR: { List children = filter.getChildren(); int numChildren = children.size(); - MutableRoaringBitmap matchingDocIds = getMatchingFlattenedDocIds(children.get(0)); + MutableRoaringBitmap matchingDocIds = + getMatchingFlattenedDocIds(children.get(0)); for (int i = 1; i < numChildren; i++) { matchingDocIds.or(getMatchingFlattenedDocIds(children.get(i))); } @@ -331,7 +334,40 @@ public void convertFlattenedDocIdsToDocIds(Map valueToFla } @Override - public Map getMatchingFlattenedDocsMap(String jsonPathKey) { + public Map getMatchingFlattenedDocsMap(String jsonPathKey, @Nullable String filterString) { + RoaringBitmap filteredFlattenedDocIds = null; + if (filterString != null) { + FilterContext filter; + try { + filter = RequestContextUtils.getFilter(CalciteSqlParser.compileToExpression(filterString)); + Preconditions.checkArgument(!filter.isConstant()); + } catch (Exception e) { + throw new BadQueryRequestException("Invalid json match filter: " + filterString); + } + if (filter.getType() == FilterContext.Type.PREDICATE && isExclusive(filter.getPredicate().getType())) { + // Handle exclusive predicate separately because the flip can only be applied to the + // unflattened doc ids in order to get the correct result, and it cannot be nested + filteredFlattenedDocIds = getMatchingFlattenedDocIds(filter.getPredicate()).toRoaringBitmap(); + filteredFlattenedDocIds.flip(0, _numFlattenedDocs); + 
} else { + filteredFlattenedDocIds = getMatchingFlattenedDocIds(filter).toRoaringBitmap(); + } + } + // Support 2 formats: + // - JSONPath format (e.g. "$.a[1].b"='abc', "$[0]"=1, "$"='abc') + // - Legacy format (e.g. "a[1].b"='abc') + if (_version == BaseJsonIndexCreator.VERSION_2) { + if (jsonPathKey.startsWith("$")) { + jsonPathKey = jsonPathKey.substring(1); + } else { + jsonPathKey = JsonUtils.KEY_SEPARATOR + jsonPathKey; + } + } else { + // For V1 backward-compatibility + if (jsonPathKey.startsWith("$.")) { + jsonPathKey = jsonPathKey.substring(2); + } + } Map result = new HashMap<>(); Pair pathKey = getKeyAndFlattenedDocIds(jsonPathKey); if (pathKey.getRight() != null && pathKey.getRight().isEmpty()) { @@ -347,6 +383,10 @@ public Map getMatchingFlattenedDocsMap(String jsonPathKey for (int dictId = dictIds[0]; dictId < dictIds[1]; dictId++) { String key = _dictionary.getStringValue(dictId); RoaringBitmap docIds = _invertedIndex.getDocIds(dictId).toRoaringBitmap(); + if (filteredFlattenedDocIds != null) { + docIds.and(filteredFlattenedDocIds); + } + if (arrayIndexFlattenDocIds != null) { docIds.and(arrayIndexFlattenDocIds); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java index 1b2a910da32..14f8a79bc16 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java @@ -347,9 +347,21 @@ public void testGetValueToFlattenedDocIdsMap() // CHECKSTYLE:ON // @formatter: on - String[] testKeys = new String[]{ - ".arrField[*].intKey01", - ".arrField[*].stringKey01", + String[][] testKeys = new String[][]{ + // Without filters + {"$.arrField[*].intKey01", null}, + {"$.arrField[*].stringKey01", null}, + + // With regexp filter + {"$.arrField[*].intKey01", "REGEXP_LIKE(\"arrField..stringKey01\", '.*f.*')"}, + // With range filter + {"$.arrField[*].stringKey01", "\"arrField..intKey01\" > 2"}, + // With AND filters + {"$.arrField[*].intKey01", "\"arrField..intKey01\" > 2 AND REGEXP_LIKE(\"arrField..stringKey01\", " + + "'[a-b][a-b].*')"}, + // Exclusive filters + {"$.arrField[*].intKey01", "\"arrField[*].stringKey01\" != 'bar'"}, + {"$.arrField[*].stringKey01", "\"arrField[*].intKey01\" != '3'"}, }; String colName = "col"; @@ -371,18 +383,23 @@ public void testGetValueToFlattenedDocIdsMap() int docIdValidLength = 2; String[][][] expectedValues = new String[][][]{ {{"1", "1", "3", "5"}, {"1", "1", "6", "3"}}, - {{"abc", "foo", "bar", "fuzz"}, {"pqr", "foo", "test", "testf2"}} + {{"abc", "foo", "bar", "fuzz"}, {"pqr", "foo", "test", "testf2"}}, + {{"1", "5"}, {"1", "3"}}, + {{"bar", "fuzz"}, {"test", "testf2"}}, + {{"3"}, {}}, + {{"1", "1", "5"}, {"1", "1", "6", "3"}}, + {{"abc", "foo", "fuzz"}, {"pqr", "foo", "test"}} }; for (int i = 0; i < testKeys.length; i++) { Map context = - offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i]); + offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i][0], testKeys[i][1]); String[][] values = offHeapIndexReader.getValuesMV(docMask, docIdValidLength, context); for (int j = 0; j < docIdValidLength; j++) { Assert.assertEquals(values[j], expectedValues[i][j]); } - context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i]); + context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i][0], testKeys[i][1]); values = 
mutableJsonIndex.getValuesMV(docMask, docIdValidLength, context); Assert.assertEquals(values, expectedValues[i]); } @@ -402,7 +419,7 @@ public void testGetValuesForKeyAndDocs() }; // CHECKSTYLE:ON // @formatter: on - String[] testKeys = new String[]{".field1", ".field2", ".field3", ".field4"}; + String[] testKeys = new String[]{"$.field1", "$.field2", "$.field3", "$.field4"}; String colName = "col"; try ( @@ -427,7 +444,7 @@ public void testGetValuesForKeyAndDocs() new String[][]{{"value1", "value2", "value1"}, {"value2", null, "value4"}, {"value3", null, null}, {null, null, null}}; for (int i = 0; i < testKeys.length; i++) { - Map context = offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i]); + Map context = offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i], null); String[] values = offHeapIndexReader.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, expectedValues[i]); @@ -435,7 +452,7 @@ public void testGetValuesForKeyAndDocs() values = offHeapIndexReader.getValuesSV(docMask, docMask.length, context, false); Assert.assertEquals(values, expectedValues[i]); - context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i]); + context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i], null); values = mutableJsonIndex.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, expectedValues[i]); @@ -448,7 +465,7 @@ public void testGetValuesForKeyAndDocs() docMask = new int[]{1, 2}; expectedValues = new String[][]{{"value2", "value1"}, {null, "value4"}, {null, null}, {null, null}}; for (int i = 0; i < testKeys.length; i++) { - Map context = offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i]); + Map context = offHeapIndexReader.getMatchingFlattenedDocsMap(testKeys[i], null); String[] values = offHeapIndexReader.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, expectedValues[i]); @@ -456,7 +473,7 @@ public void testGetValuesForKeyAndDocs() values = offHeapIndexReader.getValuesSV(docMask, docMask.length, context, false); Assert.assertEquals(values, expectedValues[i]); - context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i]); + context = mutableJsonIndex.getMatchingFlattenedDocsMap(testKeys[i], null); values = mutableJsonIndex.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, expectedValues[i]); @@ -466,7 +483,7 @@ public void testGetValuesForKeyAndDocs() } // Immutable index, context is reused for the second method call - Map context = offHeapIndexReader.getMatchingFlattenedDocsMap(".field1"); + Map context = offHeapIndexReader.getMatchingFlattenedDocsMap("$.field1", null); docMask = new int[]{0}; String[] values = offHeapIndexReader.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, new String[]{"value1"}); @@ -483,7 +500,7 @@ public void testGetValuesForKeyAndDocs() Assert.assertEquals(values, new String[]{"value2", "value1"}); // Mutable index, context is reused for the second method call - context = mutableJsonIndex.getMatchingFlattenedDocsMap(".field1");; + context = mutableJsonIndex.getMatchingFlattenedDocsMap("$.field1", null);; docMask = new int[]{0}; values = mutableJsonIndex.getValuesSV(docMask, docMask.length, context, true); Assert.assertEquals(values, new String[]{"value1"}); @@ -525,9 +542,9 @@ public void testSkipInvalidJsonEnable() throws Exception { for (String record : records) { mutableJsonIndex.add(record); } - Map onHeapRes = onHeapIndexReader.getMatchingFlattenedDocsMap(""); - Map 
offHeapRes = offHeapIndexReader.getMatchingFlattenedDocsMap(""); - Map mutableRes = mutableJsonIndex.getMatchingFlattenedDocsMap(""); + Map onHeapRes = onHeapIndexReader.getMatchingFlattenedDocsMap("$", null); + Map offHeapRes = offHeapIndexReader.getMatchingFlattenedDocsMap("$", null); + Map mutableRes = mutableJsonIndex.getMatchingFlattenedDocsMap("$", null); Map expectedRes = Collections.singletonMap(JsonUtils.SKIPPED_VALUE_REPLACEMENT, RoaringBitmap.bitmapOf(0)); Assert.assertEquals(expectedRes, onHeapRes); @@ -563,8 +580,8 @@ public void testGetMatchingValDocIdsPairForArrayPath() throws Exception { File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME + V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION); Assert.assertTrue(offHeapIndexFile.exists()); - String[] keys = {".foo[0].bar[1]", ".foo[1].bar[0]", ".foo2[0]", ".foo[100].bar[100]", ".foo[0].bar[*]", - ".foo[*].bar[0]", ".foo[*].bar[*]"}; + String[] keys = {"$.foo[0].bar[1]", "$.foo[1].bar[0]", "$.foo2[0]", "$.foo[100].bar[100]", "$.foo[0].bar[*]", + "$.foo[*].bar[0]", "$.foo[*].bar[*]"}; List> expected = List.of( Map.of("y", RoaringBitmap.bitmapOf(0), "z", RoaringBitmap.bitmapOf(1)), Map.of("a", RoaringBitmap.bitmapOf(0)), @@ -593,11 +610,11 @@ public void testGetMatchingValDocIdsPairForArrayPath() throws Exception { } for (int i = 0; i < keys.length; i++) { - Map onHeapRes = onHeapIndexReader.getMatchingFlattenedDocsMap(keys[i]); + Map onHeapRes = onHeapIndexReader.getMatchingFlattenedDocsMap(keys[i], null); onHeapIndexReader.convertFlattenedDocIdsToDocIds(onHeapRes); - Map offHeapRes = offHeapIndexReader.getMatchingFlattenedDocsMap(keys[i]); + Map offHeapRes = offHeapIndexReader.getMatchingFlattenedDocsMap(keys[i], null); offHeapIndexReader.convertFlattenedDocIdsToDocIds(offHeapRes); - Map mutableRes = mutableJsonIndex.getMatchingFlattenedDocsMap(keys[i]); + Map mutableRes = mutableJsonIndex.getMatchingFlattenedDocsMap(keys[i], null); mutableJsonIndex.convertFlattenedDocIdsToDocIds(mutableRes); Assert.assertEquals(expected.get(i), onHeapRes); Assert.assertEquals(expected.get(i), offHeapRes); diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/JsonIndexReader.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/JsonIndexReader.java index 73ef8450ee2..44f7dc82c6c 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/JsonIndexReader.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/JsonIndexReader.java @@ -19,6 +19,7 @@ package org.apache.pinot.segment.spi.index.reader; import java.util.Map; +import javax.annotation.Nullable; import org.apache.pinot.segment.spi.index.IndexReader; import org.roaringbitmap.RoaringBitmap; import org.roaringbitmap.buffer.MutableRoaringBitmap; @@ -60,7 +61,7 @@ String[] getValuesSV(int[] docIds, int length, Map matchi * For a JSON key, returns a Map from each value to the flattened docId posting list. 
This map should be used to * avoid reading and converting the posting list of flattened docIds to real docIds */ - Map getMatchingFlattenedDocsMap(String key); + Map getMatchingFlattenedDocsMap(String key, @Nullable String filterJsonString); /** * Converts the flattened docIds to real docIds using the map returned by getMatchingFlattenedDocsMap From 824b5a0806c38c680c8bb627616ff658cabbf790 Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" <17555551+Jackie-Jiang@users.noreply.github.com> Date: Mon, 25 Mar 2024 21:59:38 -0700 Subject: [PATCH 03/50] Pull janino dependency to root pom (#12724) --- pinot-query-planner/pom.xml | 2 -- pom.xml | 41 +++++++++++-------------------------- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/pinot-query-planner/pom.xml b/pinot-query-planner/pom.xml index e04e9dc9de4..3b9a6016d4c 100644 --- a/pinot-query-planner/pom.xml +++ b/pinot-query-planner/pom.xml @@ -55,12 +55,10 @@ org.codehaus.janino janino - 3.1.6 org.codehaus.janino commons-compiler - 3.1.6 diff --git a/pom.xml b/pom.xml index 36d5e30698b..6bb7476afeb 100644 --- a/pom.xml +++ b/pom.xml @@ -168,7 +168,8 @@ 0.4.4 4.1 2.25.3 - 2.12.5 + 2.12.7 + 3.1.12 7.9.0 6.4.0 8.2.0 @@ -1148,14 +1149,6 @@ org.apache.calcite.avatica avatica-metrics - - org.apache.httpcomponents - httpclient - - - org.apache.httpcomponents - httpcore - org.apache.commons commons-dbcp2 @@ -1168,26 +1161,6 @@ net.hydromatic aggdesigner-algorithm - - com.jayway.jsonpath - json-path - - - joda-time - joda-time - - - org.codehaus.janino - janino - - - org.codehaus.janino - commons-compiler - - - com.google.code.findbugs - jsr305 - @@ -1195,6 +1168,16 @@ calcite-babel ${calcite.version} + + org.codehaus.janino + janino + ${janino.version} + + + org.codehaus.janino + commons-compiler + ${janino.version} + com.jcabi jcabi-log From 47b66df2f2137f20f952f1dcb25d57bf514f1681 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 22:06:12 -0700 Subject: [PATCH 04/50] Bump express in /pinot-controller/src/main/resources (#12723) --- .../src/main/resources/package-lock.json | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/pinot-controller/src/main/resources/package-lock.json b/pinot-controller/src/main/resources/package-lock.json index 4bde712761a..d41e32c9eaa 100644 --- a/pinot-controller/src/main/resources/package-lock.json +++ b/pinot-controller/src/main/resources/package-lock.json @@ -1826,21 +1826,21 @@ "dev": true }, "node_modules/body-parser": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.0.tgz", - "integrity": "sha512-DfJ+q6EPcGKZD1QWUjSpqp+Q7bDQTsQIF4zfUAtZ6qk+H/3/QRhg9CEp39ss+/T2vw0+HaidC0ecJj/DRLIaKg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", + "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", "dev": true, "dependencies": { "bytes": "3.1.2", - "content-type": "~1.0.4", + "content-type": "~1.0.5", "debug": "2.6.9", "depd": "2.0.0", "destroy": "1.2.0", "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.10.3", - "raw-body": "2.5.1", + "qs": "6.11.0", + "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" }, @@ -2641,9 +2641,9 @@ ] }, "node_modules/content-type": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", - "integrity": 
"sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true, "engines": { "node": ">= 0.6" @@ -2658,9 +2658,9 @@ } }, "node_modules/cookie": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz", - "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==", + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==", "dev": true, "engines": { "node": ">= 0.6" @@ -4616,17 +4616,17 @@ "integrity": "sha512-DLHwOGYeGnATM6tOMOWgs9dbzCjO+DwO3YGaha2R6kmLCE5iL8dz5sOywWeJs4P1rhxpdaVILKhCB4mUrTbbGg==" }, "node_modules/express": { - "version": "4.18.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.18.1.tgz", - "integrity": "sha512-zZBcOX9TfehHQhtupq57OF8lFZ3UZi08Y97dwFCkD8p9d/d2Y3M+ykKcwaMDEL+4qyUolgBDX6AblpR3fL212Q==", + "version": "4.19.2", + "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", + "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", "dev": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.0", + "body-parser": "1.20.2", "content-disposition": "0.5.4", "content-type": "~1.0.4", - "cookie": "0.5.0", + "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", @@ -4642,7 +4642,7 @@ "parseurl": "~1.3.3", "path-to-regexp": "0.1.7", "proxy-addr": "~2.0.7", - "qs": "6.10.3", + "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", "send": "0.18.0", @@ -7525,7 +7525,7 @@ "node_modules/media-typer": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", - "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=", + "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", "dev": true, "engines": { "node": ">= 0.6" @@ -9993,9 +9993,9 @@ } }, "node_modules/qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "dependencies": { "side-channel": "^1.0.4" @@ -10070,9 +10070,9 @@ } }, "node_modules/raw-body": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz", - "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==", + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", + "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", "dev": true, "dependencies": { "bytes": "3.1.2", @@ -15582,21 +15582,21 @@ "dev": true }, "body-parser": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.0.tgz", - "integrity": 
"sha512-DfJ+q6EPcGKZD1QWUjSpqp+Q7bDQTsQIF4zfUAtZ6qk+H/3/QRhg9CEp39ss+/T2vw0+HaidC0ecJj/DRLIaKg==", + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz", + "integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==", "dev": true, "requires": { "bytes": "3.1.2", - "content-type": "~1.0.4", + "content-type": "~1.0.5", "debug": "2.6.9", "depd": "2.0.0", "destroy": "1.2.0", "http-errors": "2.0.0", "iconv-lite": "0.4.24", "on-finished": "2.4.1", - "qs": "6.10.3", - "raw-body": "2.5.1", + "qs": "6.11.0", + "raw-body": "2.5.2", "type-is": "~1.6.18", "unpipe": "1.0.0" }, @@ -16264,9 +16264,9 @@ } }, "content-type": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz", - "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA==", + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true }, "convert-source-map": { @@ -16278,9 +16278,9 @@ } }, "cookie": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz", - "integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==", + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==", "dev": true }, "cookie-signature": { @@ -17821,17 +17821,17 @@ "integrity": "sha512-DLHwOGYeGnATM6tOMOWgs9dbzCjO+DwO3YGaha2R6kmLCE5iL8dz5sOywWeJs4P1rhxpdaVILKhCB4mUrTbbGg==" }, "express": { - "version": "4.18.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.18.1.tgz", - "integrity": "sha512-zZBcOX9TfehHQhtupq57OF8lFZ3UZi08Y97dwFCkD8p9d/d2Y3M+ykKcwaMDEL+4qyUolgBDX6AblpR3fL212Q==", + "version": "4.19.2", + "resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz", + "integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==", "dev": true, "requires": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.0", + "body-parser": "1.20.2", "content-disposition": "0.5.4", "content-type": "~1.0.4", - "cookie": "0.5.0", + "cookie": "0.6.0", "cookie-signature": "1.0.6", "debug": "2.6.9", "depd": "2.0.0", @@ -17847,7 +17847,7 @@ "parseurl": "~1.3.3", "path-to-regexp": "0.1.7", "proxy-addr": "~2.0.7", - "qs": "6.10.3", + "qs": "6.11.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", "send": "0.18.0", @@ -20116,7 +20116,7 @@ "media-typer": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", - "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=", + "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", "dev": true }, "mem": { @@ -22065,9 +22065,9 @@ "dev": true }, "qs": { - "version": "6.10.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.10.3.tgz", - "integrity": "sha512-wr7M2E0OFRfIfJZjKGieI8lBKb7fRCH4Fv5KNPEs7gJ8jadvotdsS08PzOKR7opXhZ/Xkjtt3WF9g38drmyRqQ==", + "version": "6.11.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.11.0.tgz", + "integrity": "sha512-MvjoMCJwEarSbUYk5O+nmoSzSutSsTwF85zcHPQ9OrlFoZOYIjaqBAJIqIXjptyD5vThxGq52Xu/MaJzRkIk4Q==", "dev": true, "requires": { 
"side-channel": "^1.0.4" @@ -22123,9 +22123,9 @@ "dev": true }, "raw-body": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz", - "integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==", + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", + "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", "dev": true, "requires": { "bytes": "3.1.2", From 4c32feb16a9d058d4c3c8b73080dcb06e44e4c48 Mon Sep 17 00:00:00 2001 From: sullis Date: Mon, 25 Mar 2024 22:14:15 -0700 Subject: [PATCH 05/50] upgrade ow2 ASM to 9.7 (#12720) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6bb7476afeb..245dd4f7ff0 100644 --- a/pom.xml +++ b/pom.xml @@ -1361,7 +1361,7 @@ org.ow2.asm asm - 9.3 + 9.7 net.java.dev.jna From 7f266c3651a604a4272e92d1a0e8709f034b0647 Mon Sep 17 00:00:00 2001 From: sullis Date: Mon, 25 Mar 2024 22:14:42 -0700 Subject: [PATCH 06/50] upgrade kotlin to 1.9.23 (#12719) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 245dd4f7ff0..4906468cb64 100644 --- a/pom.xml +++ b/pom.xml @@ -208,7 +208,7 @@ 1.14.6 - 1.9.22 + 1.9.23 3.9.0 2.0.3 From 0c61bb67af6e7c0a3a55f3a5a5ec419d129d207c Mon Sep 17 00:00:00 2001 From: Shounak kulkarni Date: Tue, 26 Mar 2024 16:09:40 +0500 Subject: [PATCH 07/50] Bugfix. Avoid passing null table name input to translation util (#12726) --- .../api/resources/PinotTaskRestletResource.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTaskRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTaskRestletResource.java index 2c7703be5de..f6b38fdc88f 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTaskRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTaskRestletResource.java @@ -556,7 +556,7 @@ public List getCronSchedulerJobKeys() @Produces(MediaType.APPLICATION_JSON) @ApiOperation("Fetch cron scheduler job keys") public Map getCronSchedulerJobDetails( - @ApiParam(value = "Table name (with type suffix)") @QueryParam("tableName") String tableName, + @ApiParam(value = "Table name (with type suffix)", required = true) @QueryParam("tableName") String tableName, @ApiParam(value = "Task type") @QueryParam("taskType") String taskType, @Context HttpHeaders headers) throws SchedulerException { Scheduler scheduler = _pinotTaskManager.getScheduler(); @@ -618,15 +618,17 @@ public Map getCronSchedulerJobDetails( public Map scheduleTasks(@ApiParam(value = "Task type") @QueryParam("taskType") String taskType, @ApiParam(value = "Table name (with type suffix)") @QueryParam("tableName") String tableName, @Context HttpHeaders headers) { - tableName = DatabaseUtils.translateTableName(tableName, headers); if (taskType != null) { // Schedule task for the given task type - String taskName = tableName != null ? _pinotTaskManager.scheduleTask(taskType, tableName) + String taskName = tableName != null + ? 
_pinotTaskManager.scheduleTask(taskType, DatabaseUtils.translateTableName(tableName, headers)) : _pinotTaskManager.scheduleTask(taskType); return Collections.singletonMap(taskType, taskName); } else { // Schedule tasks for all task types - return tableName != null ? _pinotTaskManager.scheduleTasks(tableName) : _pinotTaskManager.scheduleTasks(); + return tableName != null + ? _pinotTaskManager.scheduleTasks(DatabaseUtils.translateTableName(tableName, headers)) + : _pinotTaskManager.scheduleTasks(); } } From eceffb9bca504d39ed2a20419a3f4e40cbc642d0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 09:33:34 -0700 Subject: [PATCH 08/50] Bump aws.sdk.version from 2.25.3 to 2.25.17 (#12725) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4906468cb64..918ffb6d834 100644 --- a/pom.xml +++ b/pom.xml @@ -167,7 +167,7 @@ 0.15.0 0.4.4 4.1 - 2.25.3 + 2.25.17 2.12.7 3.1.12 7.9.0 From 5201d2f8eed8432ab61480da78259e1430f95ae8 Mon Sep 17 00:00:00 2001 From: Xiang Fu Date: Tue, 26 Mar 2024 10:54:28 -0700 Subject: [PATCH 09/50] upgrade redis version in superset pinot docker image (#12729) --- docker/images/pinot-superset/requirements-db.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/images/pinot-superset/requirements-db.txt b/docker/images/pinot-superset/requirements-db.txt index caf7d4a3ff7..322ab46c63a 100644 --- a/docker/images/pinot-superset/requirements-db.txt +++ b/docker/images/pinot-superset/requirements-db.txt @@ -17,4 +17,4 @@ # under the License. # pinotdb>=0.4.5 -redis==4.5.4 +redis>=4.6.0,<5.0 From 2b47523fafe58a009042f6f2f0210246279bf51b Mon Sep 17 00:00:00 2001 From: Shreyaa Sharma <66686803+cypherean@users.noreply.github.com> Date: Tue, 26 Mar 2024 23:41:47 +0530 Subject: [PATCH 10/50] Consistency in API response for live broker (#12201) --- .../api/resources/PinotTableInstances.java | 7 +++- .../helix/core/PinotHelixResourceManager.java | 41 +++++++++++++++++-- ...inotHelixResourceManagerStatelessTest.java | 24 +++++++++++ 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableInstances.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableInstances.java index 0cac8cc8c41..7b6ee536624 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableInstances.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTableInstances.java @@ -145,6 +145,7 @@ public String getTableInstances( return ret.toString(); } + @Deprecated @GET @Path("/tables/{tableName}/livebrokers") @Authorize(targetType = TargetType.TABLE, paramName = "tableName", action = Actions.Table.GET_BROKER) @@ -175,9 +176,11 @@ public List getLiveBrokersForTable( @ApiResponses(value = { @ApiResponse(code = 200, message = "Success"), @ApiResponse(code = 500, message = "Internal server error") }) - public Map> getLiveBrokers(@Context HttpHeaders headers) { + public Map> getLiveBrokers(@Context HttpHeaders headers, + @ApiParam(value = "Table names (with or without type)", allowMultiple = true) @QueryParam("tables") + List tables) { try { - return _pinotHelixResourceManager.getTableToLiveBrokersMapping(headers.getHeaderString(DATABASE)); + return _pinotHelixResourceManager.getTableToLiveBrokersMapping(headers.getHeaderString(DATABASE), tables); } catch (Exception e) { throw new 
ControllerApplicationException(LOGGER, e.getMessage(), Response.Status.NOT_FOUND); } diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java index 1874ecea9f7..130389f04ef 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java @@ -26,6 +26,7 @@ import com.google.common.cache.LoadingCache; import com.google.common.collect.BiMap; import com.google.common.collect.HashBiMap; +import com.google.common.collect.Sets; import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; @@ -3984,7 +3985,7 @@ public TableStats getTableStats(String tableNameWithType) { * @return Map of tableName to list of ONLINE brokers serving the table */ public Map> getTableToLiveBrokersMapping() { - return getTableToLiveBrokersMapping(null); + return getTableToLiveBrokersMapping(null, null); } /** @@ -3993,11 +3994,39 @@ public Map> getTableToLiveBrokersMapping() { * @return Map of tableName to list of ONLINE brokers serving the table */ public Map> getTableToLiveBrokersMapping(@Nullable String databaseName) { + return getTableToLiveBrokersMapping(databaseName, null); + } + + /** + * Returns map of tableName in default database to list of live brokers + * @param tables table list to get the tables from + * @return Map of tableName to list of ONLINE brokers serving the table + */ + public Map> getTableToLiveBrokersMapping(@Nullable List tables) { + return getTableToLiveBrokersMapping(null, tables); + } + + /** + * Returns map of tableName to list of live brokers + * @param databaseName database to get the tables from + * @param tables table list to get the tables from + * @return Map of tableName to list of ONLINE brokers serving the table + */ + public Map> getTableToLiveBrokersMapping(@Nullable String databaseName, + @Nullable List tables) { ExternalView ev = _helixDataAccessor.getProperty(_keyBuilder.externalView(Helix.BROKER_RESOURCE_INSTANCE)); if (ev == null) { throw new IllegalStateException("Failed to find external view for " + Helix.BROKER_RESOURCE_INSTANCE); } + Set tableSet = null; + if (CollectionUtils.isNotEmpty(tables)) { + tableSet = Sets.newHashSetWithExpectedSize(tables.size()); + for (String table : tables) { + tableSet.add(DatabaseUtils.translateTableName(table, databaseName)); + } + } + // Map of instanceId -> InstanceConfig Map instanceConfigMap = HelixHelper.getInstanceConfigs(_helixZkManager).stream() .collect(Collectors.toMap(InstanceConfig::getInstanceName, Function.identity())); @@ -4005,8 +4034,12 @@ public Map> getTableToLiveBrokersMapping(@Nullable St Map> result = new HashMap<>(); ZNRecord znRecord = ev.getRecord(); for (Map.Entry> tableToBrokersEntry : znRecord.getMapFields().entrySet()) { - String tableName = tableToBrokersEntry.getKey(); - if (!DatabaseUtils.isPartOfDatabase(tableName, databaseName)) { + String tableNameWithType = tableToBrokersEntry.getKey(); + if (!DatabaseUtils.isPartOfDatabase(tableNameWithType, databaseName)) { + continue; + } + if (tableSet != null && !tableSet.contains(tableNameWithType) && !tableSet.contains( + TableNameBuilder.extractRawTableName(tableNameWithType))) { continue; } Map brokersToState = tableToBrokersEntry.getValue(); @@ -4019,7 +4052,7 @@ public Map> getTableToLiveBrokersMapping(@Nullable St } 
} if (!hosts.isEmpty()) { - result.put(tableName, hosts); + result.put(tableNameWithType, hosts); } } return result; diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerStatelessTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerStatelessTest.java index 658808daf34..78460c4d2e2 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerStatelessTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManagerStatelessTest.java @@ -410,6 +410,30 @@ public void testGetLiveBrokers() assertThrows(TableNotFoundException.class, () -> _helixResourceManager.getLiveBrokersForTable("fake_OFFLINE")); assertThrows(TableNotFoundException.class, () -> _helixResourceManager.getLiveBrokersForTable("fake_REALTIME")); + // Test retrieving table name to live broker mapping for table without type suffix + Map> rawTableToLiveBrokersMapping = + _helixResourceManager.getTableToLiveBrokersMapping(null, List.of(RAW_TABLE_NAME)); + assertEquals(rawTableToLiveBrokersMapping.size(), 2); + assertEquals(rawTableToLiveBrokersMapping.get(OFFLINE_TABLE_NAME).size(), NUM_BROKER_INSTANCES); + assertEquals(rawTableToLiveBrokersMapping.get(REALTIME_TABLE_NAME).size(), NUM_BROKER_INSTANCES); + + // Test retrieving table names list to live broker mapping for each table without type suffix + Map> tablesListToLiveBrokersMapping = + _helixResourceManager.getTableToLiveBrokersMapping(List.of(OFFLINE_TABLE_NAME, REALTIME_TABLE_NAME)); + assertEquals(tablesListToLiveBrokersMapping.size(), 2); + assertEquals(tablesListToLiveBrokersMapping.get(OFFLINE_TABLE_NAME).size(), NUM_BROKER_INSTANCES); + assertEquals(tablesListToLiveBrokersMapping.get(REALTIME_TABLE_NAME).size(), NUM_BROKER_INSTANCES); + + // Test retrieving table name to live broker mapping for table with type suffix + Map> offlineTableToLiveBrokersMapping = + _helixResourceManager.getTableToLiveBrokersMapping(List.of(OFFLINE_TABLE_NAME)); + assertEquals(offlineTableToLiveBrokersMapping.size(), 1); + assertEquals(offlineTableToLiveBrokersMapping.get(OFFLINE_TABLE_NAME).size(), NUM_BROKER_INSTANCES); + + // Test that default value behaves the same as empty for optional argument + tableToLiveBrokersMapping = _helixResourceManager.getTableToLiveBrokersMapping(); + assertEquals(tableToLiveBrokersMapping.size(), 2); + // Delete the tables _helixResourceManager.deleteRealtimeTable(RAW_TABLE_NAME); _helixResourceManager.deleteOfflineTable(RAW_TABLE_NAME); From 3697552290f64b62e938152b891764adfb0ee031 Mon Sep 17 00:00:00 2001 From: Xiang Fu Date: Tue, 26 Mar 2024 13:54:14 -0700 Subject: [PATCH 11/50] Remove commons-logging exclusion (#12730) --- pinot-common/pom.xml | 6 ------ pinot-distribution/pom.xml | 4 ---- pinot-integration-test-base/pom.xml | 4 ---- pinot-integration-tests/pom.xml | 4 ---- .../pinot-stream-ingestion/pinot-pulsar/pom.xml | 6 ------ pom.xml | 16 ++++++++++++++++ 6 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pinot-common/pom.xml b/pinot-common/pom.xml index 65f9778177a..b6e47114e0f 100644 --- a/pinot-common/pom.xml +++ b/pinot-common/pom.xml @@ -193,12 +193,6 @@ org.apache.httpcomponents httpclient - - - commons-logging - commons-logging - - org.apache.httpcomponents diff --git a/pinot-distribution/pom.xml b/pinot-distribution/pom.xml index af5be1d2378..a9bb9f5ecd8 100644 --- a/pinot-distribution/pom.xml +++ b/pinot-distribution/pom.xml @@ 
-112,10 +112,6 @@ org.apache.hadoop hadoop-mapreduce-client-core - - commons-logging - commons-logging - diff --git a/pinot-integration-test-base/pom.xml b/pinot-integration-test-base/pom.xml index 2c6eb3dd31f..6f07a8f5d4d 100644 --- a/pinot-integration-test-base/pom.xml +++ b/pinot-integration-test-base/pom.xml @@ -60,10 +60,6 @@ org.slf4j slf4j-log4j12 - - commons-logging - commons-logging - diff --git a/pinot-integration-tests/pom.xml b/pinot-integration-tests/pom.xml index 1db7433c3f4..3280f652cab 100644 --- a/pinot-integration-tests/pom.xml +++ b/pinot-integration-tests/pom.xml @@ -196,10 +196,6 @@ org.slf4j slf4j-log4j12 - - commons-logging - commons-logging - diff --git a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml index eaaf7867296..8ef56f74fff 100644 --- a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml +++ b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml @@ -196,12 +196,6 @@ org.apache.pinot pinot-spi - - - commons-logging - commons-logging - - diff --git a/pom.xml b/pom.xml index 918ffb6d834..0d623d3d578 100644 --- a/pom.xml +++ b/pom.xml @@ -1701,6 +1701,22 @@ validate + + enforce-banned-dependencies + + enforce + + + + + + commons-logging:commons-logging + + + + true + + From af614ede58c338241e510fa944d29855b38a8b37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:46:17 -0700 Subject: [PATCH 12/50] Bump io.grpc:grpc-protobuf-lite from 1.19.0 to 1.62.2 (#12733) --- pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml index 8ef56f74fff..4ccd6b1d0c3 100644 --- a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml +++ b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml @@ -46,7 +46,7 @@ 2.6.2 1.17 1.2 - 1.19.0 + 1.62.2 From da2461675084c0a2bae570f3b88521a5d5d3cfa9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 10:23:43 -0700 Subject: [PATCH 13/50] Bump com.google.cloud:libraries-bom from 26.32.0 to 26.34.0 (#12740) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0d623d3d578..1cc9e2ecc00 100644 --- a/pom.xml +++ b/pom.xml @@ -192,7 +192,7 @@ 2.2 - 26.32.0 + 26.34.0 1.23.0 2.10.1 33.1.0-jre From 03e945bf206627b0e0b06fafc402377d5de08bfc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 10:24:02 -0700 Subject: [PATCH 14/50] Bump org.apache.maven.plugins:maven-assembly-plugin from 3.6.0 to 3.7.1 (#12741) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1cc9e2ecc00..3550227eeae 100644 --- a/pom.xml +++ b/pom.xml @@ -1587,7 +1587,7 @@ org.apache.maven.plugins maven-assembly-plugin - 3.6.0 + 3.7.1 maven-resources-plugin From bb2eb49475dcdefecc3c3a9e459868687e3b41ef Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" <17555551+Jackie-Jiang@users.noreply.github.com> Date: Thu, 28 Mar 2024 10:24:44 -0700 Subject: [PATCH 15/50] Replace custom fmpp plugin with fmpp-maven-plugin (#12737) --- contrib/pinot-fmpp-maven-plugin/pom.xml | 127 -------- .../java/org/apache/pinot/fmpp/FMPPMojo.java | 270 ------------------ .../apache/pinot/fmpp/MavenDataLoader.java | 55 ---- 
pinot-common/pom.xml | 57 ++-- pinot-common/src/main/codegen/config.fmpp | 2 + pom.xml | 48 ++-- 6 files changed, 59 insertions(+), 500 deletions(-) delete mode 100644 contrib/pinot-fmpp-maven-plugin/pom.xml delete mode 100644 contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/FMPPMojo.java delete mode 100644 contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/MavenDataLoader.java diff --git a/contrib/pinot-fmpp-maven-plugin/pom.xml b/contrib/pinot-fmpp-maven-plugin/pom.xml deleted file mode 100644 index 0fbdc44f48a..00000000000 --- a/contrib/pinot-fmpp-maven-plugin/pom.xml +++ /dev/null @@ -1,127 +0,0 @@ - - - - 4.0.0 - - pinot - org.apache.pinot - 1.2.0-SNAPSHOT - ../.. - - - pinot-fmpp-maven-plugin - Pinot FMPP plugin - https://pinot.apache.org/ - maven-plugin - - ${basedir}/../.. - 3.8.2 - 0.9.16 - 2.3.32 - - - - - commons-io - commons-io - - - org.apache.maven - maven-core - ${maven.version} - - - org.codehaus.plexus - plexus-utils - - - org.eclipse.sisu - org.eclipse.sisu.plexus - - - - - org.apache.maven - maven-plugin-api - ${maven.version} - - - org.eclipse.sisu - org.eclipse.sisu.plexus - - - - - net.sourceforge.fmpp - fmpp - ${fmpp.version} - - - org.freemarker - freemarker - - - - - org.freemarker - freemarker - ${freemarker.version} - - - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - - true - - - - org.apache.maven.plugins - maven-plugin-plugin - - pinot-fmpp - - - - default-descriptor - - descriptor - - process-classes - - - help-descriptor - - helpmojo - - process-classes - - - - - - diff --git a/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/FMPPMojo.java b/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/FMPPMojo.java deleted file mode 100644 index 787ac7606cd..00000000000 --- a/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/FMPPMojo.java +++ /dev/null @@ -1,270 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.fmpp; - -import com.google.common.base.Joiner; -import com.google.common.base.Stopwatch; -import fmpp.Engine; -import fmpp.ProgressListener; -import fmpp.progresslisteners.TerseConsoleProgressListener; -import fmpp.setting.Settings; -import fmpp.util.MiscUtil; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; -import org.apache.commons.io.FileUtils; -import org.apache.maven.plugin.AbstractMojo; -import org.apache.maven.plugin.MojoExecutionException; -import org.apache.maven.plugin.MojoFailureException; -import org.apache.maven.project.MavenProject; - -import static java.lang.String.format; - - -/** - * a maven plugin to run the freemarker generation incrementally - * (if output has not changed, the files are not touched) - * - * @goal generate - * @phase generate-sources - */ -public class FMPPMojo extends AbstractMojo { - - /** - * Used to add new source directories to the build. - * - * @parameter default-value="${project}" - * @required - * @readonly - **/ - private MavenProject project; - - /** - * Where to find the FreeMarker template files. - * - * @parameter default-value="src/main/resources/fmpp/templates/" - * @required - */ - private File templates; - - /** - * Where to write the generated files of the output files. - * - * @parameter default-value="${project.build.directory}/generated-sources/fmpp/" - * @required - */ - private File output; - - /** - * Location of the FreeMarker config file. - * - * @parameter default-value="src/main/resources/fmpp/config.fmpp" - * @required - */ - private File config; - - /** - * compilation scope to be added to ("compile" or "test") - * - * @parameter default-value="compile" - * @required - */ - private String scope; - - /** - * FMPP data model build parameter. - * - * @see FMPP Data Model Building - * @parameter default-value="" - */ - private String data; - - /** - * if maven properties are added as data - * - * @parameter default-value="true" - * @required - */ - private boolean addMavenDataLoader; - - @Override - public void execute() - throws MojoExecutionException, MojoFailureException { - if (project == null) { - throw new MojoExecutionException("This plugin can only be used inside a project."); - } - String outputPath = output.getAbsolutePath(); - if ((!output.exists() && !output.mkdirs()) || !output.isDirectory()) { - throw new MojoFailureException("can not write to output dir: " + outputPath); - } - String templatesPath = templates.getAbsolutePath(); - if (!templates.exists() || !templates.isDirectory()) { - throw new MojoFailureException("templates not found in dir: " + outputPath); - } - - // add the output directory path to the project source directories - switch (scope) { - case "compile": - project.addCompileSourceRoot(outputPath); - break; - case "test": - project.addTestCompileSourceRoot(outputPath); - break; - default: - throw new MojoFailureException("scope must be compile or test"); - } - - final Stopwatch sw = Stopwatch.createStarted(); - try { - getLog().info( - format("Freemarker generation:\n scope: %s,\n config: %s,\n templates: %s", scope, config.getAbsolutePath(), - templatesPath)); - final File tmp = Files.createTempDirectory("freemarker-tmp").toFile(); - String tmpPath = tmp.getAbsolutePath(); - final String tmpPathNormalized = tmpPath.endsWith(File.separator) ? 
tmpPath : tmpPath + File.separator; - Settings settings = new Settings(new File(".")); - settings.set(Settings.NAME_SOURCE_ROOT, templatesPath); - settings.set(Settings.NAME_OUTPUT_ROOT, tmp.getAbsolutePath()); - settings.load(config); - settings.addProgressListener(new TerseConsoleProgressListener()); - settings.addProgressListener(new ProgressListener() { - @Override - public void notifyProgressEvent(Engine engine, int event, File src, int pMode, Throwable error, Object param) - throws Exception { - if (event == EVENT_END_PROCESSING_SESSION) { - getLog().info(format("Freemarker generation took %dms", sw.elapsed(TimeUnit.MILLISECONDS))); - sw.reset(); - Report report = moveIfChanged(tmp, tmpPathNormalized); - if (!tmp.delete()) { - throw new MojoFailureException(format("can not delete %s", tmp)); - } - getLog().info(format("Incremental output update took %dms", sw.elapsed(TimeUnit.MILLISECONDS))); - getLog().info(format("new: %d", report.newFiles)); - getLog().info(format("changed: %d", report.changedFiles)); - getLog().info(format("unchanged: %d", report.unchangedFiles)); - } - } - }); - List dataValues = new ArrayList<>(); - if (addMavenDataLoader) { - getLog().info("Adding maven data loader"); - settings.setEngineAttribute(MavenDataLoader.MAVEN_DATA_ATTRIBUTE, new MavenDataLoader.MavenData(project)); - dataValues.add(format("maven: %s()", MavenDataLoader.class.getName())); - } - if (data != null) { - dataValues.add(data); - } - if (!dataValues.isEmpty()) { - String dataString = Joiner.on(",").join(dataValues); - getLog().info("Setting data loader " + dataString); - - settings.add(Settings.NAME_DATA, dataString); - } - settings.execute(); - } catch (Exception e) { - throw new MojoFailureException(MiscUtil.causeMessages(e), e); - } - } - - private static final class Report { - int changedFiles; - int unchangedFiles; - int newFiles; - - Report(int changedFiles, int unchangedFiles, int newFiles) { - super(); - this.changedFiles = changedFiles; - this.unchangedFiles = unchangedFiles; - this.newFiles = newFiles; - } - - public Report() { - this(0, 0, 0); - } - - void add(Report other) { - changedFiles += other.changedFiles; - unchangedFiles += other.unchangedFiles; - newFiles += other.newFiles; - } - - public void addChanged() { - ++changedFiles; - } - - public void addNew() { - ++newFiles; - } - - public void addUnchanged() { - ++unchangedFiles; - } - } - - private Report moveIfChanged(File root, String tmpPath) - throws MojoFailureException, IOException { - Report report = new Report(); - for (File file : root.listFiles()) { - if (file.isDirectory()) { - report.add(moveIfChanged(file, tmpPath)); - if (!file.delete()) { - throw new MojoFailureException(format("can not delete %s", file)); - } - } else { - String absPath = file.getAbsolutePath(); - if (!absPath.startsWith(tmpPath)) { - throw new MojoFailureException(format("%s should start with %s", absPath, tmpPath)); - } - String relPath = absPath.substring(tmpPath.length()); - File outputFile = new File(output, relPath); - if (!outputFile.exists()) { - report.addNew(); - } else if (!FileUtils.contentEquals(file, outputFile)) { - getLog().info(format("%s has changed", relPath)); - if (!outputFile.delete()) { - throw new MojoFailureException(format("can not delete %s", outputFile)); - } - report.addChanged(); - } else { - report.addUnchanged(); - } - if (!outputFile.exists()) { - File parentDir = outputFile.getParentFile(); - if (parentDir.exists() && !parentDir.isDirectory()) { - throw new MojoFailureException( - format("can not move %s to 
%s as %s is not a dir", file, outputFile, parentDir)); - } - if (!parentDir.exists() && !parentDir.mkdirs()) { - throw new MojoFailureException( - format("can not move %s to %s as dir %s can not be created", file, outputFile, parentDir)); - } - FileUtils.moveFile(file, outputFile); - } else { - if (!file.delete()) { - throw new MojoFailureException(format("can not delete %s", file)); - } - } - } - } - return report; - } -} diff --git a/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/MavenDataLoader.java b/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/MavenDataLoader.java deleted file mode 100644 index df85ad891b1..00000000000 --- a/contrib/pinot-fmpp-maven-plugin/src/main/java/org/apache/pinot/fmpp/MavenDataLoader.java +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.pinot.fmpp; - -import fmpp.Engine; -import fmpp.tdd.DataLoader; -import java.util.List; -import org.apache.maven.project.MavenProject; - - -/** - * A data loader for Maven - */ -public class MavenDataLoader implements DataLoader { - public static final class MavenData { - private final MavenProject project; - - public MavenData(MavenProject project) { - this.project = project; - } - - public MavenProject getProject() { - return project; - } - } - - public static final String MAVEN_DATA_ATTRIBUTE = "maven.data"; - - @Override - public Object load(Engine e, List args) - throws Exception { - if (!args.isEmpty()) { - throw new IllegalArgumentException("maven model data loader has no parameters"); - } - - MavenData data = (MavenData) e.getAttribute(MAVEN_DATA_ATTRIBUTE); - return data; - } -} diff --git a/pinot-common/pom.xml b/pinot-common/pom.xml index b6e47114e0f..f9a3f64c415 100644 --- a/pinot-common/pom.xml +++ b/pinot-common/pom.xml @@ -42,6 +42,7 @@ true + org.apache.maven.plugins @@ -55,19 +56,21 @@ + org.xolstice.maven.plugins protobuf-maven-plugin + + + - org.apache.maven.plugins maven-dependency-plugin unpack-parser-template - initialize + generate-sources unpack @@ -76,6 +79,7 @@ org.apache.calcite calcite-core + ${calcite.version} jar true ${project.build.directory}/ @@ -86,44 +90,50 @@ + - org.apache.pinot - pinot-fmpp-maven-plugin - ${project.version} + maven-resources-plugin - generate-fmpp-sources + copy-fmpp-resources generate-sources - generate + copy-resources - ${project.basedir}/src/main/codegen/config.fmpp - ${project.build.directory}/generated-sources/fmpp - ${project.build.directory}/codegen/templates - tdd(${project.basedir}/src/main/codegen/config.fmpp), default:tdd(${project.build.directory}/codegen/default_config.fmpp) + ${project.build.directory}/codegen + + + src/main/codegen + false + + + - org.codehaus.mojo - build-helper-maven-plugin + 
com.googlecode.fmpp-maven-plugin + fmpp-maven-plugin - add-generated-sources - process-sources + generate-fmpp-sources + generate-sources - add-source + generate - - ${project.build.directory}/generated-sources/javacc - + ${project.build.directory}/codegen/config.fmpp + ${project.build.directory}/generated-sources + ${project.build.directory}/codegen/templates + org.codehaus.mojo javacc-maven-plugin @@ -135,18 +145,19 @@ javacc - ${project.build.directory}/generated-sources/fmpp + ${project.build.directory}/generated-sources/javacc - **/Parser.jj + Parser.jj 2 false - ${project.build.directory}/generated-sources/javacc + ${project.build.directory}/generated-sources + diff --git a/pinot-common/src/main/codegen/config.fmpp b/pinot-common/src/main/codegen/config.fmpp index 178029a3b8a..c2fb71ea01d 100644 --- a/pinot-common/src/main/codegen/config.fmpp +++ b/pinot-common/src/main/codegen/config.fmpp @@ -18,6 +18,8 @@ # data: { + default: tdd("../default_config.fmpp") + # Data declarations for this parser. # # Default declarations are in default_config.fmpp; if you do not include a diff --git a/pom.xml b/pom.xml index 3550227eeae..fc4f5c7648a 100644 --- a/pom.xml +++ b/pom.xml @@ -56,8 +56,6 @@ pinot-connectors pinot-segment-local pinot-compatibility-verifier - contrib/pinot-fmpp-maven-plugin - pinot-query-planner pinot-query-runtime @@ -1777,6 +1775,29 @@ ${project.build.sourceEncoding} + + org.apache.maven.plugins + maven-remote-resources-plugin + 3.2.0 + + + com.googlecode.fmpp-maven-plugin + fmpp-maven-plugin + 1.0 + + + org.codehaus.mojo + javacc-maven-plugin + 3.1.0 + + + net.java.dev.javacc + javacc + + 5.0 + + + net.alchim31.maven scala-maven-plugin @@ -2218,7 +2239,6 @@ org.apache.maven.plugins maven-remote-resources-plugin - 1.6.0 @@ -2235,28 +2255,6 @@ - - org.codehaus.mojo - build-helper-maven-plugin - 3.5.0 - - - org.codehaus.mojo - javacc-maven-plugin - 2.6 - - - net.java.dev.javacc - javacc - 7.0.13 - - - - - org.apache.maven.plugins - maven-plugin-plugin - 3.6.0 - From 470f8ba2e07254df2312c8961c35baedf3db35ad Mon Sep 17 00:00:00 2001 From: marregui Date: Thu, 28 Mar 2024 18:28:05 +0100 Subject: [PATCH 16/50] Add GZIP Compression Codec (#11434) (#12668) --- .../NoDictionaryCompressionQueriesTest.java | 73 ++++- ...nchmarkNoDictionaryIntegerCompression.java | 44 ++- .../BenchmarkNoDictionaryLongCompression.java | 41 ++- ...enchmarkNoDictionaryStringCompression.java | 186 ++++++------- .../compression/ChunkCompressorFactory.java | 9 +- .../local/io/compression/GzipCompressor.java | 66 +++++ .../io/compression/GzipDecompressor.java | 63 +++++ .../impl/BaseChunkForwardIndexWriter.java | 2 +- .../VarByteChunkForwardIndexWriterV4.java | 1 + .../forward/BaseChunkForwardIndexReader.java | 10 +- .../VarByteChunkForwardIndexReaderV4.java | 1 + .../local/io/compression/TestCompression.java | 127 +++++++-- .../VarByteChunkSVForwardIndexTest.java | 13 + .../index/loader/ForwardIndexHandlerTest.java | 261 +++++++++--------- .../spi/compression/ChunkCompressionType.java | 2 +- .../spi/compression/ChunkCompressor.java | 8 +- .../spi/compression/ChunkDecompressor.java | 8 +- .../segment/spi/index/FieldIndexConfigs.java | 6 +- .../segment/spi/index/ForwardIndexConfig.java | 4 + .../pinot/spi/config/table/FieldConfig.java | 13 +- 20 files changed, 659 insertions(+), 279 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java create mode 100644 
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/NoDictionaryCompressionQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/NoDictionaryCompressionQueriesTest.java index 9fe631aa8f4..7ae5dbe2cff 100644 --- a/pinot-core/src/test/java/org/apache/pinot/queries/NoDictionaryCompressionQueriesTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/queries/NoDictionaryCompressionQueriesTest.java @@ -69,16 +69,18 @@ public class NoDictionaryCompressionQueriesTest extends BaseQueriesTest { private static final String PASS_THROUGH_STRING = "PASS_THROUGH_STRING"; private static final String ZSTANDARD_STRING = "ZSTANDARD_STRING"; private static final String LZ4_STRING = "LZ4_STRING"; + private static final String GZIP_STRING = "GZIP_STRING"; private static final String SNAPPY_LONG = "SNAPPY_LONG"; private static final String PASS_THROUGH_LONG = "PASS_THROUGH_LONG"; private static final String ZSTANDARD_LONG = "ZSTANDARD_LONG"; private static final String LZ4_LONG = "LZ4_LONG"; - + private static final String GZIP_LONG = "GZIP_LONG"; private static final String SNAPPY_INTEGER = "SNAPPY_INTEGER"; private static final String PASS_THROUGH_INTEGER = "PASS_THROUGH_INTEGER"; private static final String ZSTANDARD_INTEGER = "ZSTANDARD_INTEGER"; private static final String LZ4_INTEGER = "LZ4_INTEGER"; + private static final String GZIP_INTEGER = "GZIP_INTEGER"; private static final List RAW_SNAPPY_INDEX_COLUMNS = Arrays.asList(SNAPPY_STRING, SNAPPY_LONG, SNAPPY_INTEGER); @@ -90,6 +92,7 @@ public class NoDictionaryCompressionQueriesTest extends BaseQueriesTest { Arrays.asList(PASS_THROUGH_STRING, PASS_THROUGH_LONG, PASS_THROUGH_INTEGER); private static final List RAW_LZ4_INDEX_COLUMNS = Arrays.asList(LZ4_STRING, LZ4_LONG, LZ4_INTEGER); + private static final List RAW_GZIP_INDEX_COLUMNS = Arrays.asList(GZIP_STRING, GZIP_LONG, GZIP_INTEGER); private IndexSegment _indexSegment; private List _indexSegments; @@ -123,6 +126,7 @@ public void setUp() indexColumns.addAll(RAW_PASS_THROUGH_INDEX_COLUMNS); indexColumns.addAll(RAW_ZSTANDARD_INDEX_COLUMNS); indexColumns.addAll(RAW_LZ4_INDEX_COLUMNS); + indexColumns.addAll(RAW_GZIP_INDEX_COLUMNS); indexLoadingConfig.addNoDictionaryColumns(indexColumns); ImmutableSegment immutableSegment = @@ -143,7 +147,7 @@ private void buildSegment() List fieldConfigs = new ArrayList<>( RAW_SNAPPY_INDEX_COLUMNS.size() + RAW_ZSTANDARD_INDEX_COLUMNS.size() + RAW_PASS_THROUGH_INDEX_COLUMNS.size() - + RAW_LZ4_INDEX_COLUMNS.size()); + + RAW_LZ4_INDEX_COLUMNS.size() + RAW_GZIP_INDEX_COLUMNS.size()); for (String indexColumn : RAW_SNAPPY_INDEX_COLUMNS) { fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), @@ -165,11 +169,17 @@ private void buildSegment() FieldConfig.CompressionCodec.LZ4, null)); } + for (String indexColumn : RAW_GZIP_INDEX_COLUMNS) { + fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), + FieldConfig.CompressionCodec.GZIP, null)); + } + List noDictionaryColumns = new ArrayList<>(); noDictionaryColumns.addAll(RAW_SNAPPY_INDEX_COLUMNS); noDictionaryColumns.addAll(RAW_ZSTANDARD_INDEX_COLUMNS); noDictionaryColumns.addAll(RAW_PASS_THROUGH_INDEX_COLUMNS); noDictionaryColumns.addAll(RAW_LZ4_INDEX_COLUMNS); + noDictionaryColumns.addAll(RAW_GZIP_INDEX_COLUMNS); TableConfig tableConfig = new 
TableConfigBuilder(TableType.OFFLINE).setTableName(TABLE_NAME).setNoDictionaryColumns(noDictionaryColumns) @@ -179,14 +189,17 @@ private void buildSegment() .addSingleValueDimension(PASS_THROUGH_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(ZSTANDARD_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(LZ4_STRING, FieldSpec.DataType.STRING) + .addSingleValueDimension(GZIP_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(SNAPPY_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(ZSTANDARD_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(PASS_THROUGH_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(LZ4_INTEGER, FieldSpec.DataType.INT) + .addSingleValueDimension(GZIP_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(SNAPPY_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(ZSTANDARD_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(PASS_THROUGH_LONG, FieldSpec.DataType.LONG) - .addSingleValueDimension(LZ4_LONG, FieldSpec.DataType.LONG).build(); + .addSingleValueDimension(LZ4_LONG, FieldSpec.DataType.LONG) + .addSingleValueDimension(GZIP_LONG, FieldSpec.DataType.LONG).build(); SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); config.setOutDir(INDEX_DIR.getPath()); config.setTableName(TABLE_NAME); @@ -227,14 +240,17 @@ private List createTestData() { row.putValue(ZSTANDARD_STRING, tempStringRows[i]); row.putValue(PASS_THROUGH_STRING, tempStringRows[i]); row.putValue(LZ4_STRING, tempStringRows[i]); + row.putValue(GZIP_STRING, tempStringRows[i]); row.putValue(SNAPPY_INTEGER, tempIntRows[i]); row.putValue(ZSTANDARD_INTEGER, tempIntRows[i]); row.putValue(PASS_THROUGH_INTEGER, tempIntRows[i]); row.putValue(LZ4_INTEGER, tempIntRows[i]); + row.putValue(GZIP_INTEGER, tempIntRows[i]); row.putValue(SNAPPY_LONG, tempLongRows[i]); row.putValue(ZSTANDARD_LONG, tempLongRows[i]); row.putValue(PASS_THROUGH_LONG, tempLongRows[i]); row.putValue(LZ4_LONG, tempLongRows[i]); + row.putValue(GZIP_LONG, tempLongRows[i]); rows.add(row); } return rows; @@ -246,18 +262,19 @@ private List createTestData() { @Test public void testQueriesWithCompressionCodec() { String query = "SELECT SNAPPY_STRING, ZSTANDARD_STRING, PASS_THROUGH_STRING, LZ4_STRING, " - + "SNAPPY_INTEGER, ZSTANDARD_INTEGER, PASS_THROUGH_INTEGER, LZ4_INTEGER, " - + "SNAPPY_LONG, ZSTANDARD_LONG, PASS_THROUGH_LONG, LZ4_LONG FROM MyTable LIMIT 1000"; + + "GZIP_STRING, SNAPPY_INTEGER, ZSTANDARD_INTEGER, PASS_THROUGH_INTEGER, LZ4_INTEGER, " + + "GZIP_INTEGER, SNAPPY_LONG, ZSTANDARD_LONG, PASS_THROUGH_LONG, LZ4_LONG, GZIP_LONG FROM MyTable LIMIT 1000"; ArrayList expected = new ArrayList<>(); for (GenericRow row : _rows) { expected.add(new Serializable[]{ - String.valueOf(row.getValue(SNAPPY_STRING)), String.valueOf(row.getValue(ZSTANDARD_STRING)), - String.valueOf(row.getValue(PASS_THROUGH_STRING)), String.valueOf(row.getValue(LZ4_STRING)), - (Integer) row.getValue(SNAPPY_INTEGER), (Integer) row.getValue(ZSTANDARD_INTEGER), - (Integer) row.getValue(PASS_THROUGH_INTEGER), (Integer) row.getValue(LZ4_INTEGER), - (Long) row.getValue(SNAPPY_LONG), (Long) row.getValue(ZSTANDARD_LONG), (Long) row.getValue(PASS_THROUGH_LONG), - (Long) row.getValue(LZ4_LONG) + String.valueOf(row.getValue(SNAPPY_STRING)), String.valueOf(row.getValue(ZSTANDARD_STRING)), String.valueOf( + row.getValue(PASS_THROUGH_STRING)), String.valueOf(row.getValue(LZ4_STRING)), String.valueOf( + row.getValue(GZIP_STRING)), (Integer) row.getValue(SNAPPY_INTEGER), (Integer) row.getValue( + 
ZSTANDARD_INTEGER), (Integer) row.getValue(PASS_THROUGH_INTEGER), (Integer) row.getValue( + LZ4_INTEGER), (Integer) row.getValue(GZIP_INTEGER), (Long) row.getValue(SNAPPY_LONG), (Long) row.getValue( + ZSTANDARD_LONG), (Long) row.getValue(PASS_THROUGH_LONG), (Long) row.getValue(LZ4_LONG), (Long) row.getValue( + GZIP_LONG) }); } testSelectQueryHelper(query, expected.size(), expected); @@ -297,6 +314,23 @@ public void testLZ4IntegerFilterQueriesWithCompressionCodec() { testSelectQueryHelper(query, expected.size(), expected); } + /** + * Tests for filter over integer values GZIP compression codec queries. + */ + @Test + public void testGZIPIntegerFilterQueriesWithCompressionCodec() { + String query = "SELECT GZIP_INTEGER FROM MyTable WHERE GZIP_INTEGER > 1000 LIMIT 1000"; + ArrayList expected = new ArrayList<>(); + + for (GenericRow row : _rows) { + int value = (Integer) row.getValue(GZIP_INTEGER); + if (value > 1000) { + expected.add(new Serializable[]{value}); + } + } + testSelectQueryHelper(query, expected.size(), expected); + } + /** * Tests for filter over integer values compression codec queries. */ @@ -365,6 +399,23 @@ public void testLZ4StringFilterQueriesWithCompressionCodec() { testSelectQueryHelper(query, expected.size(), expected); } + /** + * Tests for filter over string values GZIP compression codec queries. + */ + @Test + public void testGZIPStringFilterQueriesWithCompressionCodec() { + String query = "SELECT GZIP_STRING FROM MyTable WHERE GZIP_STRING = 'hello_world_123' LIMIT 1000"; + ArrayList expected = new ArrayList<>(); + + for (GenericRow row : _rows) { + String value = String.valueOf(row.getValue(GZIP_STRING)); + if (value.equals("hello_world_123")) { + expected.add(new Serializable[]{value}); + } + } + testSelectQueryHelper(query, expected.size(), expected); + } + /** * Tests for filter over string values snappy compression codec queries. 
*/ diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java index 6c1a0e3ae21..f51ff94f15a 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java @@ -24,6 +24,10 @@ import java.util.concurrent.TimeUnit; import net.jpountz.lz4.LZ4Factory; import org.apache.commons.lang3.RandomUtils; +import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -68,7 +72,13 @@ public static class BenchmarkNoDictionaryIntegerCompressionState { private static ByteBuffer _lz4CompressedIntegerInput; private static ByteBuffer _lz4IntegerDecompressed; + private static ByteBuffer _gzipCompressedIntegerOutput; + private static ByteBuffer _gzipCompressedIntegerInput; + private static ByteBuffer _gzipIntegerDecompressed; + private static LZ4Factory _factory; + private static ChunkCompressor _gzipCompressor; + private static ChunkDecompressor _gzipDecompressor; @Setup(Level.Invocation) public void setUp() @@ -84,12 +94,14 @@ public void setUp() // position for lz4 is required _uncompressedInt.flip(); _factory.fastCompressor().compress(_uncompressedInt, _lz4CompressedIntegerInput); + _gzipCompressor.compress(_uncompressedInt, _gzipCompressedIntegerInput); _zstdIntegerDecompressed.rewind(); _zstandardCompressedIntegerInput.flip(); _uncompressedInt.flip(); _snappyIntegerDecompressed.rewind(); _lz4CompressedIntegerInput.flip(); + _gzipCompressedIntegerInput.flip(); } private void generateRandomIntegerBuffer() { @@ -102,8 +114,10 @@ private void generateRandomIntegerBuffer() { } private void initializeCompressors() { - //Initialize compressors and decompressors for lz4 + //Initialize compressors and decompressors for lz4 and gzip _factory = LZ4Factory.fastestInstance(); + _gzipCompressor = ChunkCompressorFactory.getCompressor(ChunkCompressionType.GZIP); + _gzipDecompressor = ChunkCompressorFactory.getDecompressor(ChunkCompressionType.GZIP); } private void allocateBufferMemory() { @@ -117,6 +131,9 @@ private void allocateBufferMemory() { _lz4CompressedIntegerOutput = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); _snappyCompressedIntegerOutput = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); _zstdCompressedIntegerOutput = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); + _gzipIntegerDecompressed = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); + _gzipCompressedIntegerOutput = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); + _gzipCompressedIntegerInput = ByteBuffer.allocateDirect(_uncompressedInt.capacity() * 2); } @TearDown(Level.Invocation) @@ -128,10 +145,13 @@ public void tearDown() _zstdIntegerDecompressed.clear(); _lz4CompressedIntegerOutput.clear(); _lz4IntegerDecompressed.clear(); + _gzipCompressedIntegerOutput.clear(); + _gzipIntegerDecompressed.clear(); _uncompressedInt.rewind(); _zstandardCompressedIntegerInput.rewind(); _lz4CompressedIntegerInput.rewind(); + _gzipCompressedIntegerInput.rewind(); } } @@ -207,9 
+227,27 @@ public int benchmarkLZ4HCIntegerDecompression(BenchmarkNoDictionaryIntegerCompre return state._lz4IntegerDecompressed.position(); } + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public int benchmarkGZIPIntegerCompression(BenchmarkNoDictionaryIntegerCompressionState state) + throws IOException { + state._gzipCompressor.compress(state._uncompressedInt, state._gzipCompressedIntegerOutput); + return state._gzipCompressedIntegerOutput.position(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public int benchmarkGZIPIntegerDecompression(BenchmarkNoDictionaryIntegerCompressionState state) + throws IOException { + state._gzipDecompressor.decompress(state._gzipCompressedIntegerInput, state._gzipIntegerDecompressed); + return state._gzipIntegerDecompressed.position(); + } + public static void main(String[] args) throws Exception { - new Runner(new OptionsBuilder().include(BenchmarkNoDictionaryIntegerCompression.class.getSimpleName()).build()) - .run(); + new Runner( + new OptionsBuilder().include(BenchmarkNoDictionaryIntegerCompression.class.getSimpleName()).build()).run(); } } diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java index b81d26a19d2..1819278b289 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java @@ -24,6 +24,10 @@ import java.util.concurrent.TimeUnit; import net.jpountz.lz4.LZ4Factory; import org.apache.commons.lang3.RandomUtils; +import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -67,8 +71,13 @@ public static class BenchmarkNoDictionaryLongCompressionState { private static ByteBuffer _lz4CompressedLongOutput; private static ByteBuffer _lz4CompressedLongInput; private static ByteBuffer _lz4LongDecompressed; + private static ByteBuffer _gzipCompressedLongOutput; + private static ByteBuffer _gzipCompressedLongInput; + private static ByteBuffer _gzipLongDecompressed; private static LZ4Factory _factory; + private static ChunkCompressor _gzipCompressor; + private static ChunkDecompressor _gzipDecompressor; @Setup(Level.Invocation) public void setUp() @@ -84,12 +93,14 @@ public void setUp() // position for lz4 is required _uncompressedLong.flip(); _factory.fastCompressor().compress(_uncompressedLong, _lz4CompressedLongInput); + _gzipCompressor.compress(_uncompressedLong, _gzipCompressedLongInput); _zstandardLongDecompressedOutput.rewind(); _zstandardCompressedLongInput.flip(); _uncompressedLong.flip(); _snappyLongDecompressedOutput.flip(); _lz4CompressedLongInput.flip(); + _gzipCompressedLongInput.flip(); } private void generateRandomLongBuffer() { @@ -102,8 +113,10 @@ private void generateRandomLongBuffer() { } private void initializeCompressors() { - //Initialize compressors and decompressors for lz4 + //Initialize compressors and decompressors for lz4 and gzip _factory = LZ4Factory.fastestInstance(); + _gzipCompressor = 
ChunkCompressorFactory.getCompressor(ChunkCompressionType.GZIP); + _gzipDecompressor = ChunkCompressorFactory.getDecompressor(ChunkCompressionType.GZIP); } private void allocateBufferMemory() { @@ -116,6 +129,9 @@ private void allocateBufferMemory() { _lz4LongDecompressed = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); _lz4CompressedLongOutput = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); _lz4CompressedLongInput = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); + _gzipLongDecompressed = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); + _gzipCompressedLongOutput = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); + _gzipCompressedLongInput = ByteBuffer.allocateDirect(_uncompressedLong.capacity() * 2); } @TearDown(Level.Invocation) @@ -127,10 +143,13 @@ public void tearDown() _zstandardLongDecompressedOutput.clear(); _lz4CompressedLongOutput.clear(); _lz4LongDecompressed.clear(); + _gzipCompressedLongOutput.clear(); + _gzipLongDecompressed.clear(); _uncompressedLong.rewind(); _zstandardCompressedLongInput.rewind(); _lz4CompressedLongInput.rewind(); + _gzipCompressedLongInput.rewind(); } } @@ -210,6 +229,26 @@ public int benchmarkLZ4HCLongDecompression( return state._lz4LongDecompressed.position(); } + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public int benchmarkGZIPLongCompression( + BenchmarkNoDictionaryLongCompression.BenchmarkNoDictionaryLongCompressionState state) + throws IOException { + state._gzipCompressor.compress(state._uncompressedLong, state._gzipCompressedLongOutput); + return state._gzipCompressedLongOutput.position(); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MILLISECONDS) + public int benchmarkGZIPLongDecompression( + BenchmarkNoDictionaryLongCompression.BenchmarkNoDictionaryLongCompressionState state) + throws IOException { + state._gzipDecompressor.decompress(state._gzipCompressedLongInput, state._gzipLongDecompressed); + return state._gzipLongDecompressed.position(); + } + public static void main(String[] args) throws Exception { new Runner(new OptionsBuilder().include(BenchmarkNoDictionaryLongCompression.class.getSimpleName()).build()).run(); diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java index 0cf5a3df7e3..50f7687c9ad 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java @@ -23,8 +23,11 @@ import java.nio.ByteBuffer; import java.util.Random; import java.util.concurrent.TimeUnit; -import net.jpountz.lz4.LZ4Factory; import org.apache.commons.lang3.RandomStringUtils; +import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -50,181 +53,172 @@ @Warmup(iterations = 3) @Measurement(iterations = 5) @State(Scope.Benchmark) -// Test to get memory statistics for snappy, zstandard and lz4 string compression techniques +// Test to get memory statistics for snappy, zstandard, lz4 
and gzip string compression techniques public class BenchmarkNoDictionaryStringCompression { @Param({"500000", "1000000", "2000000", "3000000", "4000000", "5000000"}) public static int _rowLength; - public static Random _random = new Random(); + private static final int MAX_CHARS_IN_LINE = 30; + private static final Random RANDOM = new Random(); + private static final ChunkCompressor LZ4_COMPRESSOR = ChunkCompressorFactory.getCompressor(ChunkCompressionType.LZ4); + private static final ChunkDecompressor LZ4_DECOMPRESSOR = + ChunkCompressorFactory.getDecompressor(ChunkCompressionType.LZ4); + private static final ChunkCompressor GZIP_COMPRESSOR = + ChunkCompressorFactory.getCompressor(ChunkCompressionType.GZIP); + private static final ChunkDecompressor GZIP_DECOMPRESSOR = + ChunkCompressorFactory.getDecompressor(ChunkCompressionType.GZIP); @State(Scope.Thread) - public static class BenchmarkNoDictionaryStringCompressionState { - private static ByteBuffer _uncompressedString; - private static ByteBuffer _snappyCompressedStringInput; - private static ByteBuffer _zstandardCompressedStringInput; - private static ByteBuffer _snappyCompressedStringOutput; - private static ByteBuffer _zstandardCompressedStringOutput; - private static ByteBuffer _snappyStringDecompressed; - private static ByteBuffer _zstandardStringDecompressed; - private static ByteBuffer _lz4CompressedStringOutput; - private static ByteBuffer _lz4CompressedStringInput; - private static ByteBuffer _lz4StringDecompressed; - - private static LZ4Factory _factory; + public static class CompressionBuffers { + + private ByteBuffer _snappyCompressedStringInput; + private ByteBuffer _zstandardCompressedStringInput; + private ByteBuffer _lz4CompressedStringInput; + private ByteBuffer _gzipCompressedStringInput; + private ByteBuffer _uncompressedString; + private ByteBuffer _stringDecompressed; + private ByteBuffer _stringCompressed; + + @Setup(Level.Trial) + public void setUp0() { + // generate random block of text alongside initialising memory buffers + byte[][] tempRows = new byte[_rowLength][]; + int size = 0; + for (int i = 0; i < _rowLength; i++) { + String value = RandomStringUtils.random(RANDOM.nextInt(MAX_CHARS_IN_LINE), true, true); + byte[] bytes = value.getBytes(UTF_8); + tempRows[i] = bytes; + size += bytes.length; + } + _uncompressedString = ByteBuffer.allocateDirect(size); + for (int i = 0; i < _rowLength; i++) { + _uncompressedString.put(tempRows[i]); + } + _uncompressedString.flip(); + + int capacity = _uncompressedString.capacity() * 2; + _stringDecompressed = ByteBuffer.allocateDirect(capacity); + _stringCompressed = ByteBuffer.allocateDirect(capacity); + _snappyCompressedStringInput = ByteBuffer.allocateDirect(capacity); + _zstandardCompressedStringInput = ByteBuffer.allocateDirect(capacity); + _lz4CompressedStringInput = ByteBuffer.allocateDirect(capacity); + _gzipCompressedStringInput = ByteBuffer.allocateDirect(capacity); + } @Setup(Level.Invocation) public void setUp() throws Exception { - initializeCompressors(); - generateRandomStringBuffer(); - allocateMemory(); - + _uncompressedString.rewind(); + _snappyCompressedStringInput.clear(); + _zstandardCompressedStringInput.clear(); + _lz4CompressedStringInput.clear(); + _gzipCompressedStringInput.clear(); + _stringDecompressed.clear(); + _stringCompressed.clear(); + + // prepare compressed buffers Snappy.compress(_uncompressedString, _snappyCompressedStringInput); Zstd.compress(_zstandardCompressedStringInput, _uncompressedString); // ZSTD compressor with change the 
position of _uncompressedString, a flip() operation over input to reset // position for lz4 is required _uncompressedString.flip(); - _factory.fastCompressor().compress(_uncompressedString, _lz4CompressedStringInput); - - _zstandardStringDecompressed.rewind(); _zstandardCompressedStringInput.flip(); - _uncompressedString.flip(); - _snappyStringDecompressed.flip(); - _lz4CompressedStringInput.flip(); - } - - private void initializeCompressors() { - //Initialize compressors and decompressors for lz4 - _factory = LZ4Factory.fastestInstance(); - } - - private void generateRandomStringBuffer() { - String[] tempRows = new String[_rowLength]; - int maxStringLengthInBytes = 0; - int numChars = 100; - for (int i = 0; i < _rowLength; i++) { - String value = RandomStringUtils.random(_random.nextInt(numChars), true, true); - maxStringLengthInBytes = Math.max(maxStringLengthInBytes, value.getBytes(UTF_8).length); - tempRows[i] = value; - } - - _uncompressedString = ByteBuffer.allocateDirect(_rowLength * maxStringLengthInBytes); - for (int i = 0; i < _rowLength; i++) { - _uncompressedString.put(tempRows[i].getBytes(UTF_8)); - } + LZ4_COMPRESSOR.compress(_uncompressedString, _lz4CompressedStringInput); _uncompressedString.flip(); - } - private void allocateMemory() { - _snappyCompressedStringOutput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _zstandardCompressedStringOutput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _snappyStringDecompressed = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _zstandardStringDecompressed = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _snappyCompressedStringInput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _zstandardCompressedStringInput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _lz4StringDecompressed = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _lz4CompressedStringOutput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); - _lz4CompressedStringInput = ByteBuffer.allocateDirect(_uncompressedString.capacity() * 2); + GZIP_COMPRESSOR.compress(_uncompressedString, _gzipCompressedStringInput); + _uncompressedString.flip(); } @TearDown(Level.Invocation) public void tearDown() throws Exception { - _snappyCompressedStringOutput.clear(); - _snappyStringDecompressed.clear(); - _zstandardCompressedStringOutput.clear(); - _zstandardStringDecompressed.clear(); - _lz4CompressedStringOutput.clear(); - _lz4StringDecompressed.clear(); - - _uncompressedString.rewind(); - _zstandardCompressedStringInput.rewind(); - _lz4CompressedStringInput.rewind(); + _snappyCompressedStringInput.clear(); + _zstandardCompressedStringInput.clear(); + _lz4CompressedStringInput.clear(); + _gzipCompressedStringInput.clear(); + _uncompressedString.clear(); + _stringDecompressed.clear(); + _stringCompressed.clear(); } } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkSnappyStringCompression(BenchmarkNoDictionaryStringCompressionState state) + public int benchmarkSnappyStringCompression(CompressionBuffers state) throws IOException { - int size = Snappy.compress(state._uncompressedString, state._snappyCompressedStringOutput); + int size = Snappy.compress(state._uncompressedString, state._stringCompressed); return size; } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkSnappyStringDecompression(BenchmarkNoDictionaryStringCompressionState 
state) + public int benchmarkSnappyStringDecompression(CompressionBuffers state) throws IOException { - int size = Snappy.uncompress(state._snappyCompressedStringInput, state._snappyStringDecompressed); + int size = Snappy.uncompress(state._snappyCompressedStringInput, state._stringDecompressed); return size; } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkZstandardStringCompression(BenchmarkNoDictionaryStringCompressionState state) - throws IOException { - int size = Zstd.compress(state._zstandardCompressedStringOutput, state._uncompressedString); + public int benchmarkZstandardStringCompression(CompressionBuffers state) { + int size = Zstd.compress(state._stringCompressed, state._uncompressedString); return size; } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkZstandardStringDecompression(BenchmarkNoDictionaryStringCompressionState state) - throws IOException { - int size = Zstd.decompress(state._zstandardStringDecompressed, state._zstandardCompressedStringInput); + public int benchmarkZstandardStringDecompression(CompressionBuffers state) { + int size = Zstd.decompress(state._stringDecompressed, state._zstandardCompressedStringInput); return size; } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkLZ4StringCompression( - BenchmarkNoDictionaryStringCompression.BenchmarkNoDictionaryStringCompressionState state) + public int benchmarkLZ4HCStringCompression(CompressionBuffers state) throws IOException { - state._factory.fastCompressor().compress(state._uncompressedString, state._lz4CompressedStringOutput); - return state._lz4CompressedStringOutput.position(); + LZ4_COMPRESSOR.compress(state._uncompressedString, state._stringCompressed); + return state._stringCompressed.position(); } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkLZ4StringDecompression( - BenchmarkNoDictionaryStringCompression.BenchmarkNoDictionaryStringCompressionState state) + public int benchmarkLZ4HCStringDecompression(CompressionBuffers state) throws IOException { - state._factory.fastDecompressor().decompress(state._lz4CompressedStringInput, state._lz4StringDecompressed); - return state._lz4StringDecompressed.position(); + LZ4_DECOMPRESSOR.decompress(state._lz4CompressedStringInput, state._stringDecompressed); + return state._stringDecompressed.position(); } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkLZ4HCStringCompression( - BenchmarkNoDictionaryStringCompression.BenchmarkNoDictionaryStringCompressionState state) + public int benchmarkGZIPStringCompression(CompressionBuffers state) throws IOException { - state._factory.highCompressor().compress(state._uncompressedString, state._lz4CompressedStringOutput); - return state._lz4CompressedStringOutput.position(); + GZIP_COMPRESSOR.compress(state._uncompressedString, state._stringCompressed); + return state._stringCompressed.position(); } @Benchmark @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.MILLISECONDS) - public int benchmarkLZ4HCStringDecompression( - BenchmarkNoDictionaryStringCompression.BenchmarkNoDictionaryStringCompressionState state) + public int benchmarkGZIPStringDecompression(CompressionBuffers state) throws IOException { - state._factory.fastDecompressor().decompress(state._lz4CompressedStringInput, state._lz4StringDecompressed); - return 
state._lz4StringDecompressed.position(); + GZIP_DECOMPRESSOR.decompress(state._gzipCompressedStringInput, state._stringDecompressed); + return state._stringDecompressed.position(); } public static void main(String[] args) throws Exception { - new Runner(new OptionsBuilder().include(BenchmarkNoDictionaryStringCompression.class.getSimpleName()).build()) - .run(); + new Runner( + new OptionsBuilder().include(BenchmarkNoDictionaryStringCompression.class.getSimpleName()).build()).run(); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java index b2d06b97180..15def2f733b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java @@ -50,8 +50,7 @@ public static ChunkCompressor getCompressor(ChunkCompressionType compressionType * size. Most formats do this anyway, but LZ4 requires a length prefix. * @return Compressor for the specified type. */ - public static ChunkCompressor getCompressor(ChunkCompressionType compressionType, - boolean upgradeToLengthPrefixed) { + public static ChunkCompressor getCompressor(ChunkCompressionType compressionType, boolean upgradeToLengthPrefixed) { switch (compressionType) { case PASS_THROUGH: @@ -69,6 +68,9 @@ public static ChunkCompressor getCompressor(ChunkCompressionType compressionType case LZ4_LENGTH_PREFIXED: return LZ4WithLengthCompressor.INSTANCE; + case GZIP: + return new GzipCompressor(); + default: throw new IllegalArgumentException("Illegal compressor name " + compressionType); } @@ -97,6 +99,9 @@ public static ChunkDecompressor getDecompressor(ChunkCompressionType compression case LZ4_LENGTH_PREFIXED: return LZ4WithLengthDecompressor.INSTANCE; + case GZIP: + return new GzipDecompressor(); + default: throw new IllegalArgumentException("Illegal compressor name " + compressionType); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java new file mode 100644 index 00000000000..3a83f7c8d2b --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.io.compression; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.zip.Deflater; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; + + +/** + * Implementation of {@link ChunkCompressor} using GZIP compression algorithm. + */ +class GzipCompressor implements ChunkCompressor { + + private final Deflater _compressor; + + public GzipCompressor() { + _compressor = new Deflater(); + } + + @Override + public int compress(ByteBuffer inUncompressed, ByteBuffer outCompressed) + throws IOException { + _compressor.reset(); + _compressor.setInput(inUncompressed); + _compressor.finish(); + _compressor.deflate(outCompressed); + outCompressed.flip(); + return outCompressed.limit(); + } + + @Override + public int maxCompressedSize(int uncompressedSize) { + // https://github.com/luvit/zlib/blob/8de57bce969eb9dafc1f1f5c256ac608d0a73ec4/compress.c#L75 + return uncompressedSize + (uncompressedSize >> 12) + (uncompressedSize >> 14) + (uncompressedSize >> 25) + 13; + } + + @Override + public ChunkCompressionType compressionType() { + return ChunkCompressionType.GZIP; + } + + @Override + public void close() + throws IOException { + _compressor.end(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java new file mode 100644 index 00000000000..b07d8acdbc0 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.compression; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; + + +/** + * Implementation of {@link ChunkDecompressor} using GZIP decompression algorithm. 
+ */ +class GzipDecompressor implements ChunkDecompressor { + + private final Inflater _decompressor; + + public GzipDecompressor() { + _decompressor = new Inflater(); + } + + @Override + public int decompress(ByteBuffer compressedInput, ByteBuffer decompressedOutput) + throws IOException { + _decompressor.reset(); + _decompressor.setInput(compressedInput); + try { + _decompressor.inflate(decompressedOutput); + } catch (DataFormatException e) { + throw new IOException(e); + } + decompressedOutput.flip(); + return decompressedOutput.limit(); + } + + @Override + public int decompressedLength(ByteBuffer compressedInput) { + return -1; + } + + @Override + public void close() + throws IOException { + _decompressor.end(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java index 70d8f387064..0cdff5ce615 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java @@ -114,6 +114,7 @@ public void close() _header.flip(); _dataFile.write(_header, 0); _dataFile.close(); + _chunkCompressor.close(); } /** @@ -192,7 +193,6 @@ protected void writeChunk() { } _dataOffset += sizeToWrite; - _chunkBuffer.clear(); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java index 868511a437c..440808a6b0b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java @@ -325,5 +325,6 @@ public void close() CleanerUtil.cleanQuietly(_compressionBuffer); CleanerUtil.cleanQuietly(_chunkBuffer); FileUtils.deleteQuietly(_dataBuffer); + _chunkCompressor.close(); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java index c7855ee54fd..745bd18fde0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java @@ -241,12 +241,10 @@ protected long getChunkPosition(int chunkId) { protected long getChunkPositionAndRecordRanges(int chunkId, List ranges) { if (_headerEntryChunkOffsetSize == Integer.BYTES) { - ranges.add( - new ByteRange(_dataHeaderStart + chunkId * _headerEntryChunkOffsetSize, Integer.BYTES)); + ranges.add(new ByteRange(_dataHeaderStart + chunkId * _headerEntryChunkOffsetSize, Integer.BYTES)); return _dataHeader.getInt(chunkId * _headerEntryChunkOffsetSize); } else { - ranges.add( - new ByteRange(_dataHeaderStart + chunkId * _headerEntryChunkOffsetSize, Long.BYTES)); + ranges.add(new ByteRange(_dataHeaderStart + chunkId * _headerEntryChunkOffsetSize, Long.BYTES)); return _dataHeader.getLong(chunkId * _headerEntryChunkOffsetSize); } 
} @@ -446,9 +444,11 @@ public void readValuesSV(int[] docIds, int length, double[] values, ChunkReaderC } @Override - public void close() { + public void close() + throws IOException { // NOTE: DO NOT close the PinotDataBuffer here because it is tracked by the caller and might be reused later. The // caller is responsible of closing the PinotDataBuffer. + _chunkDecompressor.close(); } private boolean isContiguousRange(int[] docIds, int length) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java index 47c30aec6b1..f0a3658cb3b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java @@ -266,6 +266,7 @@ public byte[][] getBytesMV(int docId, VarByteChunkForwardIndexReaderV4.ReaderCon @Override public void close() throws IOException { + _chunkDecompressor.close(); } @Override diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java index 9f711929e97..245803ec533 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java @@ -21,6 +21,10 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; @@ -41,45 +45,120 @@ public Object[][] formats() { buffer.put(input); buffer.flip(); return new Object[][]{ - {ChunkCompressionType.PASS_THROUGH, buffer.slice()}, - {ChunkCompressionType.SNAPPY, buffer.slice()}, - {ChunkCompressionType.LZ4, buffer.slice()}, - {ChunkCompressionType.LZ4_LENGTH_PREFIXED, buffer.slice()}, - {ChunkCompressionType.ZSTANDARD, buffer.slice()} + {ChunkCompressionType.PASS_THROUGH, buffer.slice()}, {ChunkCompressionType.SNAPPY, buffer.slice()}, + {ChunkCompressionType.LZ4, buffer.slice()}, {ChunkCompressionType.LZ4_LENGTH_PREFIXED, buffer.slice()}, + {ChunkCompressionType.ZSTANDARD, buffer.slice()}, {ChunkCompressionType.GZIP, buffer.slice()} }; } @Test(dataProvider = "formats") public void testRoundtrip(ChunkCompressionType type, ByteBuffer rawInput) throws IOException { - ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(type); - assertEquals(compressor.compressionType(), type, "upgrade is opt in"); - roundtrip(compressor, rawInput); + try (ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(type)) { + assertEquals(compressor.compressionType(), type, "upgrade is opt in"); + roundtrip(compressor, rawInput); + } } @Test(dataProvider = "formats") public void testRoundtripWithUpgrade(ChunkCompressionType type, ByteBuffer rawInput) throws IOException { - ChunkCompressor 
compressor = ChunkCompressorFactory.getCompressor(type, true); - assertNotEquals(compressor.compressionType(), ChunkCompressionType.LZ4, - "LZ4 compression type does not support length prefix"); - roundtrip(compressor, rawInput); + try (ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(type, true)) { + assertNotEquals(compressor.compressionType(), ChunkCompressionType.LZ4, + "LZ4 compression type does not support length prefix"); + roundtrip(compressor, rawInput); + } } - private void roundtrip(ChunkCompressor compressor, ByteBuffer rawInput) + @Test(dataProvider = "formats") + public void testConcurrent(ChunkCompressionType type, ByteBuffer ignore) { + + String expected = "The gzip file format is:\n" + + "- a 10-byte header, containing a magic number (1f 8b), the compression method (08 for DEFLATE), " + + "1-byte of header flags, a 4-byte timestamp, compression flags and the operating system ID.\n" + + "- optional extra headers as allowed by the header flags, including the original filename, a " + + "comment field, an 'extra' field, and the lower half of a CRC-32 checksum for the header section.\n" + + "- a body, containing a DEFLATE-compressed payload.\n" + + "- an 8-byte trailer, containing a CRC-32 checksum and the length of the original uncompressed " + + "data, modulo 232.[4]\n" + + "gzip is normally used to compress just single files. Compressed archives are typically created " + + "by assembling collections of files into a single tar archive and then compressing that archive " + + "with gzip.\n gzip is not to be confused with ZIP, which can hold collections of files without " + + "an external archiver, but is less compact than compressed tarballs holding the same data, because " + + "it compresses files individually and cannot take advantage of redundancy between files.\n\n"; + byte[] input = expected.getBytes(StandardCharsets.UTF_8); + ByteBuffer rawInput = ByteBuffer.allocateDirect(input.length).put(input).flip(); + + Thread[] workers = new Thread[5]; + ByteBuffer[] compressed = new ByteBuffer[workers.length]; + ByteBuffer[] decompressed = new ByteBuffer[workers.length]; + CountDownLatch done = new CountDownLatch(workers.length); + AtomicInteger errors = new AtomicInteger(); + for (int i = 0; i < workers.length; i++) { + int idx = i; + workers[i] = new Thread(() -> { + try { + // compress + try (ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(type)) { + compressed[idx] = ByteBuffer.allocateDirect(compressor.maxCompressedSize(rawInput.limit())); + compressor.compress(rawInput.slice(), compressed[idx]); + } + + // small context switch + TimeUnit.MILLISECONDS.sleep(1L + (long) (ThreadLocalRandom.current().nextDouble() * 10.0)); + + // decompress + try (ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(type)) { + int size = decompressor.decompressedLength(compressed[idx]); + if (type == ChunkCompressionType.LZ4 || type == ChunkCompressionType.GZIP) { + size = rawInput.limit(); + } + decompressed[idx] = ByteBuffer.allocateDirect(size); + decompressor.decompress(compressed[idx], decompressed[idx]); + } + } catch (Throwable e) { + e.printStackTrace(); + errors.incrementAndGet(); + } finally { + done.countDown(); + } + }); + workers[i].start(); + } + + try { + done.await(60L, TimeUnit.SECONDS); // it will not take this long + } catch (InterruptedException e) { + throw new AssertionError("timed-out"); + } + + // there are no errors + assertEquals(errors.get(), 0); + + // all decompressed buffers contain the original text + for (int i 
= 0; i < workers.length; i++) { + assertEquals(StandardCharsets.UTF_8.decode(decompressed[i]).toString(), expected); + compressed[i].clear(); + decompressed[i].clear(); + } + } + + private static void roundtrip(ChunkCompressor compressor, ByteBuffer rawInput) throws IOException { ByteBuffer compressedOutput = ByteBuffer.allocateDirect(compressor.maxCompressedSize(rawInput.limit())); compressor.compress(rawInput.slice(), compressedOutput); - ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(compressor.compressionType()); - int decompressedLength = decompressor.decompressedLength(compressedOutput); - assertTrue(compressor.compressionType() == ChunkCompressionType.LZ4 || decompressedLength > 0); - ByteBuffer decompressedOutput = ByteBuffer.allocateDirect( - compressor.compressionType() == ChunkCompressionType.LZ4 ? rawInput.limit() : decompressedLength); - decompressor.decompress(compressedOutput, decompressedOutput); - byte[] expected = new byte[rawInput.limit()]; - rawInput.get(expected); - byte[] actual = new byte[decompressedOutput.limit()]; - decompressedOutput.get(actual); - assertEquals(actual, expected, "content differs after compression roundt rip"); + try (ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(compressor.compressionType())) { + int decompressedLength = decompressor.decompressedLength(compressedOutput); + boolean isLz4OrGzip = compressor.compressionType() == ChunkCompressionType.LZ4 + || compressor.compressionType() == ChunkCompressionType.GZIP; + assertTrue(isLz4OrGzip || decompressedLength > 0); + ByteBuffer decompressedOutput = ByteBuffer.allocateDirect(isLz4OrGzip ? rawInput.limit() : decompressedLength); + decompressor.decompress(compressedOutput, decompressedOutput); + byte[] expected = new byte[rawInput.limit()]; + rawInput.get(expected); + byte[] actual = new byte[decompressedOutput.limit()]; + decompressedOutput.get(actual); + assertEquals(actual, expected, "content differs after compression roundtrip"); + } } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/VarByteChunkSVForwardIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/VarByteChunkSVForwardIndexTest.java index 17f169081bb..55551d9e930 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/VarByteChunkSVForwardIndexTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/VarByteChunkSVForwardIndexTest.java @@ -75,6 +75,12 @@ public void testWithLZ4Compression() test(ChunkCompressionType.LZ4); } + @Test + public void testWithGZIPCompression() + throws Exception { + test(ChunkCompressionType.GZIP); + } + /** * This test writes {@link #NUM_ENTRIES} using {@link VarByteChunkForwardIndexWriter}. 
It then reads * the strings & bytes using {@link VarByteChunkSVForwardIndexReader}, and asserts that what was written is the @@ -177,36 +183,43 @@ public void testVarCharWithDifferentSizes() testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 10, 1000); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 10, 1000); testLargeVarcharHelper(ChunkCompressionType.LZ4, 10, 1000); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 10, 1000); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 100, 1000); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 100, 1000); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 100, 1000); testLargeVarcharHelper(ChunkCompressionType.LZ4, 100, 1000); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 100, 1000); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 1000, 1000); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 1000, 1000); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 1000, 1000); testLargeVarcharHelper(ChunkCompressionType.LZ4, 1000, 1000); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 1000, 1000); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 10000, 100); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 10000, 100); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 10000, 100); testLargeVarcharHelper(ChunkCompressionType.LZ4, 10000, 100); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 10000, 100); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 100000, 10); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 100000, 10); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 100000, 10); testLargeVarcharHelper(ChunkCompressionType.LZ4, 100000, 10); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 100000, 10); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 1000000, 10); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 1000000, 10); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 1000000, 10); testLargeVarcharHelper(ChunkCompressionType.LZ4, 1000000, 10); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 1000000, 10); testLargeVarcharHelper(ChunkCompressionType.SNAPPY, 2000000, 10); testLargeVarcharHelper(ChunkCompressionType.PASS_THROUGH, 2000000, 10); testLargeVarcharHelper(ChunkCompressionType.ZSTANDARD, 2000000, 10); testLargeVarcharHelper(ChunkCompressionType.LZ4, 2000000, 10); + testLargeVarcharHelper(ChunkCompressionType.GZIP, 2000000, 10); } private void testLargeVarcharHelper(ChunkCompressionType compressionType, int numChars, int numDocs) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java index 53f6995a578..1df3e703641 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java @@ -86,21 +86,23 @@ public class ForwardIndexHandlerTest { private static final String DIM_PASS_THROUGH_STRING = "DIM_PASS_THROUGH_STRING"; private static final String DIM_ZSTANDARD_STRING = "DIM_ZSTANDARD_STRING"; private static final String DIM_LZ4_STRING = "DIM_LZ4_STRING"; + private static final String DIM_GZIP_STRING = "DIM_GZIP_STRING"; private static final String DIM_SNAPPY_LONG = "DIM_SNAPPY_LONG"; private static final String DIM_PASS_THROUGH_LONG = "DIM_PASS_THROUGH_LONG"; 
private static final String DIM_ZSTANDARD_LONG = "DIM_ZSTANDARD_LONG"; private static final String DIM_LZ4_LONG = "DIM_LZ4_LONG"; - + private static final String DIM_GZIP_LONG = "DIM_GZIP_LONG"; private static final String DIM_SNAPPY_INTEGER = "DIM_SNAPPY_INTEGER"; private static final String DIM_PASS_THROUGH_INTEGER = "DIM_PASS_THROUGH_INTEGER"; private static final String DIM_ZSTANDARD_INTEGER = "DIM_ZSTANDARD_INTEGER"; private static final String DIM_LZ4_INTEGER = "DIM_LZ4_INTEGER"; - + private static final String DIM_GZIP_INTEGER = "DIM_GZIP_INTEGER"; private static final String DIM_SNAPPY_BYTES = "DIM_SNAPPY_BYTES"; private static final String DIM_PASS_THROUGH_BYTES = "DIM_PASS_THROUGH_BYTES"; private static final String DIM_ZSTANDARD_BYTES = "DIM_ZSTANDARD_BYTES"; private static final String DIM_LZ4_BYTES = "DIM_LZ4_BYTES"; + private static final String DIM_GZIP_BYTES = "DIM_GZIP_BYTES"; // Sorted columns private static final String DIM_RAW_SORTED_INTEGER = "DIM_RAW_SORTED_INTEGER"; @@ -110,11 +112,13 @@ public class ForwardIndexHandlerTest { private static final String METRIC_SNAPPY_INTEGER = "METRIC_SNAPPY_INTEGER"; private static final String METRIC_ZSTANDARD_INTEGER = "METRIC_ZSTANDARD_INTEGER"; private static final String METRIC_LZ4_INTEGER = "METRIC_LZ4_INTEGER"; + private static final String METRIC_GZIP_INTEGER = "METRIC_GZIP_INTEGER"; private static final String METRIC_SNAPPY_BIG_DECIMAL = "METRIC_SNAPPY_BIG_DECIMAL"; private static final String METRIC_PASS_THROUGH_BIG_DECIMAL = "METRIC_PASS_THROUGH_BIG_DECIMAL"; private static final String METRIC_ZSTANDARD_BIG_DECIMAL = "METRIC_ZSTANDARD_BIG_DECIMAL"; private static final String METRIC_LZ4_BIG_DECIMAL = "METRIC_LZ4_BIG_DECIMAL"; + private static final String METRIC_GZIP_BIG_DECIMAL = "METRIC_GZIP_BIG_DECIMAL"; // Multi-value columns private static final String DIM_MV_PASS_THROUGH_INTEGER = "DIM_MV_PASS_THROUGH_INTEGER"; @@ -187,16 +191,20 @@ public class ForwardIndexHandlerTest { Arrays.asList(DIM_LZ4_STRING, DIM_LZ4_LONG, DIM_LZ4_INTEGER, DIM_LZ4_BYTES, METRIC_LZ4_BIG_DECIMAL, METRIC_LZ4_INTEGER); - private static final List DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX = Arrays.asList(DIM_DICT_INTEGER, - DIM_DICT_LONG, DIM_DICT_STRING, DIM_DICT_BYES, DIM_DICT_MV_BYTES, DIM_DICT_MV_STRING, - DIM_DICT_MV_INTEGER, DIM_DICT_MV_LONG); + private static final List RAW_GZIP_INDEX_COLUMNS = + Arrays.asList(DIM_GZIP_STRING, DIM_GZIP_LONG, DIM_GZIP_INTEGER, DIM_GZIP_BYTES, METRIC_GZIP_BIG_DECIMAL, + METRIC_GZIP_INTEGER); + + private static final List DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX = + Arrays.asList(DIM_DICT_INTEGER, DIM_DICT_LONG, DIM_DICT_STRING, DIM_DICT_BYES, DIM_DICT_MV_BYTES, + DIM_DICT_MV_STRING, DIM_DICT_MV_INTEGER, DIM_DICT_MV_LONG); private static final List DICT_ENABLED_MV_COLUMNS_WITH_FORWARD_INDEX = Arrays.asList(DIM_DICT_MV_INTEGER, DIM_DICT_MV_LONG, DIM_DICT_MV_STRING, DIM_DICT_MV_BYTES); - private static final List SV_FORWARD_INDEX_DISABLED_COLUMNS = Arrays.asList( - DIM_SV_FORWARD_INDEX_DISABLED_INTEGER, DIM_SV_FORWARD_INDEX_DISABLED_LONG, DIM_SV_FORWARD_INDEX_DISABLED_STRING, - DIM_SV_FORWARD_INDEX_DISABLED_BYTES); + private static final List SV_FORWARD_INDEX_DISABLED_COLUMNS = + Arrays.asList(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER, DIM_SV_FORWARD_INDEX_DISABLED_LONG, + DIM_SV_FORWARD_INDEX_DISABLED_STRING, DIM_SV_FORWARD_INDEX_DISABLED_BYTES); private static final List MV_FORWARD_INDEX_DISABLED_COLUMNS = Arrays.asList(DIM_MV_FORWARD_INDEX_DISABLED_INTEGER, DIM_MV_FORWARD_INDEX_DISABLED_LONG, @@ -241,13 +249,14 @@ 
private void buildSegment() List fieldConfigs = new ArrayList<>( RAW_SNAPPY_INDEX_COLUMNS.size() + RAW_SORTED_INDEX_COLUMNS.size() + RAW_ZSTANDARD_INDEX_COLUMNS.size() - + RAW_PASS_THROUGH_INDEX_COLUMNS.size() + RAW_LZ4_INDEX_COLUMNS.size() + + RAW_PASS_THROUGH_INDEX_COLUMNS.size() + RAW_LZ4_INDEX_COLUMNS.size() + RAW_GZIP_INDEX_COLUMNS.size() + SV_FORWARD_INDEX_DISABLED_COLUMNS.size() + MV_FORWARD_INDEX_DISABLED_COLUMNS.size() + MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size() + FORWARD_INDEX_DISABLED_RAW_COLUMNS.size() + 2); for (String indexColumn : RAW_SNAPPY_INDEX_COLUMNS) { - fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), - CompressionCodec.SNAPPY, null)); + fieldConfigs.add( + new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), CompressionCodec.SNAPPY, + null)); } for (String indexColumn : RAW_SORTED_INDEX_COLUMNS) { @@ -266,46 +275,56 @@ private void buildSegment() } for (String indexColumn : RAW_LZ4_INDEX_COLUMNS) { - fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), - CompressionCodec.LZ4, null)); + fieldConfigs.add( + new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), CompressionCodec.LZ4, + null)); + } + + for (String indexColumn : RAW_GZIP_INDEX_COLUMNS) { + fieldConfigs.add( + new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), CompressionCodec.GZIP, + null)); } for (String indexColumn : SV_FORWARD_INDEX_DISABLED_COLUMNS) { - fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, Collections.singletonList( - FieldConfig.IndexType.INVERTED), null, + fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, + Collections.singletonList(FieldConfig.IndexType.INVERTED), null, Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); } for (String indexColumn : MV_FORWARD_INDEX_DISABLED_COLUMNS) { - fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, Collections.singletonList( - FieldConfig.IndexType.INVERTED), null, + fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, + Collections.singletonList(FieldConfig.IndexType.INVERTED), null, Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); } for (String indexColumn : MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS) { - fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, Collections.singletonList( - FieldConfig.IndexType.INVERTED), null, + fieldConfigs.add(new FieldConfig(indexColumn, FieldConfig.EncodingType.DICTIONARY, + Collections.singletonList(FieldConfig.IndexType.INVERTED), null, Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); } for (String indexColumn : FORWARD_INDEX_DISABLED_RAW_COLUMNS) { fieldConfigs.add( new FieldConfig(indexColumn, FieldConfig.EncodingType.RAW, Collections.emptyList(), CompressionCodec.LZ4, - Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); + Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); } - fieldConfigs.add(new FieldConfig(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX, - FieldConfig.EncodingType.DICTIONARY, Collections.emptyList(), null, - Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); + fieldConfigs.add( + new 
FieldConfig(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX, FieldConfig.EncodingType.DICTIONARY, + Collections.emptyList(), null, + Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); - fieldConfigs.add(new FieldConfig(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITH_RANGE_INDEX, - FieldConfig.EncodingType.DICTIONARY, Collections.singletonList(FieldConfig.IndexType.RANGE), null, - Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); + fieldConfigs.add( + new FieldConfig(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITH_RANGE_INDEX, FieldConfig.EncodingType.DICTIONARY, + Collections.singletonList(FieldConfig.IndexType.RANGE), null, + Collections.singletonMap(FieldConfig.FORWARD_INDEX_DISABLED, Boolean.TRUE.toString()))); _noDictionaryColumns.addAll(RAW_SNAPPY_INDEX_COLUMNS); _noDictionaryColumns.addAll(RAW_ZSTANDARD_INDEX_COLUMNS); _noDictionaryColumns.addAll(RAW_PASS_THROUGH_INDEX_COLUMNS); _noDictionaryColumns.addAll(RAW_LZ4_INDEX_COLUMNS); + _noDictionaryColumns.addAll(RAW_GZIP_INDEX_COLUMNS); _noDictionaryColumns.addAll(FORWARD_INDEX_DISABLED_RAW_COLUMNS); _noDictionaryColumns.addAll(RAW_SORTED_INDEX_COLUMNS); @@ -330,30 +349,35 @@ private void buildSegment() .addSingleValueDimension(DIM_PASS_THROUGH_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(DIM_ZSTANDARD_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(DIM_LZ4_STRING, FieldSpec.DataType.STRING) + .addSingleValueDimension(DIM_GZIP_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(DIM_SNAPPY_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_RAW_SORTED_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_ZSTANDARD_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_PASS_THROUGH_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_LZ4_INTEGER, FieldSpec.DataType.INT) + .addSingleValueDimension(DIM_GZIP_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_SNAPPY_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(DIM_ZSTANDARD_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(DIM_PASS_THROUGH_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(DIM_LZ4_LONG, FieldSpec.DataType.LONG) + .addSingleValueDimension(DIM_GZIP_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(DIM_SNAPPY_BYTES, FieldSpec.DataType.BYTES) .addSingleValueDimension(DIM_PASS_THROUGH_BYTES, FieldSpec.DataType.BYTES) .addSingleValueDimension(DIM_ZSTANDARD_BYTES, FieldSpec.DataType.BYTES) .addSingleValueDimension(DIM_LZ4_BYTES, FieldSpec.DataType.BYTES) + .addSingleValueDimension(DIM_GZIP_BYTES, FieldSpec.DataType.BYTES) .addMetric(METRIC_SNAPPY_BIG_DECIMAL, FieldSpec.DataType.BIG_DECIMAL) .addMetric(METRIC_PASS_THROUGH_BIG_DECIMAL, FieldSpec.DataType.BIG_DECIMAL) .addMetric(METRIC_ZSTANDARD_BIG_DECIMAL, FieldSpec.DataType.BIG_DECIMAL) .addMetric(METRIC_LZ4_BIG_DECIMAL, FieldSpec.DataType.BIG_DECIMAL) + .addMetric(METRIC_GZIP_BIG_DECIMAL, FieldSpec.DataType.BIG_DECIMAL) .addSingleValueDimension(DIM_DICT_INTEGER, FieldSpec.DataType.INT) .addSingleValueDimension(DIM_DICT_LONG, FieldSpec.DataType.LONG) .addSingleValueDimension(DIM_DICT_STRING, FieldSpec.DataType.STRING) .addSingleValueDimension(DIM_DICT_BYES, FieldSpec.DataType.BYTES) .addMetric(METRIC_PASS_THROUGH_INTEGER, FieldSpec.DataType.INT) - .addMetric(METRIC_SNAPPY_INTEGER, FieldSpec.DataType.INT) - .addMetric(METRIC_LZ4_INTEGER, FieldSpec.DataType.INT) + .addMetric(METRIC_SNAPPY_INTEGER, FieldSpec.DataType.INT).addMetric(METRIC_LZ4_INTEGER, 
FieldSpec.DataType.INT) + .addMetric(METRIC_GZIP_INTEGER, FieldSpec.DataType.INT) .addMetric(METRIC_ZSTANDARD_INTEGER, FieldSpec.DataType.INT) .addMultiValueDimension(DIM_MV_PASS_THROUGH_INTEGER, FieldSpec.DataType.INT) .addMultiValueDimension(DIM_MV_PASS_THROUGH_LONG, FieldSpec.DataType.LONG) @@ -480,13 +504,16 @@ private List createTestData() { row.putValue(DIM_ZSTANDARD_STRING, tempStringRows[i]); row.putValue(DIM_PASS_THROUGH_STRING, tempStringRows[i]); row.putValue(DIM_LZ4_STRING, tempStringRows[i]); + row.putValue(DIM_GZIP_STRING, tempStringRows[i]); // Raw integer columns row.putValue(DIM_SNAPPY_INTEGER, tempIntRows[i]); row.putValue(DIM_ZSTANDARD_INTEGER, tempIntRows[i]); row.putValue(DIM_PASS_THROUGH_INTEGER, tempIntRows[i]); row.putValue(DIM_LZ4_INTEGER, tempIntRows[i]); + row.putValue(DIM_GZIP_INTEGER, tempIntRows[i]); row.putValue(METRIC_LZ4_INTEGER, tempIntRows[i]); + row.putValue(METRIC_GZIP_INTEGER, tempIntRows[i]); row.putValue(METRIC_PASS_THROUGH_INTEGER, tempIntRows[i]); row.putValue(METRIC_ZSTANDARD_INTEGER, tempIntRows[i]); row.putValue(METRIC_SNAPPY_INTEGER, tempIntRows[i]); @@ -497,18 +524,21 @@ private List createTestData() { row.putValue(DIM_ZSTANDARD_LONG, tempLongRows[i]); row.putValue(DIM_PASS_THROUGH_LONG, tempLongRows[i]); row.putValue(DIM_LZ4_LONG, tempLongRows[i]); + row.putValue(DIM_GZIP_LONG, tempLongRows[i]); // Raw Byte columns row.putValue(DIM_SNAPPY_BYTES, tempBytesRows[i]); row.putValue(DIM_ZSTANDARD_BYTES, tempBytesRows[i]); row.putValue(DIM_PASS_THROUGH_BYTES, tempBytesRows[i]); row.putValue(DIM_LZ4_BYTES, tempBytesRows[i]); + row.putValue(DIM_GZIP_BYTES, tempBytesRows[i]); // Raw BigDecimal column row.putValue(METRIC_SNAPPY_BIG_DECIMAL, tempBigDecimalRows[i]); row.putValue(METRIC_ZSTANDARD_BIG_DECIMAL, tempBigDecimalRows[i]); row.putValue(METRIC_PASS_THROUGH_BIG_DECIMAL, tempBigDecimalRows[i]); row.putValue(METRIC_LZ4_BIG_DECIMAL, tempBigDecimalRows[i]); + row.putValue(METRIC_GZIP_BIG_DECIMAL, tempBigDecimalRows[i]); // Dictionary SV columns row.putValue(DIM_DICT_INTEGER, tempIntRows[i]); @@ -556,7 +586,8 @@ private List createTestData() { } @Test - public void testComputeOperationNoOp() throws Exception { + public void testComputeOperationNoOp() + throws Exception { // Setup SegmentMetadataImpl existingSegmentMetadata = new SegmentMetadataImpl(_segmentDirectory); SegmentDirectory segmentLocalFSDirectory = @@ -574,7 +605,8 @@ public void testComputeOperationNoOp() throws Exception { } @Test - public void testComputeOperationEnableDictionary() throws Exception { + public void testComputeOperationEnableDictionary() + throws Exception { // Setup SegmentMetadataImpl existingSegmentMetadata = new SegmentMetadataImpl(_segmentDirectory); SegmentDirectory segmentLocalFSDirectory = @@ -628,13 +660,13 @@ public void testComputeOperationEnableDictionary() throws Exception { assertEquals(operationMap.get(DIM_RAW_SORTED_INTEGER), Collections.singletonList(ForwardIndexHandler.Operation.ENABLE_DICTIONARY)); - // Tear down segmentLocalFSDirectory.close(); } @Test - public void testComputeOperationDisableDictionary() throws Exception { + public void testComputeOperationDisableDictionary() + throws Exception { // Setup SegmentMetadataImpl existingSegmentMetadata = new SegmentMetadataImpl(_segmentDirectory); SegmentDirectory segmentLocalFSDirectory = @@ -677,7 +709,8 @@ public void testComputeOperationDisableDictionary() throws Exception { } @Test - public void testComputeOperationChangeCompression() throws Exception { + public void 
testComputeOperationChangeCompression() + throws Exception { // Setup SegmentMetadataImpl existingSegmentMetadata = new SegmentMetadataImpl(_segmentDirectory); SegmentDirectory segmentLocalFSDirectory = @@ -696,9 +729,8 @@ public void testComputeOperationChangeCompression() throws Exception { randIdx = rand.nextInt(fieldConfigs.size()); name = fieldConfigs.get(randIdx).getName(); } while (SV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) || MV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) - || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) - || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains(name) - || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) + || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains( + name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITH_RANGE_INDEX.equals(name)); FieldConfig config = fieldConfigs.remove(randIdx); CompressionCodec newCompressionType = null; @@ -794,8 +826,8 @@ public void testComputeOperationDisableForwardIndex() assertEquals(operationMap.size(), 1); Set operations = new HashSet<>(operationMap.get(DIM_LZ4_INTEGER)); assertEquals(operations.size(), 2); - Set expectedOperations = - new HashSet<>(Arrays.asList(ForwardIndexHandler.Operation.DISABLE_FORWARD_INDEX, + Set expectedOperations = new HashSet<>( + Arrays.asList(ForwardIndexHandler.Operation.DISABLE_FORWARD_INDEX, ForwardIndexHandler.Operation.ENABLE_DICTIONARY)); assertEquals(expectedOperations, operations); @@ -827,7 +859,7 @@ public void testComputeOperationDisableForwardIndex() operations = new HashSet<>(operationMap.get(DIM_LZ4_LONG)); assertEquals(operations.size(), 2); expectedOperations = new HashSet<>(Arrays.asList(ForwardIndexHandler.Operation.DISABLE_FORWARD_INDEX, - ForwardIndexHandler.Operation.ENABLE_DICTIONARY)); + ForwardIndexHandler.Operation.ENABLE_DICTIONARY)); assertEquals(expectedOperations, operations); operations = new HashSet<>(operationMap.get(DIM_SNAPPY_STRING)); assertEquals(operations.size(), 2); @@ -1108,8 +1140,7 @@ public void testChangeCompressionForSingleColumn() String columnName = config.getName(); FieldConfig newConfig = - new FieldConfig(columnName, FieldConfig.EncodingType.RAW, Collections.emptyList(), compressionType, - null); + new FieldConfig(columnName, FieldConfig.EncodingType.RAW, Collections.emptyList(), compressionType, null); fieldConfigs.add(newConfig); TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(TABLE_NAME) @@ -1237,9 +1268,8 @@ public void testChangeCompressionForMultipleColumns() randomIdx = rand.nextInt(fieldConfigs.size()); name = fieldConfigs.get(randomIdx).getName(); } while (SV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) || MV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) - || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) - || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains(name) - || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) + || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains( + name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITH_RANGE_INDEX.equals(name)); FieldConfig config1 = fieldConfigs.remove(randomIdx); String column1 = config1.getName(); @@ -1253,9 +1283,8 @@ public void testChangeCompressionForMultipleColumns() randomIdx = rand.nextInt(fieldConfigs.size()); name = 
fieldConfigs.get(randomIdx).getName(); } while (SV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) || MV_FORWARD_INDEX_DISABLED_COLUMNS.contains(name) - || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) - || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains(name) - || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) + || MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.contains(name) || FORWARD_INDEX_DISABLED_RAW_COLUMNS.contains( + name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX.equals(name) || DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITH_RANGE_INDEX.equals(name)); FieldConfig config2 = fieldConfigs.remove(randomIdx); String column2 = config2.getName(); @@ -1369,7 +1398,8 @@ public void testEnableDictionaryForMultipleColumns() } @Test - public void testEnableDictionaryForSortedColumn() throws Exception { + public void testEnableDictionaryForSortedColumn() + throws Exception { IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig(null, _tableConfig); for (int i = 0; i < RAW_SORTED_INDEX_COLUMNS.size(); i++) { @@ -1467,8 +1497,8 @@ public void testDisableForwardIndexForMultipleDictColumns() IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig(null, _tableConfig); Random rand = new Random(); - String col1 = DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX.get( - rand.nextInt(DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX.size())); + String col1 = + DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX.get(rand.nextInt(DICT_ENABLED_COLUMNS_WITH_FORWARD_INDEX.size())); indexLoadingConfig.addForwardIndexDisabledColumns(col1); indexLoadingConfig.addInvertedIndexColumns(col1); String col2; @@ -1648,8 +1678,7 @@ public void testDisableForwardIndexForMultipleRawColumns() IndexLoadingConfig indexLoadingConfig = new IndexLoadingConfig(null, _tableConfig); Random rand = new Random(); - String col1 = RAW_LZ4_INDEX_COLUMNS.get( - rand.nextInt(RAW_LZ4_INDEX_COLUMNS.size())); + String col1 = RAW_LZ4_INDEX_COLUMNS.get(rand.nextInt(RAW_LZ4_INDEX_COLUMNS.size())); indexLoadingConfig.addForwardIndexDisabledColumns(col1); indexLoadingConfig.removeNoDictionaryColumns(col1); indexLoadingConfig.addInvertedIndexColumns(col1); @@ -1678,10 +1707,10 @@ public void testDisableForwardIndexForMultipleRawColumns() } else if (dataType == FieldSpec.DataType.BIG_DECIMAL) { dictionaryElementSize = 4; } - validateMetadataProperties(col1, true, dictionaryElementSize, metadata.getCardinality(), - metadata.getTotalDocs(), dataType, metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), false); + validateMetadataProperties(col1, true, dictionaryElementSize, metadata.getCardinality(), metadata.getTotalDocs(), + dataType, metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), false); // Col2 validation. 
validateIndexMap(col2, true, true); @@ -1696,10 +1725,10 @@ public void testDisableForwardIndexForMultipleRawColumns() } else if (dataType == FieldSpec.DataType.BIG_DECIMAL) { dictionaryElementSize = 4; } - validateMetadataProperties(col2, true, dictionaryElementSize, metadata.getCardinality(), - metadata.getTotalDocs(), dataType, metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), false); + validateMetadataProperties(col2, true, dictionaryElementSize, metadata.getCardinality(), metadata.getTotalDocs(), + dataType, metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), false); } @Test @@ -1801,10 +1830,10 @@ public void testDisableForwardIndexForRawAndInvertedIndexDisabledColumns() // In column metadata, nothing other than hasDictionary and dictionaryElementSize should change. ColumnMetadata metadata = existingSegmentMetadata.getColumnMetadataFor(column); FieldSpec.DataType dataType = metadata.getDataType(); - validateMetadataProperties(column, false, 0, metadata.getCardinality(), - metadata.getTotalDocs(), dataType, metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), - metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), - metadata.getMinValue(), metadata.getMaxValue(), false); + validateMetadataProperties(column, false, 0, metadata.getCardinality(), metadata.getTotalDocs(), dataType, + metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), + metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), + metadata.getMaxValue(), false); } } @@ -1923,8 +1952,8 @@ public void testEnableForwardIndexInDictModeForMVForwardIndexDisabledColumnWithD Random rand = new Random(); // Remove from forward index list but keep the inverted index enabled - String column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS - .get(rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); + String column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.get( + rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); indexLoadingConfig.removeForwardIndexDisabledColumns(column); ForwardIndexHandler fwdIndexHandler = new ForwardIndexHandler(segmentLocalFSDirectory, indexLoadingConfig, _schema); @@ -2020,20 +2049,20 @@ public void testEnableForwardIndexInRawModeForMultipleForwardIndexDisabledColumn validateIndexMap(col1, false, false); validateForwardIndex(col1, CompressionCodec.LZ4, metadata.isSorted()); // In column metadata, nothing should change. 
- validateMetadataProperties(col1, false, 0, metadata.getCardinality(), - metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), false); + validateMetadataProperties(col1, false, 0, metadata.getCardinality(), metadata.getTotalDocs(), + metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), false); // Col2 validation. metadata = existingSegmentMetadata.getColumnMetadataFor(col2); validateIndexMap(col2, false, false); validateForwardIndex(col2, CompressionCodec.LZ4, metadata.isSorted()); // In column metadata, nothing should change. - validateMetadataProperties(col2, false, 0, metadata.getCardinality(), - metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), false); + validateMetadataProperties(col2, false, 0, metadata.getCardinality(), metadata.getTotalDocs(), + metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), false); } @Test @@ -2047,8 +2076,8 @@ public void testEnableForwardIndexInRawModeForMVForwardIndexDisabledColumnWithDu Random rand = new Random(); // Remove from forward index list but keep the inverted index enabled - String column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS - .get(rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); + String column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.get( + rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); indexLoadingConfig.removeForwardIndexDisabledColumns(column); indexLoadingConfig.removeInvertedIndexColumns(column); indexLoadingConfig.addNoDictionaryColumns(column); @@ -2066,10 +2095,10 @@ public void testEnableForwardIndexInRawModeForMVForwardIndexDisabledColumnWithDu validateForwardIndex(column, CompressionCodec.LZ4, metadata.isSorted()); // In column metadata, some values can change since MV columns with duplicates lose the duplicates on forward index // regeneration. 
- validateMetadataProperties(column, false, 0, metadata.getCardinality(), - metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), true); + validateMetadataProperties(column, false, 0, metadata.getCardinality(), metadata.getTotalDocs(), + metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), true); } @Test @@ -2111,11 +2140,10 @@ public void testEnableForwardIndexInRawModeForSingleForwardIndexDisabledColumn() validateForwardIndex(column, CompressionCodec.LZ4, metadata.isSorted()); // In column metadata, nothing should change. - validateMetadataProperties(column, false, 0, - metadata.getCardinality(), metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), - metadata.isSorted(), metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), - metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), - metadata.getMaxValue(), false); + validateMetadataProperties(column, false, 0, metadata.getCardinality(), metadata.getTotalDocs(), + metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), false); } } @@ -2146,8 +2174,7 @@ public void testEnableForwardIndexForInvertedIndexDisabledColumn() validateIndexMap(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX, true, true); validateIndexesForForwardIndexDisabledColumns(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX); - ForwardIndexHandler fwdIndexHandler = - new ForwardIndexHandler(segmentLocalFSDirectory, indexLoadingConfig, _schema); + ForwardIndexHandler fwdIndexHandler = new ForwardIndexHandler(segmentLocalFSDirectory, indexLoadingConfig, _schema); fwdIndexHandler.updateIndices(writer); fwdIndexHandler.postUpdateIndicesCleanup(writer); @@ -2164,8 +2191,8 @@ public void testEnableForwardIndexForInvertedIndexDisabledColumn() validateMetadataProperties(DIM_SV_FORWARD_INDEX_DISABLED_INTEGER_WITHOUT_INV_IDX, metadata.hasDictionary(), metadata.getColumnMaxLength(), metadata.getCardinality(), metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), - metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), - metadata.getMaxValue(), false); + metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), + false); } @Test @@ -2198,8 +2225,7 @@ public void testEnableForwardIndexForDictionaryDisabledColumns() validateIndexMap(DIM_RAW_SV_FORWARD_INDEX_DISABLED_INTEGER, false, true); validateIndexesForForwardIndexDisabledColumns(DIM_RAW_MV_FORWARD_INDEX_DISABLED_INTEGER); - ForwardIndexHandler fwdIndexHandler = - new ForwardIndexHandler(segmentLocalFSDirectory, indexLoadingConfig, _schema); + ForwardIndexHandler fwdIndexHandler = new ForwardIndexHandler(segmentLocalFSDirectory, indexLoadingConfig, _schema); fwdIndexHandler.updateIndices(writer); fwdIndexHandler.postUpdateIndicesCleanup(writer); @@ -2213,20 +2239,18 @@ 
public void testEnableForwardIndexForDictionaryDisabledColumns() validateIndexesForForwardIndexDisabledColumns(DIM_RAW_MV_FORWARD_INDEX_DISABLED_INTEGER); // In column metadata, nothing should change. - ColumnMetadata metadata = - existingSegmentMetadata.getColumnMetadataFor(DIM_RAW_SV_FORWARD_INDEX_DISABLED_INTEGER); + ColumnMetadata metadata = existingSegmentMetadata.getColumnMetadataFor(DIM_RAW_SV_FORWARD_INDEX_DISABLED_INTEGER); validateMetadataProperties(DIM_RAW_SV_FORWARD_INDEX_DISABLED_INTEGER, metadata.hasDictionary(), metadata.getColumnMaxLength(), metadata.getCardinality(), metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), - metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), - metadata.getMaxValue(), false); - metadata = - existingSegmentMetadata.getColumnMetadataFor(DIM_RAW_MV_FORWARD_INDEX_DISABLED_INTEGER); + metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), + false); + metadata = existingSegmentMetadata.getColumnMetadataFor(DIM_RAW_MV_FORWARD_INDEX_DISABLED_INTEGER); validateMetadataProperties(DIM_RAW_MV_FORWARD_INDEX_DISABLED_INTEGER, metadata.hasDictionary(), metadata.getColumnMaxLength(), metadata.getCardinality(), metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), - metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), - metadata.getMaxValue(), false); + metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), + false); } @Test @@ -2242,10 +2266,10 @@ public void testAddOtherIndexForForwardIndexDisabledColumn() // Add column to range index list. Must be a numerical type. String column; do { - column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS - .get(rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); - } while (!column.equals(DIM_MV_FORWARD_INDEX_DISABLED_DUPLICATES_STRING) - && !column.equals(DIM_MV_FORWARD_INDEX_DISABLED_DUPLICATES_BYTES)); + column = MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.get( + rand.nextInt(MV_FORWARD_INDEX_DISABLED_DUPLICATES_COLUMNS.size())); + } while (!column.equals(DIM_MV_FORWARD_INDEX_DISABLED_DUPLICATES_STRING) && !column.equals( + DIM_MV_FORWARD_INDEX_DISABLED_DUPLICATES_BYTES)); indexLoadingConfig.addRangeIndexColumns(column); RangeIndexHandler rangeIndexHandler = new RangeIndexHandler(segmentLocalFSDirectory, indexLoadingConfig); @@ -2271,10 +2295,10 @@ public void testAddOtherIndexForForwardIndexDisabledColumn() // In column metadata, some values can change since MV columns with duplicates lose the duplicates on forward index // regeneration. 
ColumnMetadata metadata = existingSegmentMetadata.getColumnMetadataFor(column); - validateMetadataProperties(column, true, 7, metadata.getCardinality(), - metadata.getTotalDocs(), metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), - metadata.isSingleValue(), metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), - metadata.isAutoGenerated(), metadata.getMinValue(), metadata.getMaxValue(), true); + validateMetadataProperties(column, true, 7, metadata.getCardinality(), metadata.getTotalDocs(), + metadata.getDataType(), metadata.getFieldType(), metadata.isSorted(), metadata.isSingleValue(), + metadata.getMaxNumberOfMultiValues(), metadata.getTotalNumberOfEntries(), metadata.isAutoGenerated(), + metadata.getMinValue(), metadata.getMaxValue(), true); // Validate that expected metadata properties don't match. totalNumberOfEntries will definitely not match since // duplicates will be removed, but maxNumberOfMultiValues may still match if the row with max multi-values didn't @@ -2332,34 +2356,24 @@ public void testDictionaryOverride() { IndexType index1 = Mockito.mock(IndexType.class); Mockito.when(index1.getId()).thenReturn("index1"); IndexConfig indexConf = new IndexConfig(true); - FieldIndexConfigs fieldIndexConfigs = new FieldIndexConfigs.Builder() - .add(index1, indexConf) - .build(); + FieldIndexConfigs fieldIndexConfigs = new FieldIndexConfigs.Builder().add(index1, indexConf).build(); // No need to disable dictionary - boolean result = DictionaryIndexType.ignoreDictionaryOverride(false, true, - 2, fieldSpec, - fieldIndexConfigs, 5, 20); + boolean result = DictionaryIndexType.ignoreDictionaryOverride(false, true, 2, fieldSpec, fieldIndexConfigs, 5, 20); Assert.assertEquals(result, true); // Set a higher noDictionarySizeRatioThreshold - result = DictionaryIndexType.ignoreDictionaryOverride(false, true, - 5, fieldSpec, - fieldIndexConfigs, 5, 20); + result = DictionaryIndexType.ignoreDictionaryOverride(false, true, 5, fieldSpec, fieldIndexConfigs, 5, 20); Assert.assertEquals(result, false); // optimizeDictionary and optimizeDictionaryForMetrics both turned on - result = DictionaryIndexType.ignoreDictionaryOverride(true, true, - 5, fieldSpec, - fieldIndexConfigs, 5, 20); + result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, fieldSpec, fieldIndexConfigs, 5, 20); Assert.assertEquals(result, false); // Don't ignore for Json. We want to disable dictionary for json. 
fieldSpec = new DimensionFieldSpec(); fieldSpec.setName("test"); fieldSpec.setDataType(FieldSpec.DataType.JSON); - result = DictionaryIndexType.ignoreDictionaryOverride(true, true, - 5, fieldSpec, - fieldIndexConfigs, 5, 20); + result = DictionaryIndexType.ignoreDictionaryOverride(true, true, 5, fieldSpec, fieldIndexConfigs, 5, 20); Assert.assertEquals(result, true); } @@ -2558,7 +2572,8 @@ private void validateForwardIndex(String columnName, @Nullable CompressionCodec } } - private void testIndexExists(String columnName, IndexType indexType) throws Exception { + private void testIndexExists(String columnName, IndexType indexType) + throws Exception { SegmentMetadataImpl existingSegmentMetadata = new SegmentMetadataImpl(_segmentDirectory); SegmentDirectory segmentLocalFSDirectory = new SegmentLocalFSDirectory(_segmentDirectory, existingSegmentMetadata, ReadMode.mmap); diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressionType.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressionType.java index 97d7057d03c..79c678c2609 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressionType.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressionType.java @@ -19,7 +19,7 @@ package org.apache.pinot.segment.spi.compression; public enum ChunkCompressionType { - PASS_THROUGH(0), SNAPPY(1), ZSTANDARD(2), LZ4(3), LZ4_LENGTH_PREFIXED(4); + PASS_THROUGH(0), SNAPPY(1), ZSTANDARD(2), LZ4(3), LZ4_LENGTH_PREFIXED(4), GZIP(5); private static final ChunkCompressionType[] VALUES = values(); diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressor.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressor.java index a6ab78c4ea8..4ce9ce82be5 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressor.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkCompressor.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.spi.compression; +import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; @@ -25,7 +26,7 @@ /** * Interface to compress a chunk of data. */ -public interface ChunkCompressor { +public interface ChunkCompressor extends Closeable { /** * This method compresses the given data. The output compressed ByteBuffer is returned ready for read. @@ -51,4 +52,9 @@ int compress(ByteBuffer inUncompressed, ByteBuffer outCompressed) * @return this compressor's type */ ChunkCompressionType compressionType(); + + @Override + default void close() throws IOException { + // no-op + } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkDecompressor.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkDecompressor.java index 2eeb33d6c97..b3f563bb442 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkDecompressor.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/ChunkDecompressor.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.spi.compression; +import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; @@ -25,7 +26,7 @@ /** * Interface to decompress a chunk of data. 
*/ -public interface ChunkDecompressor { +public interface ChunkDecompressor extends Closeable { /** * This method decompresses chunk of data that was compressed using {@link @@ -48,4 +49,9 @@ int decompress(ByteBuffer compressedInput, ByteBuffer decompressedOutput) */ int decompressedLength(ByteBuffer compressedInput) throws IOException; + + @Override + default void close() throws IOException { + // no-op + } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/FieldIndexConfigs.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/FieldIndexConfigs.java index 3d192aa6a1a..1351b35d96d 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/FieldIndexConfigs.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/FieldIndexConfigs.java @@ -57,10 +57,8 @@ private FieldIndexConfigs(Map configMap) { } public Map unwrapIndexes() { - Function, JsonNode> serializer = - entry -> entry.getValue().toJsonNode(); - return _configMap.entrySet().stream() - .filter(e -> e.getValue() != null) + Function, JsonNode> serializer = entry -> entry.getValue().toJsonNode(); + return _configMap.entrySet().stream().filter(e -> e.getValue() != null) .collect(Collectors.toMap(entry -> entry.getKey().getId(), serializer)); } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java index 132705036b3..70de007f8eb 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java @@ -74,6 +74,10 @@ public ForwardIndexConfig(@Nullable Boolean disabled, @Nullable CompressionCodec _chunkCompressionType = ChunkCompressionType.LZ4; _dictIdCompressionType = null; break; + case GZIP: + _chunkCompressionType = ChunkCompressionType.GZIP; + _dictIdCompressionType = null; + break; case MV_ENTRY_DICT: _dictIdCompressionType = DictIdCompressionType.MV_ENTRY_DICT; _chunkCompressionType = null; diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java index 704cb2e01c6..201edeb39aa 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java @@ -52,8 +52,8 @@ public class FieldConfig extends BaseJsonConfig { public static final String TEXT_INDEX_LUCENE_USE_COMPOUND_FILE = "luceneUseCompoundFile"; public static final String TEXT_INDEX_LUCENE_MAX_BUFFER_SIZE_MB = "luceneMaxBufferSizeMB"; public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS = "luceneAnalyzerClass"; - public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS - = "org.apache.lucene.analysis.standard.StandardAnalyzer"; + public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS = + "org.apache.lucene.analysis.standard.StandardAnalyzer"; public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ","; // "native" for native, default is Lucene public static final String TEXT_FST_TYPE = "fstType"; @@ -102,8 +102,8 @@ public FieldConfig(@JsonProperty(value = "name", required = true) String name, Preconditions.checkArgument(name != null, "'name' must be configured"); _name = name; _encodingType = encodingType == null ? 
EncodingType.DICTIONARY : encodingType; - _indexTypes = indexTypes != null ? indexTypes : ( - indexType == null ? Lists.newArrayList() : Lists.newArrayList(indexType)); + _indexTypes = + indexTypes != null ? indexTypes : (indexType == null ? Lists.newArrayList() : Lists.newArrayList(indexType)); _compressionCodec = compressionCodec; _timestampConfig = timestampConfig; _properties = properties; @@ -129,6 +129,7 @@ public enum CompressionCodec { // CLP is a special type of compression codec that isn't generally applicable to all RAW columns and has a // special handling for log lines (see {@link CLPForwardIndexCreatorV1}) CLP(false, false), + GZIP(true, false), // For MV dictionary encoded forward index, add a second level dictionary encoding for the multi-value entries MV_ENTRY_DICT(false, true); @@ -258,8 +259,8 @@ public Builder withTierOverwrites(JsonNode tierOverwrites) { } public FieldConfig build() { - return new FieldConfig(_name, _encodingType, null, _indexTypes, _compressionCodec, _timestampConfig, - _indexes, _properties, _tierOverwrites); + return new FieldConfig(_name, _encodingType, null, _indexTypes, _compressionCodec, _timestampConfig, _indexes, + _properties, _tierOverwrites); } } } From 174377df2a8fd5b3e4d28c4242b977b1694f3ef5 Mon Sep 17 00:00:00 2001 From: Saurabh Dubey Date: Fri, 29 Mar 2024 22:16:03 +0530 Subject: [PATCH 17/50] jsonExtractIndex support array of default values (#12748) --- .../JsonExtractIndexTransformFunction.java | 77 +++++++++++++++- ...JsonExtractIndexTransformFunctionTest.java | 90 +++++++++++++++---- 2 files changed, 151 insertions(+), 16 deletions(-) diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java index 12e38ea5d60..b499b7384c1 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java @@ -18,6 +18,8 @@ */ package org.apache.pinot.core.operator.transform.function; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; import java.math.BigDecimal; import java.util.List; import java.util.Map; @@ -27,6 +29,7 @@ import org.apache.pinot.core.operator.transform.TransformResultMetadata; import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.utils.JsonUtils; import org.roaringbitmap.RoaringBitmap; @@ -101,7 +104,24 @@ public void init(List arguments, Map c if (!(fourthArgument instanceof LiteralTransformFunction)) { throw new IllegalArgumentException("Default value must be a literal"); } - _defaultValue = dataType.convert(((LiteralTransformFunction) fourthArgument).getStringLiteral()); + + if (isSingleValue) { + _defaultValue = dataType.convert(((LiteralTransformFunction) fourthArgument).getStringLiteral()); + } else { + try { + JsonNode mvArray = JsonUtils.stringToJsonNode(((LiteralTransformFunction) fourthArgument).getStringLiteral()); + if (!mvArray.isArray()) { + throw new IllegalArgumentException("Default value must be a valid JSON array"); + } + Object[] defaultValues = new Object[mvArray.size()]; + for (int i = 0; i < mvArray.size(); i++) { + defaultValues[i] = dataType.convert(mvArray.get(i).asText()); + } + _defaultValue = defaultValues; + } catch 
(IOException e) { + throw new IllegalArgumentException("Default value must be a valid JSON array"); + } + } } String filterJsonPath = null; @@ -267,6 +287,17 @@ public int[][] transformToIntValuesMV(ValueBlock valueBlock) { for (int i = 0; i < numDocs; i++) { String[] value = valuesFromIndex[i]; + if (value.length == 0) { + if (_defaultValue != null) { + _intValuesMV[i] = new int[((Object[]) (_defaultValue)).length]; + for (int j = 0; j < _intValuesMV[i].length; j++) { + _intValuesMV[i][j] = (int) ((Object[]) _defaultValue)[j]; + } + continue; + } + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for docId [%s]", _jsonPathString, valueBlock.getDocIds()[i])); + } _intValuesMV[i] = new int[value.length]; for (int j = 0; j < value.length; j++) { _intValuesMV[i][j] = Integer.parseInt(value[j]); @@ -283,6 +314,17 @@ public long[][] transformToLongValuesMV(ValueBlock valueBlock) { _valueToMatchingDocsMap); for (int i = 0; i < numDocs; i++) { String[] value = valuesFromIndex[i]; + if (value.length == 0) { + if (_defaultValue != null) { + _longValuesMV[i] = new long[((Object[]) (_defaultValue)).length]; + for (int j = 0; j < _longValuesMV[i].length; j++) { + _longValuesMV[i][j] = (long) ((Object[]) _defaultValue)[j]; + } + continue; + } + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for docId [%s]", _jsonPathString, valueBlock.getDocIds()[i])); + } _longValuesMV[i] = new long[value.length]; for (int j = 0; j < value.length; j++) { _longValuesMV[i][j] = Long.parseLong(value[j]); @@ -299,6 +341,17 @@ public float[][] transformToFloatValuesMV(ValueBlock valueBlock) { _valueToMatchingDocsMap); for (int i = 0; i < numDocs; i++) { String[] value = valuesFromIndex[i]; + if (value.length == 0) { + if (_defaultValue != null) { + _floatValuesMV[i] = new float[((Object[]) (_defaultValue)).length]; + for (int j = 0; j < _floatValuesMV[i].length; j++) { + _floatValuesMV[i][j] = (float) ((Object[]) _defaultValue)[j]; + } + continue; + } + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for docId [%s]", _jsonPathString, valueBlock.getDocIds()[i])); + } _floatValuesMV[i] = new float[value.length]; for (int j = 0; j < value.length; j++) { _floatValuesMV[i][j] = Float.parseFloat(value[j]); @@ -315,6 +368,17 @@ public double[][] transformToDoubleValuesMV(ValueBlock valueBlock) { _valueToMatchingDocsMap); for (int i = 0; i < numDocs; i++) { String[] value = valuesFromIndex[i]; + if (value.length == 0) { + if (_defaultValue != null) { + _doubleValuesMV[i] = new double[((Object[]) (_defaultValue)).length]; + for (int j = 0; j < _doubleValuesMV[i].length; j++) { + _doubleValuesMV[i][j] = (double) ((Object[]) _defaultValue)[j]; + } + continue; + } + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for docId [%s]", _jsonPathString, valueBlock.getDocIds()[i])); + } _doubleValuesMV[i] = new double[value.length]; for (int j = 0; j < value.length; j++) { _doubleValuesMV[i][j] = Double.parseDouble(value[j]); @@ -331,6 +395,17 @@ public String[][] transformToStringValuesMV(ValueBlock valueBlock) { _valueToMatchingDocsMap); for (int i = 0; i < numDocs; i++) { String[] value = valuesFromIndex[i]; + if (value.length == 0) { + if (_defaultValue != null) { + _stringValuesMV[i] = new String[((Object[]) (_defaultValue)).length]; + for (int j = 0; j < _stringValuesMV[i].length; j++) { + _stringValuesMV[i][j] = (String) ((Object[]) _defaultValue)[j]; + } + continue; + } + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for 
docId [%s]", _jsonPathString, valueBlock.getDocIds()[i])); + } _stringValuesMV[i] = new String[value.length]; System.arraycopy(value, 0, _stringValuesMV[i], 0, value.length); } diff --git a/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java index d2cd7921077..7fcfb31b533 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunctionTest.java @@ -251,7 +251,7 @@ private void addMvTests(List testArguments) { // MV with filters testArguments.add(new Object[]{ String.format( - "jsonExtractIndex(%s,'%s','INT_ARRAY', '0', 'REGEXP_LIKE(\"$.arrayField[*].arrStringField\", ''.*y.*'')')", + "jsonExtractIndex(%s,'%s','INT_ARRAY', '[]', 'REGEXP_LIKE(\"$.arrayField[*].arrStringField\", ''.*y.*'')')", JSON_STRING_SV_COLUMN, "$.arrayField[*].arrIntField"), "$.arrayField[?(@.arrStringField =~ /.*y.*/)].arrIntField", DataType.INT, false @@ -259,7 +259,7 @@ private void addMvTests(List testArguments) { testArguments.add(new Object[]{ String.format( - "jsonExtractIndex(%s,'%s','STRING_ARRAY', '0', '\"$.arrayField[*].arrIntField\" > 2')", + "jsonExtractIndex(%s,'%s','STRING_ARRAY', '[]', '\"$.arrayField[*].arrIntField\" > 2')", JSON_STRING_SV_COLUMN, "$.arrayField[*].arrStringField"), "$.arrayField[?(@.arrIntField > 2)].arrStringField", DataType.STRING, false @@ -268,7 +268,7 @@ private void addMvTests(List testArguments) { @Test(dataProvider = "testJsonExtractIndexDefaultValue") public void testJsonExtractIndexDefaultValue(String expressionStr, String jsonPathString, DataType resultsDataType, - boolean isSingleValue) { + boolean isSingleValue, Object expectedDefaultValue) { ExpressionContext expression = RequestContextUtils.getExpression(expressionStr); TransformFunction transformFunction = TransformFunctionFactory.get(expression, _dataSourceMap); Assert.assertTrue(transformFunction instanceof JsonExtractIndexTransformFunction); @@ -281,37 +281,72 @@ public void testJsonExtractIndexDefaultValue(String expressionStr, String jsonPa case INT: int[] intValues = transformFunction.transformToIntValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(intValues[i], 0); + Assert.assertEquals(intValues[i], expectedDefaultValue); } break; case LONG: long[] longValues = transformFunction.transformToLongValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(longValues[i], 0L); + Assert.assertEquals(longValues[i], expectedDefaultValue); } break; case FLOAT: float[] floatValues = transformFunction.transformToFloatValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(floatValues[i], 0f); + Assert.assertEquals(floatValues[i], expectedDefaultValue); } break; case DOUBLE: double[] doubleValues = transformFunction.transformToDoubleValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(doubleValues[i], 0d); + Assert.assertEquals(doubleValues[i], expectedDefaultValue); } break; case BIG_DECIMAL: BigDecimal[] bigDecimalValues = transformFunction.transformToBigDecimalValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(bigDecimalValues[i], BigDecimal.ZERO); + Assert.assertEquals(bigDecimalValues[i], 
expectedDefaultValue); } break; case STRING: String[] stringValues = transformFunction.transformToStringValuesSV(_projectionBlock); for (int i = 0; i < NUM_ROWS; i++) { - Assert.assertEquals(stringValues[i], "null"); + Assert.assertEquals(stringValues[i], expectedDefaultValue); + } + break; + default: + throw new UnsupportedOperationException("Not support data type - " + resultsDataType); + } + } else { + switch (resultsDataType) { + case INT: + int[][] intValues = transformFunction.transformToIntValuesMV(_projectionBlock); + for (int i = 0; i < NUM_ROWS; i++) { + Assert.assertEquals(intValues[i], expectedDefaultValue); + } + break; + case LONG: + long[][] longValues = transformFunction.transformToLongValuesMV(_projectionBlock); + for (int i = 0; i < NUM_ROWS; i++) { + Assert.assertEquals(longValues[i], expectedDefaultValue); + } + break; + case FLOAT: + float[][] floatValues = transformFunction.transformToFloatValuesMV(_projectionBlock); + for (int i = 0; i < NUM_ROWS; i++) { + Assert.assertEquals(floatValues[i], expectedDefaultValue); + } + break; + case DOUBLE: + double[][] doubleValues = transformFunction.transformToDoubleValuesMV(_projectionBlock); + for (int i = 0; i < NUM_ROWS; i++) { + Assert.assertEquals(doubleValues[i], expectedDefaultValue); + } + break; + case STRING: + String[][] stringValues = transformFunction.transformToStringValuesMV(_projectionBlock); + for (int i = 0; i < NUM_ROWS; i++) { + Assert.assertEquals(stringValues[i], expectedDefaultValue); } break; default: @@ -326,31 +361,56 @@ public Object[][] testJsonExtractIndexDefaultValueDataProvider() { // With default value testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','INT',0)", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.INT, true + "$.noField"), "$.noField", DataType.INT, true, 0 }); testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','LONG',0)", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.LONG, true + "$.noField"), "$.noField", DataType.LONG, true, 0L }); testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','FLOAT',0)", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.FLOAT, true + "$.noField"), "$.noField", DataType.FLOAT, true, (float) 0 }); testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','DOUBLE',0)", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.DOUBLE, true + "$.noField"), "$.noField", DataType.DOUBLE, true, (double) 0 }); testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','BIG_DECIMAL',0)", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.BIG_DECIMAL, true + "$.noField"), "$.noField", DataType.BIG_DECIMAL, true, new BigDecimal(0) }); testArguments.add(new Object[]{ String.format("jsonExtractIndex(%s,'%s','STRING','null')", JSON_STRING_SV_COLUMN, - "$.noField"), "$.noField", DataType.STRING, true + "$.noField"), "$.noField", DataType.STRING, true, "null" }); + addMvDefaultValueTests(testArguments); return testArguments.toArray(new Object[0][]); } + private void addMvDefaultValueTests(List testArguments) { + testArguments.add(new Object[]{ + String.format("jsonExtractIndex(%s,'%s','INT_ARRAY', '%s')", JSON_STRING_SV_COLUMN, "$.noField", + "[1, 2, 3]"), "$.noField", DataType.INT, false, new Integer[]{1, 2, 3} + }); + testArguments.add(new Object[]{ + String.format("jsonExtractIndex(%s,'%s','LONG_ARRAY', '%s')", JSON_STRING_SV_COLUMN, "$.noField", + "[1, 5, 6]"), "$.noField", DataType.LONG, false, new Long[]{1L, 5L, 6L} + }); + 
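// A minimal usage sketch (column and table names here are hypothetical, not part of the patch): with an
// array-typed default like the ones registered above, a query of the form
//   SELECT jsonExtractIndex(jsonCol, '$.missingPath', 'LONG_ARRAY', '[1, 5, 6]') FROM myTable
// is expected to return the parsed default array [1, 5, 6] for rows where the JSON path yields no values,
// instead of the "Illegal Json Path" RuntimeException thrown when no default value is supplied.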
testArguments.add(new Object[]{ + String.format("jsonExtractIndex(%s,'%s','FLOAT_ARRAY', '%s')", JSON_STRING_SV_COLUMN, "$.noField", + "[1.2, 3.1, 1.6]"), "$.noField", DataType.FLOAT, false, new Float[]{1.2f, 3.1f, 1.6f} + }); + testArguments.add(new Object[]{ + String.format("jsonExtractIndex(%s,'%s','DOUBLE_ARRAY', '%s')", JSON_STRING_SV_COLUMN, "$.noField", + "[1.5, 3.4, 1.6]"), "$.noField", DataType.DOUBLE, false, new Double[]{1.5d, 3.4d, 1.6d} + }); + testArguments.add(new Object[]{ + String.format("jsonExtractIndex(%s,'%s','STRING_ARRAY', '%s')", JSON_STRING_SV_COLUMN, "$.noField", + "[\"randomString1\", \"randomString2\"]"), "$.noField", DataType.STRING, false, + new String[]{"randomString1", "randomString2"} + }); + } + // get value for key, excluding nested private String getValueForKey(String blob, JsonPath path) { Object out = JSON_PARSER_CONTEXT.parse(blob).read(path); From 28aec2e01cedfcfaf47e83f41f58f810ac153b31 Mon Sep 17 00:00:00 2001 From: Vivek Iyer Vaidyanathan Date: Sun, 31 Mar 2024 18:54:27 -0700 Subject: [PATCH 18/50] Add some additional metrics for Minion tasks (#12710) * Add some additional metrics for Minion tasks * Address review comments --- .../pinot/common/metrics/MinionMeter.java | 8 ++++- .../pinot/core/minion/SegmentPurger.java | 8 +++-- ...aseMultipleSegmentsConversionExecutor.java | 13 +++++++- .../BaseSingleSegmentConversionExecutor.java | 22 +++++++++++-- .../plugin/minion/tasks/BaseTaskExecutor.java | 33 +++++++++++++++++++ 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/MinionMeter.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/MinionMeter.java index 376f86e55e5..c85aad39eda 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/MinionMeter.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/MinionMeter.java @@ -31,7 +31,13 @@ public enum MinionMeter implements AbstractMetrics.Meter { NUMBER_TASKS_FAILED("tasks", false), NUMBER_TASKS_FATAL_FAILED("tasks", false), SEGMENT_UPLOAD_FAIL_COUNT("segments", false), - SEGMENT_DOWNLOAD_FAIL_COUNT("segments", false); + SEGMENT_DOWNLOAD_FAIL_COUNT("segments", false), + SEGMENT_DOWNLOAD_COUNT("segments", false), + SEGMENT_UPLOAD_COUNT("segments", false), + SEGMENT_BYTES_DOWNLOADED("bytes", false), + SEGMENT_BYTES_UPLOADED("bytes", false), + RECORDS_PROCESSED_COUNT("rows", false), + RECORDS_PURGED_COUNT("rows", false); private final String _meterName; private final String _unit; diff --git a/pinot-core/src/main/java/org/apache/pinot/core/minion/SegmentPurger.java b/pinot-core/src/main/java/org/apache/pinot/core/minion/SegmentPurger.java index 4faf6955220..2ab65bbe9c3 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/minion/SegmentPurger.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/minion/SegmentPurger.java @@ -70,7 +70,8 @@ public File purgeSegment() throws Exception { SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(_indexDir); String segmentName = segmentMetadata.getName(); - LOGGER.info("Start purging table: {}, segment: {}", _tableConfig.getTableName(), segmentName); + String tableNameWithType = _tableConfig.getTableName(); + LOGGER.info("Start purging table: {}, segment: {}", tableNameWithType, segmentName); try (PurgeRecordReader purgeRecordReader = new PurgeRecordReader()) { // Make a first pass through the data to see if records need to be purged or modified @@ -107,8 +108,9 @@ public File purgeSegment() driver.build(); } - LOGGER.info("Finish 
purging table: {}, segment: {}, purged {} records, modified {} records", - _tableConfig.getTableName(), segmentName, _numRecordsPurged, _numRecordsModified); + LOGGER.info("Finish purging table: {}, segment: {}, purged {} records, modified {} records", tableNameWithType, + segmentName, _numRecordsPurged, _numRecordsModified); + return new File(_workingDir, segmentName); } diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseMultipleSegmentsConversionExecutor.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseMultipleSegmentsConversionExecutor.java index 6b439add13f..e7ef8a4eea6 100644 --- a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseMultipleSegmentsConversionExecutor.java +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseMultipleSegmentsConversionExecutor.java @@ -51,6 +51,7 @@ import org.apache.pinot.minion.event.MinionEventObservers; import org.apache.pinot.minion.exception.TaskCancelledException; import org.apache.pinot.segment.local.utils.SegmentPushUtils; +import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; import org.apache.pinot.spi.auth.AuthProvider; import org.apache.pinot.spi.config.table.TableType; import org.apache.pinot.spi.filesystem.PinotFS; @@ -192,6 +193,8 @@ public List executeTask(PinotTaskConfig pinotTaskConfig String crypterName = getTableConfig(tableNameWithType).getValidationConfig().getCrypterClassName(); try { List inputSegmentDirs = new ArrayList<>(); + int numRecords = 0; + for (int i = 0; i < downloadURLs.length; i++) { // Download the segment file _eventObserver.notifyProgress(_pinotTaskConfig, String @@ -209,6 +212,10 @@ public List executeTask(PinotTaskConfig pinotTaskConfig if (!FileUtils.deleteQuietly(tarredSegmentFile)) { LOGGER.warn("Failed to delete tarred input segment: {}", tarredSegmentFile.getAbsolutePath()); } + + reportSegmentDownloadMetrics(indexDir, tableNameWithType, taskType); + SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(indexDir); + numRecords += segmentMetadata.getTotalDocs(); } // Convert the segments @@ -216,6 +223,8 @@ public List executeTask(PinotTaskConfig pinotTaskConfig Preconditions.checkState(workingDir.mkdir()); List segmentConversionResults = convert(pinotTaskConfig, inputSegmentDirs, workingDir); + reportTaskProcessingMetrics(tableNameWithType, taskType, numRecords); + // Create a directory for converted tarred segment files File convertedTarredSegmentDir = new File(tempDataDir, "convertedTarredSegmentDir"); Preconditions.checkState(convertedTarredSegmentDir.mkdir()); @@ -224,11 +233,13 @@ public List executeTask(PinotTaskConfig pinotTaskConfig List tarredSegmentFiles = new ArrayList<>(numOutputSegments); int count = 1; for (SegmentConversionResult segmentConversionResult : segmentConversionResults) { + File convertedSegmentDir = segmentConversionResult.getFile(); + reportSegmentUploadMetrics(convertedSegmentDir, tableNameWithType, taskType); + // Tar the converted segment _eventObserver.notifyProgress(_pinotTaskConfig, String .format("Compressing segment: %s (%d out of %d)", segmentConversionResult.getSegmentName(), count++, numOutputSegments)); - File convertedSegmentDir = segmentConversionResult.getFile(); File convertedSegmentTarFile = new File(convertedTarredSegmentDir, 
segmentConversionResult.getSegmentName() + TarGzCompressionUtils.TAR_GZ_FILE_EXTENSION); TarGzCompressionUtils.createTarGzFile(convertedSegmentDir, convertedSegmentTarFile); diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseSingleSegmentConversionExecutor.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseSingleSegmentConversionExecutor.java index 22337ada6bd..a920817ae99 100644 --- a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseSingleSegmentConversionExecutor.java +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseSingleSegmentConversionExecutor.java @@ -36,7 +36,6 @@ import org.apache.pinot.common.auth.AuthProviderUtils; import org.apache.pinot.common.metadata.segment.SegmentZKMetadataCustomMapModifier; import org.apache.pinot.common.metrics.MinionMeter; -import org.apache.pinot.common.metrics.MinionMetrics; import org.apache.pinot.common.utils.FileUploadDownloadClient; import org.apache.pinot.common.utils.TarGzCompressionUtils; import org.apache.pinot.common.utils.fetcher.SegmentFetcherFactory; @@ -45,6 +44,8 @@ import org.apache.pinot.minion.event.MinionEventObserver; import org.apache.pinot.minion.event.MinionEventObservers; import org.apache.pinot.minion.exception.TaskCancelledException; +import org.apache.pinot.plugin.minion.tasks.purge.PurgeTaskExecutor; +import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl; import org.apache.pinot.spi.auth.AuthProvider; import org.apache.pinot.spi.utils.builder.TableNameBuilder; import org.slf4j.Logger; @@ -60,8 +61,6 @@ public abstract class BaseSingleSegmentConversionExecutor extends BaseTaskExecutor { private static final Logger LOGGER = LoggerFactory.getLogger(BaseSingleSegmentConversionExecutor.class); - protected final MinionMetrics _minionMetrics = MinionMetrics.get(); - // Tracking finer grained progress status. protected PinotTaskConfig _pinotTaskConfig; protected MinionEventObserver _eventObserver; @@ -123,6 +122,9 @@ public SegmentConversionResult executeTask(PinotTaskConfig pinotTaskConfig) LOGGER.warn("Failed to delete tarred input segment: {}", tarredSegmentFile.getAbsolutePath()); } + // Publish metrics related to segment download + reportSegmentDownloadMetrics(indexDir, tableNameWithType, taskType); + // Convert the segment File workingDir = new File(tempDataDir, "workingDir"); Preconditions.checkState(workingDir.mkdir()); @@ -135,6 +137,20 @@ public SegmentConversionResult executeTask(PinotTaskConfig pinotTaskConfig) if (convertedSegmentDir == null) { return segmentConversionResult; } + + // Publish metrics related to segment upload + reportSegmentUploadMetrics(workingDir, tableNameWithType, taskType); + + // Collect the task processing metrics from various single segment executors and publish them here. 
+ SegmentMetadataImpl segmentMetadata = new SegmentMetadataImpl(indexDir); + Object numRecordsPurged = segmentConversionResult.getCustomProperty(PurgeTaskExecutor.NUM_RECORDS_PURGED_KEY); + if (numRecordsPurged != null) { + reportTaskProcessingMetrics(tableNameWithType, taskType, segmentMetadata.getTotalDocs(), + (int) numRecordsPurged); + } else { + reportTaskProcessingMetrics(tableNameWithType, taskType, segmentMetadata.getTotalDocs()); + } + // Tar the converted segment _eventObserver.notifyProgress(_pinotTaskConfig, "Compressing segment: " + segmentName); File convertedTarredSegmentFile = diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseTaskExecutor.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseTaskExecutor.java index d85bf447375..2b57bbb8b4d 100644 --- a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseTaskExecutor.java +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/BaseTaskExecutor.java @@ -19,9 +19,13 @@ package org.apache.pinot.plugin.minion.tasks; import com.google.common.base.Preconditions; +import java.io.File; +import org.apache.commons.io.FileUtils; import org.apache.pinot.common.metadata.ZKMetadataProvider; import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; import org.apache.pinot.common.metadata.segment.SegmentZKMetadataCustomMapModifier; +import org.apache.pinot.common.metrics.MinionMeter; +import org.apache.pinot.common.metrics.MinionMetrics; import org.apache.pinot.core.minion.PinotTaskConfig; import org.apache.pinot.minion.MinionContext; import org.apache.pinot.minion.executor.PinotTaskExecutor; @@ -33,6 +37,7 @@ public abstract class BaseTaskExecutor implements PinotTaskExecutor { protected static final MinionContext MINION_CONTEXT = MinionContext.getInstance(); protected boolean _cancelled = false; + protected final MinionMetrics _minionMetrics = MinionMetrics.get(); @Override public void cancel() { @@ -68,4 +73,32 @@ protected long getSegmentCrc(String tableNameWithType, String segmentName) { */ return segmentZKMetadata == null ? 
-1 : segmentZKMetadata.getCrc(); } + + protected void reportSegmentDownloadMetrics(File indexDir, String tableNameWithType, String taskType) { + long downloadSegmentSize = FileUtils.sizeOfDirectory(indexDir); + addTaskMeterMetrics(MinionMeter.SEGMENT_BYTES_DOWNLOADED, downloadSegmentSize, tableNameWithType, taskType); + addTaskMeterMetrics(MinionMeter.SEGMENT_DOWNLOAD_COUNT, 1L, tableNameWithType, taskType); + } + + protected void reportSegmentUploadMetrics(File indexDir, String tableNameWithType, String taskType) { + long uploadSegmentSize = FileUtils.sizeOfDirectory(indexDir); + addTaskMeterMetrics(MinionMeter.SEGMENT_BYTES_UPLOADED, uploadSegmentSize, tableNameWithType, taskType); + addTaskMeterMetrics(MinionMeter.SEGMENT_UPLOAD_COUNT, 1L, tableNameWithType, taskType); + } + + protected void reportTaskProcessingMetrics(String tableNameWithType, String taskType, int numRecordsProcessed, + int numRecordsPurged) { + reportTaskProcessingMetrics(tableNameWithType, taskType, numRecordsProcessed); + addTaskMeterMetrics(MinionMeter.RECORDS_PURGED_COUNT, numRecordsPurged, tableNameWithType, taskType); + } + + protected void reportTaskProcessingMetrics(String tableNameWithType, String taskType, int numRecordsProcessed) { + addTaskMeterMetrics(MinionMeter.RECORDS_PROCESSED_COUNT, numRecordsProcessed, tableNameWithType, taskType); + } + + private void addTaskMeterMetrics(MinionMeter meter, long unitCount, String tableName, String taskType) { + _minionMetrics.addMeteredGlobalValue(meter, unitCount); + _minionMetrics.addMeteredTableValue(tableName, meter, unitCount); + _minionMetrics.addMeteredTableValue(tableName, taskType, meter, unitCount); + } } From 3185e303f42e02bf3df80c7ddf16360e0fe2b197 Mon Sep 17 00:00:00 2001 From: soumitra-st <127247229+soumitra-st@users.noreply.github.com> Date: Mon, 1 Apr 2024 01:44:50 -0700 Subject: [PATCH 19/50] Improved null check for varargs (#12673) * Improved null check for varargs * Fixed the null check for varargs to not check the null inside the array * Filter out null values from varargs --- .../src/main/java/org/apache/pinot/client/BrokerCache.java | 6 +++++- .../apache/pinot/common/function/scalar/ArrayFunctions.java | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pinot-clients/pinot-java-client/src/main/java/org/apache/pinot/client/BrokerCache.java b/pinot-clients/pinot-java-client/src/main/java/org/apache/pinot/client/BrokerCache.java index 759dd32084f..6742174582d 100644 --- a/pinot-clients/pinot-java-client/src/main/java/org/apache/pinot/client/BrokerCache.java +++ b/pinot-clients/pinot-java-client/src/main/java/org/apache/pinot/client/BrokerCache.java @@ -28,6 +28,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Properties; import java.util.Random; import java.util.Set; @@ -190,7 +191,10 @@ protected void updateBrokerData() public String getBroker(String... tableNames) { List brokers = null; - if (tableNames != null) { + // If tableNames is not-null, filter out nulls + tableNames = + tableNames == null ? tableNames : Arrays.stream(tableNames).filter(Objects::nonNull).toArray(String[]::new); + if (!(tableNames == null || tableNames.length == 0)) { // returning list of common brokers hosting all the tables. 
brokers = BrokerSelectorUtils.getTablesCommonBrokers(Arrays.asList(tableNames), _brokerData.getTableToBrokerMap()); diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/ArrayFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/ArrayFunctions.java index 32f115b51a7..53e6bc76c25 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/ArrayFunctions.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/ArrayFunctions.java @@ -230,7 +230,7 @@ public static String arrayElementAtString(String[] arr, int idx) { @ScalarFunction(names = {"array", "arrayValueConstructor"}, isVarArg = true) public static Object arrayValueConstructor(Object... arr) { - if (arr.length == 0) { + if (arr == null || arr.length == 0) { return arr; } Class clazz = arr[0].getClass(); From c15c3912cb49a77476cbe84c113f68b201318c68 Mon Sep 17 00:00:00 2001 From: Gonzalo Ortiz Jaureguizar Date: Mon, 1 Apr 2024 10:45:05 +0200 Subject: [PATCH 20/50] Percentile operations supporting null (#12271) * new test framework candidate * Improved test system * Improve framework to be able to specify segments as strings * fix headers * Improve assertions when there are nulls * Improve error text * Improvements in the framework * Add a base class single input aggregation operations can extend to support null handling * Fix issue in NullableSingleInputAggregationFunction.forEachNotNullInt * Improve error message in NullEnabledQueriesTest * Add new schema family * Rename test schemas and table config * Split AllNullQueriesTest into on test per query * Revert change in AllNullQueriesTest that belongs to mode-null-support branch * Add tests * Fix issue in bytes in aggregation case * Update to the new framework * Fix some tests * rollback a code style change --- .../function/AggregationFunctionFactory.java | 37 +- ...ullableSingleInputAggregationFunction.java | 9 + .../PercentileAggregationFunction.java | 57 +-- .../PercentileEstAggregationFunction.java | 108 +++--- .../PercentileEstMVAggregationFunction.java | 4 +- .../PercentileKLLAggregationFunction.java | 69 ++-- .../PercentileKLLMVAggregationFunction.java | 2 +- .../PercentileMVAggregationFunction.java | 4 +- .../PercentileRawEstAggregationFunction.java | 10 +- .../PercentileRawKLLAggregationFunction.java | 4 +- ...rcentileRawTDigestAggregationFunction.java | 17 +- ...entileSmartTDigestAggregationFunction.java | 96 +++-- .../PercentileTDigestAggregationFunction.java | 110 +++--- ...ercentileTDigestMVAggregationFunction.java | 6 +- ...ractPercentileAggregationFunctionTest.java | 333 ++++++++++++++++++ .../PercentileAggregationFunctionTest.java | 27 ++ .../PercentileEstAggregationFunctionTest.java | 45 +++ .../PercentileKLLAggregationFunctionTest.java | 47 +++ ...leSmartTDigestAggregationFunctionTest.java | 87 +++++ .../apache/pinot/queries/FluentQueryTest.java | 2 +- 20 files changed, 861 insertions(+), 213 deletions(-) create mode 100644 pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/AbstractPercentileAggregationFunctionTest.java create mode 100644 pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunctionTest.java create mode 100644 pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunctionTest.java create mode 100644 pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunctionTest.java create mode 100644 
pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/AggregationFunctionFactory.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/AggregationFunctionFactory.java index eeed8608a4e..a82d421ebc9 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/AggregationFunctionFactory.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/AggregationFunctionFactory.java @@ -61,16 +61,16 @@ public static AggregationFunction getAggregationFunction(FunctionContext functio if (upperCaseFunctionName.startsWith("PERCENTILE")) { String remainingFunctionName = upperCaseFunctionName.substring(10); if (remainingFunctionName.equals("SMARTTDIGEST")) { - return new PercentileSmartTDigestAggregationFunction(arguments); + return new PercentileSmartTDigestAggregationFunction(arguments, nullHandlingEnabled); } if (remainingFunctionName.equals("KLL")) { - return new PercentileKLLAggregationFunction(arguments); + return new PercentileKLLAggregationFunction(arguments, nullHandlingEnabled); } if (remainingFunctionName.equals("KLLMV")) { return new PercentileKLLMVAggregationFunction(arguments); } if (remainingFunctionName.equals("RAWKLL")) { - return new PercentileRawKLLAggregationFunction(arguments); + return new PercentileRawKLLAggregationFunction(arguments, nullHandlingEnabled); } if (remainingFunctionName.equals("RAWKLLMV")) { return new PercentileRawKLLMVAggregationFunction(arguments); @@ -80,23 +80,28 @@ public static AggregationFunction getAggregationFunction(FunctionContext functio // NOTE: This convention is deprecated. DO NOT add new functions here if (remainingFunctionName.matches("\\d+")) { // Percentile - return new PercentileAggregationFunction(firstArgument, parsePercentileToInt(remainingFunctionName)); + return new PercentileAggregationFunction(firstArgument, parsePercentileToInt(remainingFunctionName), + nullHandlingEnabled); } else if (remainingFunctionName.matches("EST\\d+")) { // PercentileEst String percentileString = remainingFunctionName.substring(3); - return new PercentileEstAggregationFunction(firstArgument, parsePercentileToInt(percentileString)); + return new PercentileEstAggregationFunction(firstArgument, parsePercentileToInt(percentileString), + nullHandlingEnabled); } else if (remainingFunctionName.matches("RAWEST\\d+")) { // PercentileRawEst String percentileString = remainingFunctionName.substring(6); - return new PercentileRawEstAggregationFunction(firstArgument, parsePercentileToInt(percentileString)); + return new PercentileRawEstAggregationFunction(firstArgument, parsePercentileToInt(percentileString), + nullHandlingEnabled); } else if (remainingFunctionName.matches("TDIGEST\\d+")) { // PercentileTDigest String percentileString = remainingFunctionName.substring(7); - return new PercentileTDigestAggregationFunction(firstArgument, parsePercentileToInt(percentileString)); + return new PercentileTDigestAggregationFunction(firstArgument, parsePercentileToInt(percentileString), + nullHandlingEnabled); } else if (remainingFunctionName.matches("RAWTDIGEST\\d+")) { // PercentileRawTDigest String percentileString = remainingFunctionName.substring(10); - return new PercentileRawTDigestAggregationFunction(firstArgument, parsePercentileToInt(percentileString)); + return new PercentileRawTDigestAggregationFunction(firstArgument, 
parsePercentileToInt(percentileString), + nullHandlingEnabled); } else if (remainingFunctionName.matches("\\d+MV")) { // PercentileMV String percentileString = remainingFunctionName.substring(0, remainingFunctionName.length() - 2); @@ -125,23 +130,23 @@ public static AggregationFunction getAggregationFunction(FunctionContext functio Preconditions.checkArgument(percentile >= 0 && percentile <= 100, "Invalid percentile: %s", percentile); if (remainingFunctionName.isEmpty()) { // Percentile - return new PercentileAggregationFunction(firstArgument, percentile); + return new PercentileAggregationFunction(firstArgument, percentile, nullHandlingEnabled); } if (remainingFunctionName.equals("EST")) { // PercentileEst - return new PercentileEstAggregationFunction(firstArgument, percentile); + return new PercentileEstAggregationFunction(firstArgument, percentile, nullHandlingEnabled); } if (remainingFunctionName.equals("RAWEST")) { // PercentileRawEst - return new PercentileRawEstAggregationFunction(firstArgument, percentile); + return new PercentileRawEstAggregationFunction(firstArgument, percentile, nullHandlingEnabled); } if (remainingFunctionName.equals("TDIGEST")) { // PercentileTDigest - return new PercentileTDigestAggregationFunction(firstArgument, percentile); + return new PercentileTDigestAggregationFunction(firstArgument, percentile, nullHandlingEnabled); } if (remainingFunctionName.equals("RAWTDIGEST")) { // PercentileRawTDigest - return new PercentileRawTDigestAggregationFunction(firstArgument, percentile); + return new PercentileRawTDigestAggregationFunction(firstArgument, percentile, nullHandlingEnabled); } if (remainingFunctionName.equals("MV")) { // PercentileMV @@ -175,11 +180,13 @@ public static AggregationFunction getAggregationFunction(FunctionContext functio Preconditions.checkArgument(compressionFactor >= 0, "Invalid compressionFactor: %d", compressionFactor); if (remainingFunctionName.equals("TDIGEST")) { // PercentileTDigest - return new PercentileTDigestAggregationFunction(firstArgument, percentile, compressionFactor); + return new PercentileTDigestAggregationFunction(firstArgument, percentile, compressionFactor, + nullHandlingEnabled); } if (remainingFunctionName.equals("RAWTDIGEST")) { // PercentileRawTDigest - return new PercentileRawTDigestAggregationFunction(firstArgument, percentile, compressionFactor); + return new PercentileRawTDigestAggregationFunction(firstArgument, percentile, compressionFactor, + nullHandlingEnabled); } if (remainingFunctionName.equals("TDIGESTMV")) { // PercentileTDigestMV diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/NullableSingleInputAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/NullableSingleInputAggregationFunction.java index 78f1ae12696..907f0139d2a 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/NullableSingleInputAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/NullableSingleInputAggregationFunction.java @@ -103,6 +103,15 @@ public void forEachNotNull(int length, IntIterator nullIndexIterator, BatchConsu } } + /** + * Folds over the non-null ranges of the blockValSet using the reducer. 
+ * @param initialAcum the initial value of the accumulator + * @param The type of the accumulator + */ + public A foldNotNull(int length, BlockValSet blockValSet, A initialAcum, Reducer reducer) { + return foldNotNull(length, blockValSet.getNullBitmap(), initialAcum, reducer); + } + /** * Folds over the non-null ranges of the blockValSet using the reducer. * @param initialAcum the initial value of the accumulator diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunction.java index 5d227caeada..c9c71744d26 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunction.java @@ -31,7 +31,7 @@ import org.apache.pinot.segment.spi.AggregationFunctionType; -public class PercentileAggregationFunction extends BaseSingleInputAggregationFunction { +public class PercentileAggregationFunction extends NullableSingleInputAggregationFunction { private static final double DEFAULT_FINAL_RESULT = Double.NEGATIVE_INFINITY; //version 0 functions specified in the of form PERCENTILE<2-digits>(column) @@ -39,14 +39,14 @@ public class PercentileAggregationFunction extends BaseSingleInputAggregationFun protected final int _version; protected final double _percentile; - public PercentileAggregationFunction(ExpressionContext expression, int percentile) { - super(expression); + public PercentileAggregationFunction(ExpressionContext expression, int percentile, boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 0; _percentile = percentile; } - public PercentileAggregationFunction(ExpressionContext expression, double percentile) { - super(expression); + public PercentileAggregationFunction(ExpressionContext expression, double percentile, boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 1; _percentile = percentile; } @@ -77,33 +77,42 @@ public GroupByResultHolder createGroupByResultHolder(int initialCapacity, int ma public void aggregate(int length, AggregationResultHolder aggregationResultHolder, Map blockValSetMap) { DoubleArrayList valueList = getValueList(aggregationResultHolder); - double[] valueArray = blockValSetMap.get(_expression).getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - valueList.add(valueArray[i]); - } + BlockValSet blockValSet = blockValSetMap.get(_expression); + double[] valueArray = blockValSet.getDoubleValuesSV(); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + valueList.add(valueArray[i]); + } + }); } @Override public void aggregateGroupBySV(int length, int[] groupKeyArray, GroupByResultHolder groupByResultHolder, Map blockValSetMap) { - double[] valueArray = blockValSetMap.get(_expression).getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); - valueList.add(valueArray[i]); - } + BlockValSet blockValSet = blockValSetMap.get(_expression); + double[] valueArray = blockValSet.getDoubleValuesSV(); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); + valueList.add(valueArray[i]); + } + }); } @Override public void aggregateGroupByMV(int length, 
int[][] groupKeysArray, GroupByResultHolder groupByResultHolder, Map blockValSetMap) { - double[] valueArray = blockValSetMap.get(_expression).getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - double value = valueArray[i]; - for (int groupKey : groupKeysArray[i]) { - DoubleArrayList valueList = getValueList(groupByResultHolder, groupKey); - valueList.add(value); + BlockValSet blockValSet = blockValSetMap.get(_expression); + double[] valueArray = blockValSet.getDoubleValuesSV(); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + double value = valueArray[i]; + for (int groupKey : groupKeysArray[i]) { + DoubleArrayList valueList = getValueList(groupByResultHolder, groupKey); + valueList.add(value); + } } - } + }); } @Override @@ -146,7 +155,11 @@ public ColumnDataType getFinalResultColumnType() { public Double extractFinalResult(DoubleArrayList intermediateResult) { int size = intermediateResult.size(); if (size == 0) { - return DEFAULT_FINAL_RESULT; + if (_nullHandlingEnabled) { + return null; + } else { + return DEFAULT_FINAL_RESULT; + } } else { double[] values = intermediateResult.elements(); Arrays.sort(values, 0, size); diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunction.java index d055e465054..e67a3f7d650 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunction.java @@ -32,7 +32,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType; -public class PercentileEstAggregationFunction extends BaseSingleInputAggregationFunction { +public class PercentileEstAggregationFunction extends NullableSingleInputAggregationFunction { public static final double DEFAULT_MAX_ERROR = 0.05; //version 0 functions specified in the of form PERCENTILEEST<2-digits>(column) @@ -40,14 +40,15 @@ public class PercentileEstAggregationFunction extends BaseSingleInputAggregation protected final int _version; protected final double _percentile; - public PercentileEstAggregationFunction(ExpressionContext expression, int percentile) { - super(expression); + public PercentileEstAggregationFunction(ExpressionContext expression, int percentile, boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 0; _percentile = percentile; } - public PercentileEstAggregationFunction(ExpressionContext expression, double percentile) { - super(expression); + public PercentileEstAggregationFunction(ExpressionContext expression, double percentile, + boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 1; _percentile = percentile; } @@ -81,24 +82,30 @@ public void aggregate(int length, AggregationResultHolder aggregationResultHolde if (blockValSet.getValueType() != DataType.BYTES) { long[] longValues = blockValSet.getLongValuesSV(); QuantileDigest quantileDigest = getDefaultQuantileDigest(aggregationResultHolder); - for (int i = 0; i < length; i++) { - quantileDigest.add(longValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + quantileDigest.add(longValues[i]); + } + }); } else { // Serialized QuantileDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - QuantileDigest quantileDigest = 
aggregationResultHolder.getResult(); - if (quantileDigest != null) { - for (int i = 0; i < length; i++) { - quantileDigest.merge(ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i])); + foldNotNull(length, blockValSet, (QuantileDigest) aggregationResultHolder.getResult(), (quantile, from, toEx) -> { + int start; + QuantileDigest quantileDigest; + if (quantile != null) { + start = from; + quantileDigest = quantile; + } else { + start = from + 1; + quantileDigest = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[from]); + aggregationResultHolder.setValue(quantileDigest); } - } else { - quantileDigest = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[0]); - aggregationResultHolder.setValue(quantileDigest); - for (int i = 1; i < length; i++) { + for (int i = start; i < toEx; i++) { quantileDigest.merge(ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i])); } - } + return quantileDigest; + }); } } @@ -108,22 +115,26 @@ public void aggregateGroupBySV(int length, int[] groupKeyArray, GroupByResultHol BlockValSet blockValSet = blockValSetMap.get(_expression); if (blockValSet.getValueType() != DataType.BYTES) { long[] longValues = blockValSet.getLongValuesSV(); - for (int i = 0; i < length; i++) { - getDefaultQuantileDigest(groupByResultHolder, groupKeyArray[i]).add(longValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + getDefaultQuantileDigest(groupByResultHolder, groupKeyArray[i]).add(longValues[i]); + } + }); } else { // Serialized QuantileDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - for (int i = 0; i < length; i++) { - QuantileDigest value = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i]); - int groupKey = groupKeyArray[i]; - QuantileDigest quantileDigest = groupByResultHolder.getResult(groupKey); - if (quantileDigest != null) { - quantileDigest.merge(value); - } else { - groupByResultHolder.setValueForKey(groupKey, value); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + QuantileDigest value = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i]); + int groupKey = groupKeyArray[i]; + QuantileDigest quantileDigest = groupByResultHolder.getResult(groupKey); + if (quantileDigest != null) { + quantileDigest.merge(value); + } else { + groupByResultHolder.setValueForKey(groupKey, value); + } } - } + }); } } @@ -133,28 +144,32 @@ public void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResult BlockValSet blockValSet = blockValSetMap.get(_expression); if (blockValSet.getValueType() != DataType.BYTES) { long[] longValues = blockValSet.getLongValuesSV(); - for (int i = 0; i < length; i++) { - long value = longValues[i]; - for (int groupKey : groupKeysArray[i]) { - getDefaultQuantileDigest(groupByResultHolder, groupKey).add(value); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + long value = longValues[i]; + for (int groupKey : groupKeysArray[i]) { + getDefaultQuantileDigest(groupByResultHolder, groupKey).add(value); + } } - } + }); } else { // Serialized QuantileDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - for (int i = 0; i < length; i++) { - QuantileDigest value = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i]); - for (int groupKey : groupKeysArray[i]) { - QuantileDigest quantileDigest = groupByResultHolder.getResult(groupKey); - if (quantileDigest != null) { - quantileDigest.merge(value); - } 
else { - // Create a new QuantileDigest for the group - groupByResultHolder - .setValueForKey(groupKey, ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i])); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + QuantileDigest value = ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i]); + for (int groupKey : groupKeysArray[i]) { + QuantileDigest quantileDigest = groupByResultHolder.getResult(groupKey); + if (quantileDigest != null) { + quantileDigest.merge(value); + } else { + // Create a new QuantileDigest for the group + groupByResultHolder.setValueForKey(groupKey, + ObjectSerDeUtils.QUANTILE_DIGEST_SER_DE.deserialize(bytesValues[i])); + } } } - } + }); } } @@ -202,6 +217,9 @@ public ColumnDataType getFinalResultColumnType() { @Override public Long extractFinalResult(QuantileDigest intermediateResult) { + if (intermediateResult.getCount() == 0 && _nullHandlingEnabled) { + return null; + } return intermediateResult.getQuantile(_percentile / 100.0); } diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstMVAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstMVAggregationFunction.java index c1001f25c7e..5a861714620 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstMVAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileEstMVAggregationFunction.java @@ -30,11 +30,11 @@ public class PercentileEstMVAggregationFunction extends PercentileEstAggregationFunction { public PercentileEstMVAggregationFunction(ExpressionContext expression, int percentile) { - super(expression, percentile); + super(expression, percentile, false); } public PercentileEstMVAggregationFunction(ExpressionContext expression, double percentile) { - super(expression, percentile); + super(expression, percentile, false); } @Override diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunction.java index 6d2b3b8697f..bcf025a8014 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunction.java @@ -61,14 +61,14 @@ *

*/ public class PercentileKLLAggregationFunction - extends BaseSingleInputAggregationFunction> { + extends NullableSingleInputAggregationFunction> { protected static final int DEFAULT_K_VALUE = 200; protected final double _percentile; protected int _kValue; - public PercentileKLLAggregationFunction(List arguments) { - super(arguments.get(0)); + public PercentileKLLAggregationFunction(List arguments, boolean nullHandlingEnabled) { + super(arguments.get(0), nullHandlingEnabled); // Check that there are correct number of arguments int numArguments = arguments.size(); @@ -107,14 +107,18 @@ public void aggregate(int length, AggregationResultHolder aggregationResultHolde if (valueType == DataType.BYTES) { // Assuming the column contains serialized data sketch KllDoublesSketch[] deserializedSketches = deserializeSketches(blockValSetMap.get(_expression).getBytesValuesSV()); - for (int i = 0; i < length; i++) { - sketch.merge(deserializedSketches[i]); - } + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + sketch.merge(deserializedSketches[i]); + } + }); } else { double[] values = valueSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - sketch.update(values[i]); - } + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + sketch.update(values[i]); + } + }); } } @@ -127,16 +131,20 @@ public void aggregateGroupBySV(int length, int[] groupKeyArray, GroupByResultHol if (valueType == DataType.BYTES) { // serialized sketch KllDoublesSketch[] deserializedSketches = deserializeSketches(blockValSetMap.get(_expression).getBytesValuesSV()); - for (int i = 0; i < length; i++) { - KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKeyArray[i]); - sketch.merge(deserializedSketches[i]); - } + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKeyArray[i]); + sketch.merge(deserializedSketches[i]); + } + }); } else { double[] values = valueSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKeyArray[i]); - sketch.update(values[i]); - } + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKeyArray[i]); + sketch.update(values[i]); + } + }); } } @@ -149,20 +157,24 @@ public void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResult if (valueType == DataType.BYTES) { // serialized sketch KllDoublesSketch[] deserializedSketches = deserializeSketches(blockValSetMap.get(_expression).getBytesValuesSV()); - for (int i = 0; i < length; i++) { - for (int groupKey : groupKeysArray[i]) { - KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKey); - sketch.merge(deserializedSketches[i]); + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + for (int groupKey : groupKeysArray[i]) { + KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKey); + sketch.merge(deserializedSketches[i]); + } } - } + }); } else { double[] values = valueSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - for (int groupKey : groupKeysArray[i]) { - KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKey); - sketch.update(values[i]); + forEachNotNull(length, valueSet, (from, to) -> { + for (int i = from; i < to; i++) { + for (int groupKey : 
groupKeysArray[i]) { + KllDoublesSketch sketch = getOrCreateSketch(groupByResultHolder, groupKey); + sketch.update(values[i]); + } } - } + }); } } @@ -241,6 +253,9 @@ public String getResultColumnName() { @Override public Comparable extractFinalResult(KllDoublesSketch sketch) { + if (sketch.isEmpty() && _nullHandlingEnabled) { + return null; + } return sketch.getQuantile(_percentile / 100); } } diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLMVAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLMVAggregationFunction.java index 4653e9051d3..26af8dea447 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLMVAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLMVAggregationFunction.java @@ -32,7 +32,7 @@ public class PercentileKLLMVAggregationFunction extends PercentileKLLAggregationFunction { public PercentileKLLMVAggregationFunction(List arguments) { - super(arguments); + super(arguments, false); } @Override diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileMVAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileMVAggregationFunction.java index 794a9896a7d..620763ea759 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileMVAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileMVAggregationFunction.java @@ -30,11 +30,11 @@ public class PercentileMVAggregationFunction extends PercentileAggregationFunction { public PercentileMVAggregationFunction(ExpressionContext expression, int percentile) { - super(expression, percentile); + super(expression, percentile, false); } public PercentileMVAggregationFunction(ExpressionContext expression, double percentile) { - super(expression, percentile); + super(expression, percentile, false); } @Override diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawEstAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawEstAggregationFunction.java index 063359ec960..04787e7d559 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawEstAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawEstAggregationFunction.java @@ -37,12 +37,14 @@ public class PercentileRawEstAggregationFunction extends BaseSingleInputAggregationFunction { private final PercentileEstAggregationFunction _percentileEstAggregationFunction; - public PercentileRawEstAggregationFunction(ExpressionContext expressionContext, double percentile) { - this(expressionContext, new PercentileEstAggregationFunction(expressionContext, percentile)); + public PercentileRawEstAggregationFunction(ExpressionContext expressionContext, double percentile, + boolean nullHandlingEnabled) { + this(expressionContext, new PercentileEstAggregationFunction(expressionContext, percentile, nullHandlingEnabled)); } - public PercentileRawEstAggregationFunction(ExpressionContext expressionContext, int percentile) { - this(expressionContext, new PercentileEstAggregationFunction(expressionContext, percentile)); + public PercentileRawEstAggregationFunction(ExpressionContext expressionContext, int 
percentile, + boolean nullHandlingEnabled) { + this(expressionContext, new PercentileEstAggregationFunction(expressionContext, percentile, nullHandlingEnabled)); } protected PercentileRawEstAggregationFunction(ExpressionContext expression, diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawKLLAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawKLLAggregationFunction.java index 39c2022ff02..7e88cf009d8 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawKLLAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawKLLAggregationFunction.java @@ -28,8 +28,8 @@ public class PercentileRawKLLAggregationFunction extends PercentileKLLAggregationFunction { - public PercentileRawKLLAggregationFunction(List arguments) { - super(arguments); + public PercentileRawKLLAggregationFunction(List arguments, boolean nullHandlingEnabled) { + super(arguments, nullHandlingEnabled); } @Override diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawTDigestAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawTDigestAggregationFunction.java index 99a096c1306..fc618027a5f 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawTDigestAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileRawTDigestAggregationFunction.java @@ -37,17 +37,22 @@ public class PercentileRawTDigestAggregationFunction extends BaseSingleInputAggregationFunction { private final PercentileTDigestAggregationFunction _percentileTDigestAggregationFunction; - public PercentileRawTDigestAggregationFunction(ExpressionContext expressionContext, int percentile) { - this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile)); + public PercentileRawTDigestAggregationFunction(ExpressionContext expressionContext, int percentile, + boolean nullHandlingEnabled) { + this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile, + nullHandlingEnabled)); } - public PercentileRawTDigestAggregationFunction(ExpressionContext expressionContext, double percentile) { - this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile)); + public PercentileRawTDigestAggregationFunction(ExpressionContext expressionContext, double percentile, + boolean nullHandlingEnabled) { + this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile, + nullHandlingEnabled)); } public PercentileRawTDigestAggregationFunction(ExpressionContext expressionContext, double percentile, - int compressionFactor) { - this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile, compressionFactor)); + int compressionFactor, boolean nullHandlingEnabled) { + this(expressionContext, new PercentileTDigestAggregationFunction(expressionContext, percentile, compressionFactor, + nullHandlingEnabled)); } protected PercentileRawTDigestAggregationFunction(ExpressionContext expression, diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunction.java 
b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunction.java index 92cd5fa09b9..20d5372ca56 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunction.java @@ -50,15 +50,15 @@ * - compression: Compression for the converted TDigest, 100 by default. * Example of third argument: 'threshold=10000;compression=50' */ -public class PercentileSmartTDigestAggregationFunction extends BaseSingleInputAggregationFunction { +public class PercentileSmartTDigestAggregationFunction extends NullableSingleInputAggregationFunction { private static final double DEFAULT_FINAL_RESULT = Double.NEGATIVE_INFINITY; private final double _percentile; private final int _threshold; private final int _compression; - public PercentileSmartTDigestAggregationFunction(List arguments) { - super(arguments.get(0)); + public PercentileSmartTDigestAggregationFunction(List arguments, boolean nullHandlingEnabled) { + super(arguments.get(0), nullHandlingEnabled); try { _percentile = arguments.get(1).getLiteral().getDoubleValue(); } catch (Exception e) { @@ -128,39 +128,53 @@ private static void validateValueType(BlockValSet blockValSet) { blockValSet.isSingleValue() ? "" : "_MV"); } - private static void aggregateIntoTDigest(int length, AggregationResultHolder aggregationResultHolder, + private void aggregateIntoTDigest(int length, AggregationResultHolder aggregationResultHolder, BlockValSet blockValSet) { TDigest tDigest = aggregationResultHolder.getResult(); if (blockValSet.isSingleValue()) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - tDigest.add(doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + tDigest.add(doubleValues[i]); + } + }); } else { double[][] doubleValues = blockValSet.getDoubleValuesMV(); - for (int i = 0; i < length; i++) { - for (double value : doubleValues[i]) { - tDigest.add(value); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + for (double value : doubleValues[i]) { + tDigest.add(value); + } } - } + }); } } - private void aggregateIntoValueList(int length, AggregationResultHolder aggregationResultHolder, - BlockValSet blockValSet) { + private DoubleArrayList getOrCreateList(int length, AggregationResultHolder aggregationResultHolder) { DoubleArrayList valueList = aggregationResultHolder.getResult(); if (valueList == null) { valueList = new DoubleArrayList(length); aggregationResultHolder.setValue(valueList); } + return valueList; + } + + private void aggregateIntoValueList(int length, AggregationResultHolder aggregationResultHolder, + BlockValSet blockValSet) { + DoubleArrayList valueList = getOrCreateList(length, aggregationResultHolder); if (blockValSet.isSingleValue()) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - valueList.addElements(valueList.size(), doubleValues, 0, length); + forEachNotNull(length, blockValSet, (from, toEx) -> + valueList.addElements(valueList.size(), doubleValues, from, toEx - from) + ); } else { double[][] doubleValues = blockValSet.getDoubleValuesMV(); - for (int i = 0; i < length; i++) { - valueList.addElements(valueList.size(), doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, toEx) -> { + for (int i = 0; i < length; i++) { + 
valueList.addElements(valueList.size(), doubleValues[i]); + } + } + ); } if (valueList.size() > _threshold) { aggregationResultHolder.setValue(convertValueListToTDigest(valueList)); @@ -183,16 +197,20 @@ public void aggregateGroupBySV(int length, int[] groupKeyArray, GroupByResultHol validateValueType(blockValSet); if (blockValSet.isSingleValue()) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); - valueList.add(doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); + valueList.add(doubleValues[i]); + } + }); } else { double[][] doubleValues = blockValSet.getDoubleValuesMV(); - for (int i = 0; i < length; i++) { - DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); - valueList.addElements(valueList.size(), doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + DoubleArrayList valueList = getValueList(groupByResultHolder, groupKeyArray[i]); + valueList.addElements(valueList.size(), doubleValues[i]); + } + }); } } @@ -212,19 +230,23 @@ public void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResult validateValueType(blockValSet); if (blockValSet.isSingleValue()) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - for (int groupKey : groupKeysArray[i]) { - getValueList(groupByResultHolder, groupKey).add(doubleValues[i]); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + for (int groupKey : groupKeysArray[i]) { + getValueList(groupByResultHolder, groupKey).add(doubleValues[i]); + } } - } + }); } else { double[][] doubleValues = blockValSet.getDoubleValuesMV(); - for (int i = 0; i < length; i++) { - for (int groupKey : groupKeysArray[i]) { - DoubleArrayList valueList = getValueList(groupByResultHolder, groupKey); - valueList.addElements(valueList.size(), doubleValues[i]); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + for (int groupKey : groupKeysArray[i]) { + DoubleArrayList valueList = getValueList(groupByResultHolder, groupKey); + valueList.addElements(valueList.size(), doubleValues[i]); + } } - } + }); } } @@ -285,7 +307,11 @@ public Double extractFinalResult(Object intermediateResult) { DoubleArrayList valueList = (DoubleArrayList) intermediateResult; int size = valueList.size(); if (size == 0) { - return DEFAULT_FINAL_RESULT; + if (_nullHandlingEnabled) { + return null; + } else { + return DEFAULT_FINAL_RESULT; + } } else { double[] values = valueList.elements(); Arrays.sort(values, 0, size); diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestAggregationFunction.java index d4224739c6e..c831e52d224 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestAggregationFunction.java @@ -39,7 +39,7 @@ * extra handling for two argument PERCENTILE functions to assess if v0 or v1. 
This can be revisited later if the * need arises */ -public class PercentileTDigestAggregationFunction extends BaseSingleInputAggregationFunction { +public class PercentileTDigestAggregationFunction extends NullableSingleInputAggregationFunction { public static final int DEFAULT_TDIGEST_COMPRESSION = 100; // version 0 functions specified in the of form PERCENTILETDIGEST<2-digits>(column). Uses default compression of 100 @@ -48,23 +48,25 @@ public class PercentileTDigestAggregationFunction extends BaseSingleInputAggrega protected final double _percentile; protected final int _compressionFactor; - public PercentileTDigestAggregationFunction(ExpressionContext expression, int percentile) { - super(expression); + public PercentileTDigestAggregationFunction(ExpressionContext expression, int percentile, + boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 0; _percentile = percentile; _compressionFactor = DEFAULT_TDIGEST_COMPRESSION; } - public PercentileTDigestAggregationFunction(ExpressionContext expression, double percentile) { - super(expression); + public PercentileTDigestAggregationFunction(ExpressionContext expression, double percentile, + boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 1; _percentile = percentile; _compressionFactor = DEFAULT_TDIGEST_COMPRESSION; } public PercentileTDigestAggregationFunction(ExpressionContext expression, double percentile, - int compressionFactor) { - super(expression); + int compressionFactor, boolean nullHandlingEnabled) { + super(expression, nullHandlingEnabled); _version = 1; _percentile = percentile; _compressionFactor = compressionFactor; @@ -104,24 +106,28 @@ public void aggregate(int length, AggregationResultHolder aggregationResultHolde if (blockValSet.getValueType() != DataType.BYTES) { double[] doubleValues = blockValSet.getDoubleValuesSV(); TDigest tDigest = getDefaultTDigest(aggregationResultHolder, _compressionFactor); - for (int i = 0; i < length; i++) { - tDigest.add(doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + tDigest.add(doubleValues[i]); + } + }); } else { // Serialized TDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - TDigest tDigest = aggregationResultHolder.getResult(); - if (tDigest != null) { - for (int i = 0; i < length; i++) { - tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); - } - } else { - tDigest = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[0]); - aggregationResultHolder.setValue(tDigest); - for (int i = 1; i < length; i++) { - tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); + foldNotNull(length, blockValSet, (TDigest) aggregationResultHolder.getResult(), (tDigest, from, toEx) -> { + if (tDigest != null) { + for (int i = from; i < toEx; i++) { + tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); + } + } else { + tDigest = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[0]); + aggregationResultHolder.setValue(tDigest); + for (int i = 1; i < length; i++) { + tDigest.add(ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); + } } - } + return tDigest; + }); } } @@ -131,22 +137,26 @@ public void aggregateGroupBySV(int length, int[] groupKeyArray, GroupByResultHol BlockValSet blockValSet = blockValSetMap.get(_expression); if (blockValSet.getValueType() != DataType.BYTES) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - 
getDefaultTDigest(groupByResultHolder, groupKeyArray[i], _compressionFactor).add(doubleValues[i]); - } + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + getDefaultTDigest(groupByResultHolder, groupKeyArray[i], _compressionFactor).add(doubleValues[i]); + } + }); } else { // Serialized TDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - for (int i = 0; i < length; i++) { - TDigest value = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i]); - int groupKey = groupKeyArray[i]; - TDigest tDigest = groupByResultHolder.getResult(groupKey); - if (tDigest != null) { - tDigest.add(value); - } else { - groupByResultHolder.setValueForKey(groupKey, value); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + TDigest value = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i]); + int groupKey = groupKeyArray[i]; + TDigest tDigest = groupByResultHolder.getResult(groupKey); + if (tDigest != null) { + tDigest.add(value); + } else { + groupByResultHolder.setValueForKey(groupKey, value); + } } - } + }); } } @@ -156,27 +166,31 @@ public void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResult BlockValSet blockValSet = blockValSetMap.get(_expression); if (blockValSet.getValueType() != DataType.BYTES) { double[] doubleValues = blockValSet.getDoubleValuesSV(); - for (int i = 0; i < length; i++) { - double value = doubleValues[i]; - for (int groupKey : groupKeysArray[i]) { - getDefaultTDigest(groupByResultHolder, groupKey, _compressionFactor).add(value); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + double value = doubleValues[i]; + for (int groupKey : groupKeysArray[i]) { + getDefaultTDigest(groupByResultHolder, groupKey, _compressionFactor).add(value); + } } - } + }); } else { // Serialized QuantileDigest byte[][] bytesValues = blockValSet.getBytesValuesSV(); - for (int i = 0; i < length; i++) { - TDigest value = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i]); - for (int groupKey : groupKeysArray[i]) { - TDigest tDigest = groupByResultHolder.getResult(groupKey); - if (tDigest != null) { - tDigest.add(value); - } else { - // Create a new TDigest for the group - groupByResultHolder.setValueForKey(groupKey, ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); + forEachNotNull(length, blockValSet, (from, to) -> { + for (int i = from; i < to; i++) { + TDigest value = ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i]); + for (int groupKey : groupKeysArray[i]) { + TDigest tDigest = groupByResultHolder.getResult(groupKey); + if (tDigest != null) { + tDigest.add(value); + } else { + // Create a new TDigest for the group + groupByResultHolder.setValueForKey(groupKey, ObjectSerDeUtils.TDIGEST_SER_DE.deserialize(bytesValues[i])); + } } } - } + }); } } diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunction.java index 571f2ae9126..a6b7884e6e8 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PercentileTDigestMVAggregationFunction.java @@ -30,16 +30,16 @@ public class PercentileTDigestMVAggregationFunction extends PercentileTDigestAggregationFunction { public 
PercentileTDigestMVAggregationFunction(ExpressionContext expression, int percentile) { - super(expression, percentile); + super(expression, percentile, false); } public PercentileTDigestMVAggregationFunction(ExpressionContext expression, double percentile) { - super(expression, percentile); + super(expression, percentile, false); } public PercentileTDigestMVAggregationFunction(ExpressionContext expression, double percentile, int compressionFactor) { - super(expression, percentile, compressionFactor); + super(expression, percentile, compressionFactor, false); } @Override diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/AbstractPercentileAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/AbstractPercentileAggregationFunctionTest.java new file mode 100644 index 00000000000..fe9cc09f26a --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/AbstractPercentileAggregationFunctionTest.java @@ -0,0 +1,333 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.pinot.core.query.aggregation.function; + +import org.apache.pinot.queries.FluentQueryTest; +import org.apache.pinot.spi.data.FieldSpec; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public abstract class AbstractPercentileAggregationFunctionTest extends AbstractAggregationFunctionTest { + + @DataProvider(name = "scenarios") + Object[] scenarios() { + return new Object[] { + new Scenario(FieldSpec.DataType.INT), + new Scenario(FieldSpec.DataType.LONG), + new Scenario(FieldSpec.DataType.FLOAT), + new Scenario(FieldSpec.DataType.DOUBLE), + }; + } + + public abstract String callStr(String column, int percent); + + public String getFinalResultColumnType() { + return "DOUBLE"; + } + + public class Scenario { + private final FieldSpec.DataType _dataType; + + public Scenario(FieldSpec.DataType dataType) { + _dataType = dataType; + } + + public FieldSpec.DataType getDataType() { + return _dataType; + } + + public FluentQueryTest.DeclaringTable getDeclaringTable(boolean nullHandlingEnabled) { + return givenSingleNullableFieldTable(_dataType, nullHandlingEnabled); + } + + @Override + public String toString() { + return "Scenario{" + "dt=" + _dataType + '}'; + } + } + + FluentQueryTest.TableWithSegments withDefaultData(Scenario scenario, boolean nullHandlingEnabled) { + return scenario.getDeclaringTable(nullHandlingEnabled) + .onFirstInstance("myField", + "null", + "0", + "null", + "1", + "null", + "2", + "null", + "3", + "null", + "4", + "null" + ).andSegment("myField", + "null", + "5", + "null", + "6", + "null", + "7", + "null", + "8", + "null", + "9", + "null" + ); + } + + String minValue(FieldSpec.DataType dataType) { + switch (dataType) { + case INT: return "-2.147483648E9"; + case LONG: return "-9.223372036854776E18"; + case FLOAT: return "-Infinity"; + case DOUBLE: return "-Infinity"; + default: + throw new IllegalArgumentException("Unexpected type " + dataType); + } + } + + String expectedAggrWithoutNull10(Scenario scenario) { + return minValue(scenario._dataType); + } + + String expectedAggrWithoutNull15(Scenario scenario) { + return minValue(scenario._dataType); + } + + String expectedAggrWithoutNull30(Scenario scenario) { + return minValue(scenario._dataType); + } + + String expectedAggrWithoutNull35(Scenario scenario) { + return minValue(scenario._dataType); + } + + String expectedAggrWithoutNull50(Scenario scenario) { + return minValue(scenario._dataType); + } + + String expectedAggrWithoutNull55(Scenario scenario) { + return "0"; + } + + String expectedAggrWithoutNull70(Scenario scenario) { + return "3"; + } + + String expectedAggrWithoutNull75(Scenario scenario) { + return "4"; + } + + String expectedAggrWithoutNull90(Scenario scenario) { + return "7"; + } + + String expectedAggrWithoutNull100(Scenario scenario) { + return "9"; + } + + @Test(dataProvider = "scenarios") + void aggrWithoutNull(Scenario scenario) { + + FluentQueryTest.TableWithSegments instance = withDefaultData(scenario, false); + + instance + .whenQuery("select " + callStr("myField", 10) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull10(scenario)); + + instance + .whenQuery("select " + callStr("myField", 15) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull15(scenario)); + + instance + .whenQuery("select " + callStr("myField", 30) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull30(scenario)); + instance + .whenQuery("select " + 
callStr("myField", 35) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull35(scenario)); + + instance + .whenQuery("select " + callStr("myField", 50) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull50(scenario)); + instance + .whenQuery("select " + callStr("myField", 55) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull55(scenario)); + + instance + .whenQuery("select " + callStr("myField", 70) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull70(scenario)); + + instance + .whenQuery("select " + callStr("myField", 75) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull75(scenario)); + + instance + .whenQuery("select " + callStr("myField", 90) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull90(scenario)); + + instance + .whenQuery("select " + callStr("myField", 100) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithoutNull100(scenario)); + } + + String expectedAggrWithNull10(Scenario scenario) { + return "1"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull10(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 10) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull10(scenario)); + } + + String expectedAggrWithNull15(Scenario scenario) { + return "1"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull15(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 15) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull15(scenario)); + } + + String expectedAggrWithNull30(Scenario scenario) { + return "3"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull30(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 30) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull30(scenario)); + } + + String expectedAggrWithNull35(Scenario scenario) { + return "3"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull35(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 35) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull35(scenario)); + } + + String expectedAggrWithNull50(Scenario scenario) { + return "5"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull50(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 50) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull50(scenario)); + } + + String expectedAggrWithNull55(Scenario scenario) { + return "5"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull55(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 55) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull55(scenario)); + } + + String expectedAggrWithNull70(Scenario scenario) { + return "7"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull70(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 70) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull70(scenario)); + } + + String expectedAggrWithNull75(Scenario scenario) { + 
return "7"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull75(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 75) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull75(scenario)); + } + + String expectedAggrWithNull100(Scenario scenario) { + return "9"; + } + + @Test(dataProvider = "scenarios") + void aggrWithNull100(Scenario scenario) { + withDefaultData(scenario, true) + .whenQuery("select " + callStr("myField", 100) + " from testTable") + .thenResultIs(getFinalResultColumnType(), expectedAggrWithNull100(scenario)); + } + + @Test(dataProvider = "scenarios") + void aggrSvWithoutNull(Scenario scenario) { + scenario.getDeclaringTable(false) + .onFirstInstance("myField", + "null", + "1", + "null" + ).andSegment("myField", + "9" + ).andSegment("myField", + "null", + "null", + "null" + ).whenQuery("select $segmentName, " + callStr("myField", 50) + " from testTable " + + "group by $segmentName order by $segmentName") + .thenResultIs("STRING | " + getFinalResultColumnType(), + "testTable_0 | " + minValue(scenario._dataType), + "testTable_1 | 9", + "testTable_2 | " + minValue(scenario._dataType) + ); + } + + @Test(dataProvider = "scenarios") + void aggrSvWithNull(Scenario scenario) { + scenario.getDeclaringTable(true) + .onFirstInstance("myField", + "null", + "1", + "null" + ).andSegment("myField", + "9" + ).andSegment("myField", + "null", + "null", + "null" + ).whenQuery("select $segmentName, " + callStr("myField", 50) + " from testTable " + + "group by $segmentName order by $segmentName") + .thenResultIs("STRING | " + getFinalResultColumnType(), + "testTable_0 | 1", + "testTable_1 | 9", + "testTable_2 | null" + ); + } +} diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunctionTest.java new file mode 100644 index 00000000000..3c2ecdde011 --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileAggregationFunctionTest.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.pinot.core.query.aggregation.function; + +public class PercentileAggregationFunctionTest extends AbstractPercentileAggregationFunctionTest { + @Override + public String callStr(String column, int percent) { + return "PERCENTILE(" + column + ", " + percent + ")"; + } +} diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunctionTest.java new file mode 100644 index 00000000000..4dda1614b7c --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileEstAggregationFunctionTest.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.query.aggregation.function; + +import org.apache.pinot.spi.data.FieldSpec; + + +public class PercentileEstAggregationFunctionTest extends AbstractPercentileAggregationFunctionTest { + @Override + public String callStr(String column, int percent) { + return "PERCENTILEEST(" + column + ", " + percent + ")"; + } + + @Override + public String getFinalResultColumnType() { + return "LONG"; + } + + String minValue(FieldSpec.DataType dataType) { + switch (dataType) { + case INT: return "-2147483648"; + case LONG: return "-9223372036854775808"; + case FLOAT: return "-9223372036854775808"; + case DOUBLE: return "-9223372036854775808"; + default: + throw new IllegalArgumentException("Unexpected type " + dataType); + } + } +} diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunctionTest.java new file mode 100644 index 00000000000..1eb6c991c22 --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileKLLAggregationFunctionTest.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.query.aggregation.function; + + +public class PercentileKLLAggregationFunctionTest extends AbstractPercentileAggregationFunctionTest { + @Override + public String callStr(String column, int percent) { + return "PERCENTILEKLL(" + column + ", " + percent + ")"; + } + + @Override + String expectedAggrWithNull10(Scenario scenario) { + return "0"; + } + + @Override + String expectedAggrWithNull30(Scenario scenario) { + return "2"; + } + + @Override + String expectedAggrWithNull50(Scenario scenario) { + return "4"; + } + + @Override + String expectedAggrWithNull70(Scenario scenario) { + return "6"; + } +} diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java new file mode 100644 index 00000000000..b1eb471c704 --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/aggregation/function/PercentileSmartTDigestAggregationFunctionTest.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.core.query.aggregation.function; + + +public class PercentileSmartTDigestAggregationFunctionTest { + + public static class WithHighThreshold extends AbstractPercentileAggregationFunctionTest { + @Override + public String callStr(String column, int percent) { + return "PERCENTILESMARTTDIGEST(" + column + ", " + percent + ", 'THRESHOLD=10000')"; + } + } + + public static class WithSmallThreshold extends AbstractPercentileAggregationFunctionTest { + @Override + public String callStr(String column, int percent) { + return "PERCENTILESMARTTDIGEST(" + column + ", " + percent + ", 'THRESHOLD=1')"; + } + + @Override + String expectedAggrWithNull10(Scenario scenario) { + return "0.5"; + } + + @Override + String expectedAggrWithNull30(Scenario scenario) { + return "2.5"; + } + + @Override + String expectedAggrWithNull50(Scenario scenario) { + return "4.5"; + } + + @Override + String expectedAggrWithNull70(Scenario scenario) { + return "6.5"; + } + + @Override + String expectedAggrWithoutNull55(Scenario scenario) { + switch (scenario.getDataType()) { + case INT: + return "-6.442450943999939E8"; + case LONG: + return "-2.7670116110564065E18"; + case FLOAT: + case DOUBLE: + return "-Infinity"; + default: + throw new IllegalArgumentException("Unsupported datatype " + scenario.getDataType()); + } + } + + @Override + String expectedAggrWithoutNull75(Scenario scenario) { + return "4.0"; + } + + @Override + String expectedAggrWithoutNull90(Scenario scenario) { + return "7.100000000000001"; + } + + @Override + String expectedAggrWithoutNull100(Scenario scenario) { + return super.expectedAggrWithoutNull100(scenario); + } + } +} diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/FluentQueryTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/FluentQueryTest.java index ba6d22c429c..8bd93cd42e3 100644 --- a/pinot-core/src/test/java/org/apache/pinot/queries/FluentQueryTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/queries/FluentQueryTest.java @@ -112,7 +112,7 @@ public OnFirstInstance onFirstInstance(Object[]... content) { } } - static class TableWithSegments { + public static class TableWithSegments { protected final TableConfig _tableConfig; protected final Schema _schema; protected final File _indexDir; From 62b97ef3b41f1f00dfc8272d855e1eac802c0106 Mon Sep 17 00:00:00 2001 From: Ting Chen Date: Mon, 1 Apr 2024 10:34:11 -0700 Subject: [PATCH 21/50] Add support for phrase search with wildcard and prefix matching for Lucene indexed tables (#12680) * Intial commit to support phrase search with regex matching for the terms in the phrase * Increase max clause limit for SpanOr queries. * Fix the lint errors. * Fix lint * Fix based on comments. * Fix lint. * Fix lint * Remove unused imports. * Revise based on comments. 
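A minimal usage sketch for this change (an editorial illustration assembled from the tests added in this patch; the wrapper class, main method, and literal property key are assumed scaffolding, and it only builds the config value and query string rather than wiring them into a full TableConfig):

    import java.util.HashMap;
    import java.util.Map;

    public class PhraseWildcardTextMatchExample {
      public static void main(String[] args) {
        // Column-level text index property introduced by this patch
        // (FieldConfig.TEXT_INDEX_ENABLE_PREFIX_SUFFIX_PHRASE_QUERIES).
        Map<String, String> textIndexProps = new HashMap<>();
        textIndexProps.put("enablePrefixSuffixMatchingInPhraseQueries", "true");
        // With the flag on, leading/trailing wildcards inside a phrase are honored: this query matches rows whose
        // skills column contains "realtime streaming systems" (see TextSearchQueriesTest#testMultiTermRegexSearch).
        String query = "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable "
            + "WHERE TEXT_MATCH(SKILLS_TEXT_COL, '*ealtime streaming system*') LIMIT 50000";
        System.out.println(textIndexProps + " -> " + query);
      }
    }
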
--- .../pinot/queries/TextSearchQueriesTest.java | 40 +++++++ .../RealtimeLuceneTextIndex.java | 12 ++- .../readers/text/LuceneTextIndexReader.java | 14 ++- .../index/text/TextIndexConfigBuilder.java | 2 + .../local/utils/LuceneTextIndexUtils.java | 75 +++++++++++++ .../LuceneMutableTextIndexTest.java | 2 +- .../NativeAndLuceneMutableTextIndexTest.java | 2 +- .../store/FilePerIndexDirectoryTest.java | 4 +- .../store/SingleFileIndexDirectoryTest.java | 4 +- .../local/utils/LuceneTextIndexUtilsTest.java | 100 ++++++++++++++++++ .../segment/spi/index/TextIndexConfig.java | 31 +++++- .../pinot/spi/config/table/FieldConfig.java | 2 + 12 files changed, 277 insertions(+), 11 deletions(-) create mode 100644 pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java create mode 100644 pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java index df1b8a790fb..217e099003d 100644 --- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java @@ -153,6 +153,7 @@ public void setUp() props = new HashMap<>(); props.put(FieldConfig.TEXT_INDEX_STOP_WORD_INCLUDE_KEY, "coordinator"); props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "it, those"); + props.put(FieldConfig.TEXT_INDEX_ENABLE_PREFIX_SUFFIX_PHRASE_QUERIES, "true"); columnProperties.put(SKILLS_TEXT_COL_NAME, props); props = new HashMap<>(); props.put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, ""); @@ -207,6 +208,7 @@ private void buildSegment() addTextIndexProp(config, SKILLS_TEXT_COL_NAME, ImmutableMap.builder() .put(FieldConfig.TEXT_INDEX_STOP_WORD_INCLUDE_KEY, "coordinator") .put(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "it, those") + .put(FieldConfig.TEXT_INDEX_ENABLE_PREFIX_SUFFIX_PHRASE_QUERIES, "true") .build()); addTextIndexProp(config, SKILLS_TEXT_COL_DICT_NAME, Collections.singletonMap(FieldConfig.TEXT_INDEX_STOP_WORD_EXCLUDE_KEY, "")); @@ -280,6 +282,44 @@ private List createTestData() return rows; } + @Test + public void testMultiTermRegexSearch() + throws Exception { + // Search in SKILLS_TEXT_COL column to look for documents that have the /.*ealtime stream system.*/ regex pattern + List expected = new ArrayList<>(); + expected.add(new Object[]{1010, + "Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed " + + "storage, concurrency, multi-threading"}); + expected.add(new Object[]{1019, + "C++, Java, Python, realtime streaming systems, Machine learning, spark, Kubernetes, transaction processing, " + + "distributed storage, concurrency, multi-threading, apache airflow"}); + + String query = + "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '*ealtime streaming system*') " + + "LIMIT 50000"; + testTextSearchSelectQueryHelper(query, expected.size(), false, expected); + + // Search /*java realtime stream system*, only 1 result left./ + List expected1 = new ArrayList<>(); + expected1.add(new Object[]{1010, + "Distributed systems, Java, realtime streaming systems, Machine learning, spark, Kubernetes, distributed " + + "storage, concurrency, multi-threading"}); + String query1 = + "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '*ava realtime streaming " + + "system*') 
LIMIT 50000"; + testTextSearchSelectQueryHelper(query1, expected1.size(), false, expected1); + + String query2 = + "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '*ava realtime streaming " + + "system* AND *chine learn*') LIMIT 50000"; + testTextSearchSelectQueryHelper(query2, expected1.size(), false, expected1); + + String query3 = + "SELECT INT_COL, SKILLS_TEXT_COL FROM MyTable WHERE TEXT_MATCH(SKILLS_TEXT_COL, '*ava realtime streaming " + + "system* AND *chine learner*') LIMIT 50000"; + testTextSearchSelectQueryHelper(query3, 0, false, new ArrayList<>()); + } + /** * Tests for phrase, term, regex, composite (using AND/OR) text search queries. * Both selection and aggregation queries are used. diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java index 2a35b2da60d..a71d2663ed7 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.SearcherManager; import org.apache.pinot.segment.local.indexsegment.mutable.MutableSegmentImpl; import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator; +import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils; import org.apache.pinot.segment.spi.index.TextIndexConfig; import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex; import org.roaringbitmap.IntIterator; @@ -53,6 +54,7 @@ public class RealtimeLuceneTextIndex implements MutableTextIndex { private Analyzer _analyzer; private final String _column; private final String _segmentName; + private boolean _enablePrefixSuffixMatchingInPhraseQueries = false; /** * Created by {@link MutableSegmentImpl} @@ -80,6 +82,7 @@ public RealtimeLuceneTextIndex(String column, File segmentIndexDir, String segme IndexWriter indexWriter = _indexCreator.getIndexWriter(); _searcherManager = new SearcherManager(indexWriter, false, false, null); _analyzer = _indexCreator.getIndexWriter().getConfig().getAnalyzer(); + _enablePrefixSuffixMatchingInPhraseQueries = config.isEnablePrefixSuffixMatchingInPhraseQueries(); } catch (Exception e) { LOGGER.error("Failed to instantiate realtime Lucene index reader for column {}, exception {}", column, e.getMessage()); @@ -119,7 +122,14 @@ public MutableRoaringBitmap getDocIds(String searchQuery) { Callable searchCallable = () -> { IndexSearcher indexSearcher = null; try { - Query query = new QueryParser(_column, _analyzer).parse(searchQuery); + QueryParser parser = new QueryParser(_column, _analyzer); + if (_enablePrefixSuffixMatchingInPhraseQueries) { + parser.setAllowLeadingWildcard(true); + } + Query query = parser.parse(searchQuery); + if (_enablePrefixSuffixMatchingInPhraseQueries) { + query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query); + } indexSearcher = _searcherManager.acquire(); indexSearcher.search(query, docIDCollector); return getPinotDocIds(indexSearcher, docIDs); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java index 
9b971d51424..3a0efabe8c9 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java @@ -40,6 +40,7 @@ import org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer; import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder; import org.apache.pinot.segment.local.segment.store.TextIndexUtils; +import org.apache.pinot.segment.local.utils.LuceneTextIndexUtils; import org.apache.pinot.segment.spi.V1Constants; import org.apache.pinot.segment.spi.index.TextIndexConfig; import org.apache.pinot.segment.spi.index.reader.TextIndexReader; @@ -66,6 +67,7 @@ public class LuceneTextIndexReader implements TextIndexReader { private final DocIdTranslator _docIdTranslator; private final Analyzer _analyzer; private boolean _useANDForMultiTermQueries = false; + private boolean _enablePrefixSuffixMatchingInPhraseQueries = false; public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextIndexConfig config) { _column = column; @@ -82,6 +84,9 @@ public LuceneTextIndexReader(String column, File indexDir, int numDocs, TextInde if (config.isUseANDForMultiTermQueries()) { _useANDForMultiTermQueries = true; } + if (config.isEnablePrefixSuffixMatchingInPhraseQueries()) { + _enablePrefixSuffixMatchingInPhraseQueries = true; + } // TODO: consider using a threshold of num docs per segment to decide between building // mapping file upfront on segment load v/s on-the-fly during query processing _docIdTranslator = new DocIdTranslator(indexDir, _column, numDocs, _indexSearcher); @@ -150,10 +155,18 @@ public MutableRoaringBitmap getDocIds(String searchQuery) { // be instantiated per query. Analyzer on the other hand is stateless // and can be created upfront. QueryParser parser = new QueryParser(_column, _analyzer); + // Phrase search with prefix/suffix matching may have leading *. E.g., `*pache pinot` which can be stripped by + // the query parser. To support the feature, we need to explicitly set the config to be true. 
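+    // Editorial illustration (an assumption drawn from LuceneTextIndexUtilsTest in this patch, not part of the
+    // original change): a phrase such as '*pache pino*' is parsed here with leading wildcards allowed, and is then
+    // rewritten below by LuceneTextIndexUtils.convertToMultiTermSpanQuery into a SpanNearQuery whose wildcard and
+    // prefix terms are wrapped in SpanMultiTermQueryWrapper clauses.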
+ if (_enablePrefixSuffixMatchingInPhraseQueries) { + parser.setAllowLeadingWildcard(true); + } if (_useANDForMultiTermQueries) { parser.setDefaultOperator(QueryParser.Operator.AND); } Query query = parser.parse(searchQuery); + if (_enablePrefixSuffixMatchingInPhraseQueries) { + query = LuceneTextIndexUtils.convertToMultiTermSpanQuery(query); + } _indexSearcher.search(query, docIDCollector); return docIds; } catch (Exception e) { @@ -162,7 +175,6 @@ public MutableRoaringBitmap getDocIds(String searchQuery) { throw new RuntimeException(msg, e); } } - /** * When we destroy the loaded ImmutableSegment, all the indexes * (for each column) are destroyed and as part of that diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java index 1c14226c0bb..5d07fb788de 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/TextIndexConfigBuilder.java @@ -50,6 +50,8 @@ public TextIndexConfig.AbstractBuilder withProperties(@Nullable Map spanQueryLst = new ArrayList<>(); + boolean prefixOrSuffixQueryFound = false; + for (BooleanClause clause : ((BooleanQuery) query).clauses()) { + Query q = clause.getQuery(); + if (q instanceof WildcardQuery || q instanceof PrefixQuery) { + prefixOrSuffixQueryFound = true; + spanQueryLst.add(new SpanMultiTermQueryWrapper<>((AutomatonQuery) q)); + } else if (q instanceof TermQuery) { + spanQueryLst.add(new SpanTermQuery(((TermQuery) q).getTerm())); + } else { + LOGGER.info("query can not be handled currently {} ", q); + return query; + } + } + if (!prefixOrSuffixQueryFound) { + return query; + } + SpanNearQuery spanNearQuery = new SpanNearQuery(spanQueryLst.toArray(new SpanQuery[0]), 0, true); + LOGGER.debug("The phrase query {} is re-written as {}", query, spanNearQuery); + return spanNearQuery; + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java index e8066bb9e7b..c485a3dd39b 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/LuceneMutableTextIndexTest.java @@ -59,7 +59,7 @@ private String[][] getRepeatedData() { public void setUp() throws Exception { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar", config); String[][] documents = getTextData(); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java index 211614b5b2c..ca1c94ceb8f 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java 
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeAndLuceneMutableTextIndexTest.java @@ -72,7 +72,7 @@ private String[][] getMVTextData() { public void setUp() throws Exception { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); _realtimeLuceneTextIndex = new RealtimeLuceneTextIndex(TEXT_COLUMN_NAME, INDEX_DIR, "fooBar", config); _nativeMutableTextIndex = new NativeMutableTextIndex(TEXT_COLUMN_NAME); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java index 61fa9f0319e..f60de6d12d2 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/FilePerIndexDirectoryTest.java @@ -202,7 +202,7 @@ public void nativeTextIndexIsDeleted() public void testRemoveTextIndices() throws IOException { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap); LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true, config); LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true, config)) { @@ -265,7 +265,7 @@ public void testRemoveTextIndices() public void testGetColumnIndices() throws IOException { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); // Write sth to buffers and flush them to index files on disk try (FilePerIndexDirectory fpi = new FilePerIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap); LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true, config); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java index 2a03044abe6..28494666362 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/store/SingleFileIndexDirectoryTest.java @@ -235,7 +235,7 @@ public void testCleanupRemovedIndices() public void testRemoveTextIndices() throws IOException, ConfigurationException { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap); LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true, config); LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true, config)) { @@ -341,7 +341,7 @@ public void testPersistIndexMaps() { public void 
testGetColumnIndices() throws Exception { TextIndexConfig config = - new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null); + new TextIndexConfig(false, null, null, false, false, null, null, true, 500, null, false); try (SingleFileIndexDirectory sfd = new SingleFileIndexDirectory(TEMP_DIR, _segmentMetadata, ReadMode.mmap); LuceneTextIndexCreator fooCreator = new LuceneTextIndexCreator("foo", TEMP_DIR, true, config); LuceneTextIndexCreator barCreator = new LuceneTextIndexCreator("bar", TEMP_DIR, true, config)) { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java new file mode 100644 index 00000000000..004c3ed04c7 --- /dev/null +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.utils; + +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.queries.spans.SpanNearQuery; +import org.apache.lucene.queries.spans.SpanQuery; +import org.apache.lucene.queries.spans.SpanTermQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class LuceneTextIndexUtilsTest { + @Test + public void testBooleanQueryRewrittenToSpanQuery() { + // Test 1: The input is a boolean query with 2 clauses: "*pache pino*" + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + WildcardQuery wildcardQuery = new WildcardQuery(new Term("field", "*apche")); + PrefixQuery prefixQuery = new PrefixQuery(new Term("field", "pino")); + builder.add(new BooleanClause(wildcardQuery, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(prefixQuery, BooleanClause.Occur.SHOULD)); + + SpanQuery[] spanQueries1 = + {new SpanMultiTermQueryWrapper<>(wildcardQuery), new SpanMultiTermQueryWrapper<>(prefixQuery)}; + SpanQuery expectedQuery = new SpanNearQuery(spanQueries1, 0, true); + Assert.assertEquals(expectedQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(builder.build())); + + // Test 2: The input is a boolean query with 3 clauses: "*pache real pino*" + builder = new BooleanQuery.Builder(); + Term term = new Term("field", "real"); + builder.add(new BooleanClause(wildcardQuery, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new 
TermQuery(term), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(prefixQuery, BooleanClause.Occur.SHOULD)); + + SpanQuery[] spanQueries2 = + {new SpanMultiTermQueryWrapper<>(wildcardQuery), new SpanTermQuery(term), new SpanMultiTermQueryWrapper<>( + prefixQuery)}; + expectedQuery = new SpanNearQuery(spanQueries2, 0, true); + Assert.assertEquals(expectedQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(builder.build())); + + // Test 3: The input is a boolean query with 3 clauses: "*pache real* pino*" + builder = new BooleanQuery.Builder(); + builder.add(new BooleanClause(wildcardQuery, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(prefixQuery, BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(prefixQuery, BooleanClause.Occur.SHOULD)); + + SpanQuery[] spanQueries3 = {new SpanMultiTermQueryWrapper<>(wildcardQuery), new SpanMultiTermQueryWrapper<>( + prefixQuery), new SpanMultiTermQueryWrapper<>(prefixQuery)}; + expectedQuery = new SpanNearQuery(spanQueries3, 0, true); + Assert.assertEquals(expectedQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(builder.build())); + + // Test 4: The input is a boolean query with 1 clause: "*pino*". + WildcardQuery wildcardQuery1 = new WildcardQuery(new Term("field", "*pino*")); + builder = new BooleanQuery.Builder(); + builder.add(new BooleanClause(wildcardQuery1, BooleanClause.Occur.SHOULD)); + SpanQuery[] spanQueries4 = {new SpanMultiTermQueryWrapper<>(wildcardQuery1)}; + expectedQuery = new SpanNearQuery(spanQueries4, 0, true); + Assert.assertEquals(expectedQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(builder.build())); + + // Test 5: Boolean queries without any wildcard/prefix subqueries are left unchanged. + builder = new BooleanQuery.Builder(); + builder.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD)) + .add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD)); + BooleanQuery q = builder.build(); + Assert.assertEquals(q, LuceneTextIndexUtils.convertToMultiTermSpanQuery(q)); + } + + @Test + public void testQueryIsNotRewritten() { + // Test 1: Term query is not re-written. + TermQuery termQuery = new TermQuery(new Term("field", "real")); + Assert.assertEquals(termQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(termQuery)); + // Test 2: Regex query is not re-written. 
+ RegexpQuery regexpQuery = new RegexpQuery(new Term("field", "\\d+")); + Assert.assertEquals(regexpQuery, LuceneTextIndexUtils.convertToMultiTermSpanQuery(regexpQuery)); + } +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java index 0b31e70e1ef..afbf7eb876a 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/TextIndexConfig.java @@ -37,7 +37,8 @@ public class TextIndexConfig extends IndexConfig { private static final boolean LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE = true; public static final TextIndexConfig DISABLED = new TextIndexConfig(true, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false, - LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null); + LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB, null, false); + private static final boolean LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH = false; private final FSTType _fstType; @Nullable private final Object _rawValueForTextIndex; @@ -48,6 +49,7 @@ public class TextIndexConfig extends IndexConfig { private final boolean _luceneUseCompoundFile; private final int _luceneMaxBufferSizeMB; private final String _luceneAnalyzerClass; + private final boolean _enablePrefixSuffixMatchingInPhraseQueries; @JsonCreator public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty("fst") FSTType fstType, @@ -58,7 +60,8 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty @JsonProperty("stopWordsExclude") List stopWordsExclude, @JsonProperty("luceneUseCompoundFile") Boolean luceneUseCompoundFile, @JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB, - @JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass) { + @JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass, + @JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean enablePrefixSuffixMatchingInPhraseQueries) { super(disabled); _fstType = fstType; _rawValueForTextIndex = rawValueForTextIndex; @@ -72,6 +75,9 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty luceneMaxBufferSizeMB == null ? LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB : luceneMaxBufferSizeMB; _luceneAnalyzerClass = (luceneAnalyzerClass == null || luceneAnalyzerClass.isEmpty()) ? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS : luceneAnalyzerClass; + _enablePrefixSuffixMatchingInPhraseQueries = + enablePrefixSuffixMatchingInPhraseQueries == null ? LUCENE_INDEX_ENABLE_PREFIX_SUFFIX_MATCH_IN_PHRASE_SEARCH + : enablePrefixSuffixMatchingInPhraseQueries; } public FSTType getFstType() { @@ -125,6 +131,16 @@ public String getLuceneAnalyzerClass() { return _luceneAnalyzerClass; } + /** + * Whether to enable prefix and suffix wildcard term matching (i.e., .*value for prefix and value.* for suffix + * term matching) in a phrase query. By default, Pinot today treats .* in a phrase query like ".*value str1 value.*" + * as literal. If this flag is enabled, .*value will be treated as suffix matching and value.* will be treated as + * prefix matching. 
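+   * For illustration (an editorial example based on the tests in this patch, not normative): with the flag enabled,
+   * the phrase query '*ealtime streaming system*' matches a document containing "realtime streaming systems",
+   * because '*ealtime' is suffix-matched and 'system*' is prefix-matched; with the flag disabled, the asterisks are
+   * treated as literal characters and the phrase does not match.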
+ */ + public boolean isEnablePrefixSuffixMatchingInPhraseQueries() { + return _enablePrefixSuffixMatchingInPhraseQueries; + } + public static abstract class AbstractBuilder { @Nullable protected FSTType _fstType; @@ -137,6 +153,7 @@ public static abstract class AbstractBuilder { protected boolean _luceneUseCompoundFile = LUCENE_INDEX_DEFAULT_USE_COMPOUND_FILE; protected int _luceneMaxBufferSizeMB = LUCENE_INDEX_DEFAULT_MAX_BUFFER_SIZE_MB; protected String _luceneAnalyzerClass = FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS; + protected boolean _enablePrefixSuffixMatchingInPhraseQueries = false; public AbstractBuilder(@Nullable FSTType fstType) { _fstType = fstType; @@ -151,11 +168,13 @@ public AbstractBuilder(TextIndexConfig other) { _luceneUseCompoundFile = other._luceneUseCompoundFile; _luceneMaxBufferSizeMB = other._luceneMaxBufferSizeMB; _luceneAnalyzerClass = other._luceneAnalyzerClass; + _enablePrefixSuffixMatchingInPhraseQueries = other._enablePrefixSuffixMatchingInPhraseQueries; } public TextIndexConfig build() { return new TextIndexConfig(false, _fstType, _rawValueForTextIndex, _enableQueryCache, _useANDForMultiTermQueries, - _stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass); + _stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass, + _enablePrefixSuffixMatchingInPhraseQueries); } public abstract AbstractBuilder withProperties(@Nullable Map textIndexProperties); @@ -189,6 +208,12 @@ public AbstractBuilder withLuceneAnalyzerClass(String luceneAnalyzerClass) { _luceneAnalyzerClass = luceneAnalyzerClass; return this; } + + public AbstractBuilder withEnablePrefixSuffixMatchingInPhraseQueries( + boolean enablePrefixSuffixMatchingInPhraseQueries) { + _enablePrefixSuffixMatchingInPhraseQueries = enablePrefixSuffixMatchingInPhraseQueries; + return this; + } } @Override diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java index 201edeb39aa..8a01646da99 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java @@ -61,6 +61,8 @@ public class FieldConfig extends BaseJsonConfig { // Config to disable forward index public static final String FORWARD_INDEX_DISABLED = "forwardIndexDisabled"; public static final String DEFAULT_FORWARD_INDEX_DISABLED = Boolean.FALSE.toString(); + public static final String TEXT_INDEX_ENABLE_PREFIX_SUFFIX_PHRASE_QUERIES = + "enablePrefixSuffixMatchingInPhraseQueries"; private final String _name; private final EncodingType _encodingType; From e71c3dc0233f086425f5a3bcc52e84ee855d7762 Mon Sep 17 00:00:00 2001 From: sullis Date: Mon, 1 Apr 2024 11:43:56 -0700 Subject: [PATCH 22/50] upgrade to slf4j 2.0.12 (#12761) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index fc4f5c7648a..35f6de5315f 100644 --- a/pom.xml +++ b/pom.xml @@ -156,7 +156,7 @@ 1.5.5-11 1.8.0 2.23.1 - 2.0.9 + 2.0.12 4.1.108.Final 1.0.4 1.19.0 From 4f0bc11b95fa26df3fac8efe9845e4d71f7351dd Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" <17555551+Jackie-Jiang@users.noreply.github.com> Date: Mon, 1 Apr 2024 15:05:25 -0700 Subject: [PATCH 23/50] Cleanup Javax and Jakarta dependencies (#12760) --- pinot-broker/pom.xml | 35 --- pinot-common/pom.xml | 37 ++- pinot-compatibility-verifier/pom.xml | 6 - 
.../pinot-flink-connector/pom.xml | 8 - .../pinot-spark-3-connector/pom.xml | 15 - pinot-controller/pom.xml | 45 +-- pinot-core/pom.xml | 79 +---- pinot-distribution/pom.xml | 8 - pinot-integration-test-base/pom.xml | 4 - pinot-integration-tests/pom.xml | 12 - pinot-minion/pom.xml | 4 - .../pinot-batch-ingestion-spark-2.4/pom.xml | 16 +- .../pinot-batch-ingestion-spark-3/pom.xml | 16 +- .../pinot-file-system/pinot-s3/pom.xml | 6 - .../pinot-kafka-2.0/pom.xml | 4 - .../pinot-pulsar/pom.xml | 49 +--- .../SerializedFrequentLongsSketch.java | 3 +- .../SerializedFrequentStringsSketch.java | 3 +- pinot-server/pom.xml | 63 ---- pinot-spi/pom.xml | 26 ++ pinot-tools/pom.xml | 12 - pom.xml | 275 +++++++++++------- 22 files changed, 233 insertions(+), 493 deletions(-) diff --git a/pinot-broker/pom.xml b/pinot-broker/pom.xml index ccc20628d42..f3fd18485a3 100644 --- a/pinot-broker/pom.xml +++ b/pinot-broker/pom.xml @@ -48,45 +48,10 @@ pinot-query-runtime
- - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - - - org.glassfish.jersey.inject - jersey-hk2 - - - org.glassfish.jersey.media - jersey-media-json-jackson - - - org.glassfish.jersey.core - jersey-common - - - io.swagger - swagger-jaxrs - - - javax.ws.rs - jsr311-api - - - - - io.swagger - swagger-jersey2-jaxrs - com.jcabi jcabi-log - - org.glassfish.hk2 - hk2-locator - com.fasterxml.jackson.core jackson-databind diff --git a/pinot-common/pom.xml b/pinot-common/pom.xml index f9a3f64c415..396a7d676fa 100644 --- a/pinot-common/pom.xml +++ b/pinot-common/pom.xml @@ -217,6 +217,34 @@ org.apache.calcite calcite-babel + + org.glassfish.jersey.core + jersey-server + + + org.glassfish.jersey.containers + jersey-container-grizzly2-http + + + org.glassfish.jersey.media + jersey-media-multipart + + + org.glassfish.jersey.media + jersey-media-json-jackson + + + org.glassfish.jersey.inject + jersey-hk2 + + + org.glassfish.hk2 + hk2-metadata-generator + + + io.swagger + swagger-jersey2-jaxrs + org.testng testng @@ -312,15 +340,6 @@ org.apache.zookeeper zookeeper - - javax.servlet - javax.servlet-api - compile - - - org.glassfish.jersey.core - jersey-server - org.reflections reflections diff --git a/pinot-compatibility-verifier/pom.xml b/pinot-compatibility-verifier/pom.xml index e93397675da..cbccbda4185 100644 --- a/pinot-compatibility-verifier/pom.xml +++ b/pinot-compatibility-verifier/pom.xml @@ -94,12 +94,6 @@ pinot-controller ${project.version} test-jar - - - jakarta.activation - jakarta.activation-api - - diff --git a/pinot-connectors/pinot-flink-connector/pom.xml b/pinot-connectors/pinot-flink-connector/pom.xml index 5a0e15e26e3..a69cf4ad986 100644 --- a/pinot-connectors/pinot-flink-connector/pom.xml +++ b/pinot-connectors/pinot-flink-connector/pom.xml @@ -41,14 +41,6 @@ com.google.guava guava - - org.glassfish.jersey.inject - jersey-hk2 - - - org.glassfish.jersey.media - jersey-media-json-jackson - org.apache.flink flink-streaming-java_${scala.compat.version} diff --git a/pinot-connectors/pinot-spark-3-connector/pom.xml b/pinot-connectors/pinot-spark-3-connector/pom.xml index 6e78532fc28..6e53637cb27 100644 --- a/pinot-connectors/pinot-spark-3-connector/pom.xml +++ b/pinot-connectors/pinot-spark-3-connector/pom.xml @@ -200,24 +200,9 @@ - - org.apache.pinot - pinot-common - - - org.apache.pinot - pinot-core - org.apache.pinot pinot-spark-common - - test - javax.servlet - javax.servlet-api - 3.0.1 - - diff --git a/pinot-controller/pom.xml b/pinot-controller/pom.xml index b2b52171a06..e0c750631c8 100644 --- a/pinot-controller/pom.xml +++ b/pinot-controller/pom.xml @@ -55,12 +55,6 @@ org.apache.pinot pinot-server test - - - javax.servlet - servlet-api - - org.apache.pinot @@ -94,34 +88,7 @@ test-jar test - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - - - org.glassfish.jersey.core - jersey-server - - - org.glassfish.jersey.inject - jersey-hk2 - - - org.glassfish.jersey.media - jersey-media-multipart - - - org.glassfish.jersey.core - jersey-common - - - org.glassfish.jersey.media - jersey-media-json-jackson - - - io.swagger - swagger-jersey2-jaxrs - + com.fasterxml.jackson.core jackson-annotations @@ -134,16 +101,6 @@ com.fasterxml.jackson.core jackson-core - - io.swagger - swagger-jaxrs - - - javax.ws.rs - jsr311-api - - - org.slf4j jcl-over-slf4j diff --git a/pinot-core/pom.xml b/pinot-core/pom.xml index d782c39659a..bd6217c2410 100644 --- a/pinot-core/pom.xml +++ b/pinot-core/pom.xml @@ -36,18 +36,6 @@ - - com.yscope.clp - clp-ffi - - - com.uber - h3 - 
- - org.roaringbitmap - RoaringBitmap - org.apache.pinot pinot-spi @@ -64,24 +52,7 @@ org.apache.pinot pinot-common - - joda-time - joda-time - - - org.slf4j - jcl-over-slf4j - - + io.netty netty-transport-native-epoll @@ -126,54 +97,6 @@ io.netty netty-all - - org.slf4j - slf4j-api - - - com.clearspring.analytics - stream - - - org.apache.datasketches - datasketches-java - - - com.dynatrace.hash4j - hash4j - - - com.tdunning - t-digest - - - org.xerial.larray - larray-mmap - - - net.sf.jopt-simple - jopt-simple - - - com.jayway.jsonpath - json-path - - - org.locationtech.jts - jts-core - - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - - - org.glassfish.grizzly - grizzly-http-server - - - org.glassfish.hk2 - hk2-locator - diff --git a/pinot-distribution/pom.xml b/pinot-distribution/pom.xml index a9bb9f5ecd8..540420cc22e 100644 --- a/pinot-distribution/pom.xml +++ b/pinot-distribution/pom.xml @@ -114,14 +114,6 @@ - - javax.servlet - javax.servlet-api - - - javax.activation - activation - diff --git a/pinot-integration-test-base/pom.xml b/pinot-integration-test-base/pom.xml index 6f07a8f5d4d..1aba58b3fd8 100644 --- a/pinot-integration-test-base/pom.xml +++ b/pinot-integration-test-base/pom.xml @@ -149,10 +149,6 @@ org.testng testng - - javax.servlet - javax.servlet-api - com.h2database h2 diff --git a/pinot-integration-tests/pom.xml b/pinot-integration-tests/pom.xml index 3280f652cab..995e423c666 100644 --- a/pinot-integration-tests/pom.xml +++ b/pinot-integration-tests/pom.xml @@ -202,14 +202,6 @@ com.101tec zkclient - - org.glassfish.hk2 - hk2-locator - - - org.glassfish.hk2 - hk2-metadata-generator - org.apache.pinot pinot-server @@ -330,10 +322,6 @@ org.testng testng - - javax.servlet - javax.servlet-api - com.h2database h2 diff --git a/pinot-minion/pom.xml b/pinot-minion/pom.xml index bdb7ed1b46a..ffd11f307b0 100644 --- a/pinot-minion/pom.xml +++ b/pinot-minion/pom.xml @@ -76,9 +76,5 @@ pinot-yammer test - - io.swagger - swagger-jersey2-jaxrs - diff --git a/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-2.4/pom.xml b/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-2.4/pom.xml index 07cc979ccfa..45568ae319a 100644 --- a/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-2.4/pom.xml +++ b/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-2.4/pom.xml @@ -54,8 +54,8 @@ provided - org.scala-lang - scala-library + com.zaxxer + HikariCP-java7 com.twitter @@ -77,18 +77,6 @@ org.slf4j slf4j-log4j12 - - com.zaxxer - HikariCP-java7 - - - org.glassfish.hk2.external - jakarta.inject - - - jakarta.ws.rs - jakarta.ws.rs-api -
diff --git a/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-3/pom.xml b/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-3/pom.xml index f50b384ee11..fd36cd868ce 100644 --- a/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-3/pom.xml +++ b/pinot-plugins/pinot-batch-ingestion/pinot-batch-ingestion-spark-3/pom.xml @@ -52,8 +52,8 @@ provided - org.scala-lang - scala-library + com.zaxxer + HikariCP-java7 com.twitter @@ -75,18 +75,6 @@ org.slf4j slf4j-log4j12 - - com.zaxxer - HikariCP-java7 - - - org.glassfish.hk2.external - jakarta.inject - - - jakarta.ws.rs - jakarta.ws.rs-api - commons-logging commons-logging diff --git a/pinot-plugins/pinot-file-system/pinot-s3/pom.xml b/pinot-plugins/pinot-file-system/pinot-s3/pom.xml index 428138fc28a..b0322f92b07 100644 --- a/pinot-plugins/pinot-file-system/pinot-s3/pom.xml +++ b/pinot-plugins/pinot-file-system/pinot-s3/pom.xml @@ -38,7 +38,6 @@ 4.5.14 4.4.13 2.12.2 - 3.1.0 package @@ -135,11 +134,6 @@ ${s3mock.version} test - - javax.servlet - javax.servlet-api - ${javax.version} - com.fasterxml.woodstox woodstox-core diff --git a/pinot-plugins/pinot-stream-ingestion/pinot-kafka-2.0/pom.xml b/pinot-plugins/pinot-stream-ingestion/pinot-kafka-2.0/pom.xml index 71567e099df..533e18f90e2 100644 --- a/pinot-plugins/pinot-stream-ingestion/pinot-kafka-2.0/pom.xml +++ b/pinot-plugins/pinot-stream-ingestion/pinot-kafka-2.0/pom.xml @@ -59,10 +59,6 @@ net.sf.jopt-simple jopt-simple - - org.scala-lang - scala-library - diff --git a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml index 4ccd6b1d0c3..e9d4696e71f 100644 --- a/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml +++ b/pinot-plugins/pinot-stream-ingestion/pinot-pulsar/pom.xml @@ -37,16 +37,11 @@ package ${basedir}/../../.. 
- 3.1.0 - 2.1 - 2.39 0.16.0 - 2.29.0 1.60.1 + 1.62.2 2.6.2 1.17 - 1.2 - 1.62.2 @@ -60,10 +55,6 @@ org.apache.pulsar pulsar-client-original - - javax.ws.rs - javax.ws.rs-api - commons-configuration commons-configuration @@ -87,29 +78,16 @@ pulsar-client-admin-original - javax.servlet - javax.servlet-api - ${javax.servlet-api.version} - - - javax.ws.rs - javax.ws.rs-api - ${javax.ws.rs-api.version} + org.glassfish.jersey.core + jersey-server org.glassfish.jersey.containers jersey-container-grizzly2-http - ${jersey-container-grizzly2-http.version} - - - org.glassfish.jersey.core - jersey-server - ${jersey-container-grizzly2-http.version} org.glassfish.jersey.containers jersey-container-servlet-core - ${jersey-container-grizzly2-http.version} io.netty @@ -129,6 +107,11 @@ grpc-context ${grpc-context.version} + + io.grpc + grpc-protobuf-lite + ${grpc-protobuf-lite.version} + io.prometheus simpleclient @@ -151,22 +134,6 @@ simpleclient_hotspot ${simpleclient_common.version} - - io.grpc - grpc-protobuf-lite - ${grpc-protobuf-lite.version} - - - io.grpc - grpc-context - - - - - javax.annotation - javax.annotation-api - ${javax.annotation-api.version} - org.codehaus.mojo animal-sniffer-annotations diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentLongsSketch.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentLongsSketch.java index 53124e473b3..203515c67bb 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentLongsSketch.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentLongsSketch.java @@ -19,7 +19,6 @@ package org.apache.pinot.segment.local.customobject; import java.util.Base64; -import javax.validation.constraints.NotNull; import org.apache.datasketches.frequencies.LongsSketch; @@ -31,7 +30,7 @@ public SerializedFrequentLongsSketch(LongsSketch sketch) { } @Override - public int compareTo(@NotNull LongsSketch other) { + public int compareTo(LongsSketch other) { // There is no well-defined ordering for these sketches // numActiveItems is just a placeholder, which can be changed later return _sketch.getNumActiveItems() - other.getNumActiveItems(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentStringsSketch.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentStringsSketch.java index 40f89bc83df..040692a553f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentStringsSketch.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/customobject/SerializedFrequentStringsSketch.java @@ -19,7 +19,6 @@ package org.apache.pinot.segment.local.customobject; import java.util.Base64; -import javax.validation.constraints.NotNull; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.frequencies.ItemsSketch; @@ -31,7 +30,7 @@ public SerializedFrequentStringsSketch(ItemsSketch sketch) { } @Override - public int compareTo(@NotNull ItemsSketch other) { + public int compareTo(ItemsSketch other) { // There is no well-defined ordering for these sketches // numActiveItems is just a placeholder, which can be changed later return _sketch.getNumActiveItems() - other.getNumActiveItems(); diff --git a/pinot-server/pom.xml b/pinot-server/pom.xml index 39c95e01a2e..b39ec13324d 100644 --- 
a/pinot-server/pom.xml +++ b/pinot-server/pom.xml @@ -74,11 +74,6 @@ testng test - - javax.servlet - javax.servlet-api - compile - org.apache.helix helix-core @@ -91,54 +86,6 @@ com.jcabi jcabi-log - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - - - org.glassfish.jersey.core - jersey-server - - - org.javassist - javassist - - - - - org.javassist - javassist - - - org.glassfish.jersey.core - jersey-common - - - org.glassfish.jersey.inject - jersey-hk2 - - - org.glassfish.jersey.media - jersey-media-json-jackson - - - io.swagger - swagger-jersey2-jaxrs - - - javax.ws.rs - javax.ws.rs-api - - - org.glassfish.hk2.external - javax.inject - - - - - javax.ws.rs - javax.ws.rs-api - com.fasterxml.jackson.core jackson-annotations @@ -151,16 +98,6 @@ com.fasterxml.jackson.core jackson-databind - - io.swagger - swagger-jaxrs - - - javax.ws.rs - jsr311-api - - - org.webjars swagger-ui diff --git a/pinot-spi/pom.xml b/pinot-spi/pom.xml index 99ee1929054..d1c76c9e888 100644 --- a/pinot-spi/pom.xml +++ b/pinot-spi/pom.xml @@ -90,6 +90,32 @@ commons-math + + + jakarta.servlet + jakarta.servlet-api + + + jakarta.ws.rs + jakarta.ws.rs-api + + + + + javax.servlet + javax.servlet-api + + + + javax.ws.rs + javax.ws.rs-api + + + + javax.annotation + javax.annotation-api + + org.slf4j jcl-over-slf4j diff --git a/pinot-tools/pom.xml b/pinot-tools/pom.xml index a47c828f7b4..115a79d6d26 100644 --- a/pinot-tools/pom.xml +++ b/pinot-tools/pom.xml @@ -151,10 +151,6 @@ com.google.errorprone error_prone_annotations - - javax.annotation - javax.annotation-api - org.codehaus.mojo animal-sniffer-annotations @@ -179,10 +175,6 @@ io.grpc grpc-context - - jakarta.activation - jakarta.activation-api - com.typesafe.netty netty-reactive-streams @@ -229,10 +221,6 @@ testng test - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - org.glassfish.tyrus.bundles tyrus-standalone-client diff --git a/pom.xml b/pom.xml index 35f6de5315f..bf687489111 100644 --- a/pom.xml +++ b/pom.xml @@ -134,13 +134,13 @@ 1.11.3 1.13.1 + 1.5.9 1.3.1 0.11 2.12.7.20221012 3.9.2 2.12.3 2.39 - 2.4.4 2.6.1 1.6.9 3.3.6 @@ -189,6 +189,25 @@ 3.2.2 2.2 + + 6.0.0 + 3.0.2 + 2.1.1 + 4.0.2 + 3.1.0 + 2.1.3 + 3.1.1 + + 4.0.1 + 2.0.1.Final + 1.3.2 + 2.3.1 + 1.0-2 + 2.1.1 + 1.1.1 + 1.1.1 + 2.2 + 26.34.0 1.23.0 @@ -496,11 +515,26 @@ avro ${avro.version} + + org.apache.avro + avro-mapred + ${avro.version} + org.apache.parquet parquet-avro ${parquet.version} + + org.apache.orc + orc-core + ${orc.version} + + + org.apache.orc + orc-mapreduce + ${orc.version} + org.xerial.snappy snappy-java @@ -521,32 +555,11 @@ libthrift 0.15.0 - - javax.servlet - javax.servlet-api - 3.0.1 - compile - - - javax.ws.rs - javax.ws.rs-api - 2.0.1 - org.quartz-scheduler quartz ${quartz.version} - - javax.validation - validation-api - 2.0.1.Final - - - javax.activation - activation - 1.1.1 - org.apache.helix helix-core @@ -743,6 +756,89 @@ ${commons-math.version} + + + jakarta.servlet + jakarta.servlet-api + ${jakarta.servlet-api.version} + + + jakarta.validation + jakarta.validation-api + ${jakarta.validation-api.version} + + + jakarta.annotation + jakarta.annotation-api + ${jakarta.annotation-api.version} + + + jakarta.xml.bind + jakarta.xml.bind-api + ${jakarta.xml.bind-api.version} + + + jakarta.ws.rs + jakarta.ws.rs-api + ${jakarta.ws.rs-api.version} + + + jakarta.activation + jakarta.activation-api + ${jakarta.activation-api.version} + + + jakarta.servlet.jsp + jakarta.servlet.jsp-api + ${jakarta.servlet.jsp-api.version} + + + + javax.servlet + 
javax.servlet-api + ${javax.servlet-api.version} + + + javax.validation + validation-api + ${javax.validation-api.version} + + + javax.annotation + javax.annotation-api + ${javax.annotation-api.version} + + + javax.xml.bind + jaxb-api + ${javax.jaxb-api.version} + + + javax.xml.bind + stax-api + ${javax.stax-api.version} + + + javax.ws.rs + javax.ws.rs-api + ${javax.ws.rs-api.version} + + + javax.ws.rs + jsr311-api + ${javax.jsr311-api.version} + + + javax.activation + activation + ${javax.activation.version} + + + javax.servlet.jsp + javax.servlet.jsp-api + ${javax.jsp-api.version} + + com.google.cloud @@ -956,6 +1052,22 @@ org.eclipse.jetty jetty-util + + com.sun.jersey + jersey-core + + + com.sun.jersey + jersey-client + + + com.sun.jersey + jersey-server + + + com.sun.jersey.contribs + jersey-guice + commons-logging commons-logging @@ -978,12 +1090,6 @@ reload4j 1.2.25 - - - javax.xml.bind - jaxb-api - 2.3.1 - org.apache.kerby @@ -1026,12 +1132,6 @@ org.eclipse.jetty jetty-server ${eclipse.jetty.version} - - - javax.servlet - javax.servlet-api - - org.eclipse.jetty @@ -1088,21 +1188,6 @@ ${dropwizard-metrics.version} - - org.apache.orc - orc-core - 1.5.9 - - - org.apache.avro - avro-mapred - ${avro.version} - - - org.apache.orc - orc-mapreduce - 1.5.9 - org.webjars swagger-ui @@ -1181,35 +1266,16 @@ jcabi-log 0.24.1 - - org.glassfish.jersey.containers - jersey-container-grizzly2-http - ${jersey.version} - - - org.glassfish.grizzly - grizzly-http-server - ${grizzly.version} - - - org.glassfish.jersey.core - jersey-server - ${jersey.version} - - - org.javassist - javassist - - - org.javassist javassist 3.19.0-GA + + org.glassfish.jersey.core - jersey-common + jersey-server ${jersey.version} @@ -1218,39 +1284,19 @@ ${jersey.version} - org.glassfish.jersey.inject - jersey-hk2 + org.glassfish.jersey.core + jersey-common ${jersey.version} - org.glassfish.hk2 - hk2-locator - ${hk2.version} - - - jakarta.annotation - jakarta.annotation-api - - - jakarta.ws.rs - jakarta.ws.rs-api - - + org.glassfish.jersey.containers + jersey-container-grizzly2-http + ${jersey.version} - org.glassfish.hk2 - hk2-metadata-generator - ${hk2.version} - - - jakarta.annotation - jakarta.annotation-api - - - jakarta.ws.rs - jakarta.ws.rs-api - - + org.glassfish.jersey.containers + jersey-container-servlet-core + ${jersey.version} org.glassfish.jersey.media @@ -1263,25 +1309,26 @@ ${jersey.version} - io.swagger - swagger-jaxrs - ${swagger.version} + org.glassfish.jersey.inject + jersey-hk2 + ${jersey.version} + + + org.glassfish.hk2 + hk2-locator + ${hk2.version} + + + org.glassfish.hk2 + hk2-metadata-generator + ${hk2.version} io.swagger swagger-jersey2-jaxrs ${swagger.version} - - - javax.ws.rs - jsr311-api - - - org.glassfish.hk2.external - javax.inject - - + org.apache.maven.surefire surefire-testng @@ -1708,7 +1755,11 @@ + commons-logging:commons-logging + + javax.inject:javax.inject + jakarta.inject:jakarta.inject-api From 08fc2c7d62ba17d64b6cb680017e29b4480c8f22 Mon Sep 17 00:00:00 2001 From: sullis Date: Mon, 1 Apr 2024 17:39:09 -0700 Subject: [PATCH 24/50] upgrade lmax disruptor to 4.0.0 (#12769) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index bf687489111..a9649153685 100644 --- a/pom.xml +++ b/pom.xml @@ -629,7 +629,7 @@ com.lmax disruptor - 3.3.4 + 4.0.0 org.asynchttpclient From 4abb2d18f733781539d2d72ab75e1bb03c197489 Mon Sep 17 00:00:00 2001 From: Erich <134291879+ege-st@users.noreply.github.com> Date: Tue, 2 Apr 2024 03:37:57 -0400 Subject: [PATCH 
25/50] Set column major builder to be on by default (#12770) --- .../java/org/apache/pinot/spi/config/table/IndexingConfig.java | 2 +- .../pinot/spi/config/table/ingestion/StreamIngestionConfig.java | 2 +- .../org/apache/pinot/spi/utils/builder/TableConfigBuilder.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java index a433c845cad..ce5bc79dda7 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/IndexingConfig.java @@ -62,7 +62,7 @@ public class IndexingConfig extends BaseJsonConfig { private SegmentPartitionConfig _segmentPartitionConfig; private boolean _aggregateMetrics; private boolean _nullHandlingEnabled; - private boolean _columnMajorSegmentBuilderEnabled = false; + private boolean _columnMajorSegmentBuilderEnabled = true; /** * If `optimizeDictionary` enabled, dictionary is not created for the high-cardinality diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/StreamIngestionConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/StreamIngestionConfig.java index 365911ee69f..2d832dd4b2f 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/StreamIngestionConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/StreamIngestionConfig.java @@ -35,7 +35,7 @@ public class StreamIngestionConfig extends BaseJsonConfig { private final List> _streamConfigMaps; @JsonPropertyDescription("Whether to use column major mode when creating the segment.") - private boolean _columnMajorSegmentBuilderEnabled; + private boolean _columnMajorSegmentBuilderEnabled = true; @JsonCreator public StreamIngestionConfig(@JsonProperty("streamConfigMaps") List> streamConfigMaps) { diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java index 05b7b30bb96..dc8fb2ae8a1 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/builder/TableConfigBuilder.java @@ -100,7 +100,7 @@ public class TableConfigBuilder { private Map _streamConfigs; private SegmentPartitionConfig _segmentPartitionConfig; private boolean _nullHandlingEnabled; - private boolean _columnMajorSegmentBuilderEnabled; + private boolean _columnMajorSegmentBuilderEnabled = true; private List _varLengthDictionaryColumns; private List _starTreeIndexConfigs; private List _jsonIndexColumns; From 3695bed80d7a040305f144575d20f0dcff8a1d9a Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" <17555551+Jackie-Jiang@users.noreply.github.com> Date: Tue, 2 Apr 2024 09:42:24 -0700 Subject: [PATCH 26/50] Upgrade Calcite to 1.36.0 (#12754) --- pinot-common/pom.xml | 30 +- pinot-common/src/main/codegen/config.fmpp | 111 +- .../src/main/codegen/default_config.fmpp | 465 + .../src/main/codegen/includes/parserImpls.ftl | 32 +- .../src/main/codegen/templates/Parser.jj | 9131 +++++++++++++++++ .../pinot/sql/parsers/CalciteSqlParser.java | 78 +- .../apache/pinot/sql/parsers/ParserUtils.java | 70 + .../sql/parsers/CalciteSqlCompilerTest.java | 18 +- .../api/resources/PinotQueryResource.java | 10 +- .../data/function/DateTimeFunctionsTest.java | 6 +- 
.../tests/StarTreeClusterIntegrationTest.java | 4 +- .../integration/tests/custom/ArrayTest.java | 313 +- .../src/test/resources/ssb/ssb_query_set.yaml | 6 +- .../calcite/rel/rules/PinotQueryRuleSets.java | 9 +- .../calcite/rel/rules/PinotRuleUtils.java | 18 +- .../calcite/sql2rel/PinotConvertletTable.java | 19 +- .../apache/pinot/query/QueryEnvironment.java | 162 +- .../parser/CalciteRexExpressionParser.java | 11 +- .../pinot/query/parser/ParserUtils.java | 118 - .../pinot/query/parser/QueryRewriter.java | 46 - .../apache/pinot/query/planner/QueryPlan.java | 51 - .../query/planner/QueryPlanMetadata.java | 10 +- .../pinot/query/planner/SubPlanMetadata.java | 9 +- .../logical/PinotLogicalQueryPlanner.java | 42 +- .../planner/logical/SubPlanFragmenter.java | 5 +- .../physical/DispatchablePlanContext.java | 8 +- .../planner/physical/DispatchableSubPlan.java | 8 +- .../pinot/query/validate/Validator.java | 3 +- .../pinot/query/QueryCompilationTest.java | 6 +- .../queries/ResourceBasedQueryPlansTest.java | 10 +- .../src/test/resources/queries/JoinPlans.json | 39 +- .../resources/queries/PinotHintablePlans.json | 2 +- .../queries/WindowFunctionPlans.json | 72 +- .../service/dispatch/QueryDispatcher.java | 12 +- .../queries/ResourceBasedQueriesTest.java | 2 +- .../src/test/resources/queries/Case.json | 7 +- .../resources/queries/LexicalStructure.json | 2 +- .../test/resources/queries/TypeCasting.json | 8 +- pom.xml | 4 +- 39 files changed, 10119 insertions(+), 838 deletions(-) create mode 100644 pinot-common/src/main/codegen/default_config.fmpp create mode 100644 pinot-common/src/main/codegen/templates/Parser.jj create mode 100644 pinot-common/src/main/java/org/apache/pinot/sql/parsers/ParserUtils.java delete mode 100644 pinot-query-planner/src/main/java/org/apache/pinot/query/parser/ParserUtils.java delete mode 100644 pinot-query-planner/src/main/java/org/apache/pinot/query/parser/QueryRewriter.java delete mode 100644 pinot-query-planner/src/main/java/org/apache/pinot/query/planner/QueryPlan.java diff --git a/pinot-common/pom.xml b/pinot-common/pom.xml index 396a7d676fa..a1002ca458a 100644 --- a/pinot-common/pom.xml +++ b/pinot-common/pom.xml @@ -63,35 +63,7 @@ - - - org.apache.maven.plugins - maven-dependency-plugin - - - unpack-parser-template - generate-sources - - unpack - - - - - org.apache.calcite - calcite-core - ${calcite.version} - jar - true - ${project.build.directory}/ - **/Parser.jj,**/default_config.fmpp - - - - - - - + maven-resources-plugin diff --git a/pinot-common/src/main/codegen/config.fmpp b/pinot-common/src/main/codegen/config.fmpp index c2fb71ea01d..95c5a3d33fb 100644 --- a/pinot-common/src/main/codegen/config.fmpp +++ b/pinot-common/src/main/codegen/config.fmpp @@ -17,6 +17,8 @@ # under the License. # +# Copied from Calcite 1.36.0 babel and modified for Pinot syntax. Update this file when upgrading Calcite version. + data: { default: tdd("../default_config.fmpp") @@ -30,24 +32,28 @@ data: { package: "org.apache.pinot.sql.parsers.parser", class: "SqlParserImpl", - # List of import statements. + # List of additional classes and packages to import. + # Example: "org.apache.calcite.sql.*", "java.util.List". imports: [ - "com.google.common.collect.*" "org.apache.pinot.sql.parsers.parser.*" - "java.util.*" ] - # List of new keywords to add + # List of new keywords. Example: "DATABASES", "TABLES". If the keyword is + # not a reserved keyword, add it to the 'nonReservedKeywords' section. 
keywords: [ "FILE" "ARCHIVE" ] - # List of non-reserved keywords to add + # List of non-reserved keywords to add; + # items in this list become non-reserved nonReservedKeywordsToAdd: [ - # customized for Pinot "FILE" "ARCHIVE" + # Pinot allows using DEFAULT as the catalog name + "DEFAULT_" + # Pinot allows using DATETIME as column name + "DATETIME" # The following keywords are reserved in core Calcite, # are reserved in some version of SQL, @@ -134,6 +140,7 @@ data: { "CONSTRAINTS" "CONSTRUCTOR" "CONTAINS" + "CONTAINS_SUBSTR" "CONTINUE" "CONVERT" "CORR" @@ -161,12 +168,13 @@ data: { "CYCLE" "DATA" # "DATE" + "DATETIME_DIFF" "DAY" "DEALLOCATE" "DEC" "DECIMAL" "DECLARE" - "DEFAULT_" +# "DEFAULT" "DEFERRABLE" "DEFERRED" # "DEFINE" @@ -241,7 +249,6 @@ data: { "HOLD" "HOUR" "IDENTITY" -# "IF" # not a keyword in Calcite "ILIKE" "IMMEDIATE" "IMMEDIATELY" @@ -468,7 +475,11 @@ data: { "TEMPORARY" # "THEN" # "TIME" + "TIME_DIFF" + "TIME_TRUNC" # "TIMESTAMP" + "TIMESTAMP_DIFF" + "TIMESTAMP_TRUNC" "TIMEZONE_HOUR" "TIMEZONE_MINUTE" "TINYINT" @@ -525,21 +536,99 @@ data: { "ZONE" ] - # List of extended statement syntax to add + # List of non-reserved keywords to remove; + # items in this list become reserved. + nonReservedKeywordsToRemove: [ + ] + + # List of additional join types. Each is a method with no arguments. + # Example: "LeftSemiJoin". + joinTypes: [ + ] + + # List of methods for parsing custom SQL statements. + # Return type of method implementation should be 'SqlNode'. + # Example: "SqlShowDatabases()", "SqlShowTables()". statementParserMethods: [ "SqlInsertFromFile()" "SqlPhysicalExplain()" ] - # List of custom function syntax to add + # List of methods for parsing custom literals. + # Return type of method implementation should be "SqlNode". + # Example: ParseJsonLiteral(). + literalParserMethods: [ + ] + + # List of methods for parsing custom data types. + # Return type of method implementation should be "SqlTypeNameSpec". + # Example: SqlParseTimeStampZ(). + dataTypeParserMethods: [ + ] + + # List of methods for parsing builtin function calls. + # Return type of method implementation should be "SqlNode". + # Example: "DateTimeConstructorCall()". + builtinFunctionCallMethods: [ + ] + + # List of methods for parsing extensions to "ALTER " calls. + # Each must accept arguments "(SqlParserPos pos, String scope)". + # Example: "SqlAlterTable". + alterStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "CREATE [OR REPLACE]" calls. + # Each must accept arguments "(SqlParserPos pos, boolean replace)". + # Example: "SqlCreateForeignSchema". + createStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "DROP" calls. + # Each must accept arguments "(SqlParserPos pos)". + # Example: "SqlDropSchema". + dropStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "TRUNCATE" calls. + # Each must accept arguments "(SqlParserPos pos)". + # Example: "SqlTruncate". + truncateStatementParserMethods: [ + ] + + # Binary operators tokens. + # Example: "< INFIX_CAST: \"::\" >". + binaryOperatorsTokens: [ + ] + + # Binary operators initialization. + # Example: "InfixCast". extraBinaryExpressions: [ "SqlAtTimeZone" ] # List of files in @includes directory that have parser method + # implementations for parsing custom SQL statements, literals or types + # given as part of "statementParserMethods", "literalParserMethods" or + # "dataTypeParserMethods". + # Example: "parserImpls.ftl". 
implementationFiles: [ "parserImpls.ftl" - ], + ] + + # Custom identifier token. + # + # PostgreSQL allows letters with diacritical marks and non-Latin letters + # in the beginning of identifier and additionally dollar sign in the rest of identifier. + # Letters with diacritical marks and non-Latin letters + # are represented by character codes 128 to 255 (or in octal \200 to \377). + # See https://learn.microsoft.com/en-gb/office/vba/language/reference/user-interface-help/character-set-128255 + # See https://github.com/postgres/postgres/blob/master/src/backend/parser/scan.l + # + # MySQL allows digit in the beginning of identifier + customIdentifierToken: "< IDENTIFIER: (||[\"\\200\"-\"\\377\"]) (|||[\"\\200\"-\"\\377\"])* >" + + includeParsingStringLiteralAsArrayLiteral: true } } diff --git a/pinot-common/src/main/codegen/default_config.fmpp b/pinot-common/src/main/codegen/default_config.fmpp new file mode 100644 index 00000000000..78191c0c11c --- /dev/null +++ b/pinot-common/src/main/codegen/default_config.fmpp @@ -0,0 +1,465 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Copied from Calcite 1.36.0 and modified for Pinot syntax. Update this file when upgrading Calcite version. + +# Default data declarations for parsers. +# Each of these may be overridden in a parser's config.fmpp file. +# In addition, each parser must define "package" and "class". +parser: { + # List of additional classes and packages to import. + # Example: "org.apache.calcite.sql.*", "java.util.List". + imports: [ + ] + + # List of new keywords. Example: "DATABASES", "TABLES". If the keyword is + # not a reserved keyword, add it to the 'nonReservedKeywords' section. + keywords: [ + ] + + # List of keywords from "keywords" section that are not reserved. 
+ nonReservedKeywords: [ + "A" + "ABSENT" + "ABSOLUTE" + "ACTION" + "ADA" + "ADD" + "ADMIN" + "AFTER" + "ALWAYS" + "APPLY" + "ARRAY_AGG" + "ARRAY_CONCAT_AGG" + "ASC" + "ASSERTION" + "ASSIGNMENT" + "ATTRIBUTE" + "ATTRIBUTES" + "BEFORE" + "BERNOULLI" + "BREADTH" + "C" + "CASCADE" + "CATALOG" + "CATALOG_NAME" + "CENTURY" + "CHAIN" + "CHARACTERISTICS" + "CHARACTERS" + "CHARACTER_SET_CATALOG" + "CHARACTER_SET_NAME" + "CHARACTER_SET_SCHEMA" + "CLASS_ORIGIN" + "COBOL" + "COLLATION" + "COLLATION_CATALOG" + "COLLATION_NAME" + "COLLATION_SCHEMA" + "COLUMN_NAME" + "COMMAND_FUNCTION" + "COMMAND_FUNCTION_CODE" + "COMMITTED" + "CONDITIONAL" + "CONDITION_NUMBER" + "CONNECTION" + "CONNECTION_NAME" + "CONSTRAINT_CATALOG" + "CONSTRAINT_NAME" + "CONSTRAINTS" + "CONSTRAINT_SCHEMA" + "CONSTRUCTOR" + "CONTAINS_SUBSTR" + "CONTINUE" + "CURSOR_NAME" + "DATA" + "DATABASE" + "DATE_DIFF" + "DATE_TRUNC" + "DATETIME_DIFF" + "DATETIME_INTERVAL_CODE" + "DATETIME_INTERVAL_PRECISION" + "DATETIME_TRUNC" + "DAYOFWEEK" + "DAYOFYEAR" + "DAYS" + "DECADE" + "DEFAULTS" + "DEFERRABLE" + "DEFERRED" + "DEFINED" + "DEFINER" + "DEGREE" + "DEPTH" + "DERIVED" + "DESC" + "DESCRIPTION" + "DESCRIPTOR" + "DIAGNOSTICS" + "DISPATCH" + "DOMAIN" + "DOW" + "DOY" + "DOT_FORMAT" + "DYNAMIC_FUNCTION" + "DYNAMIC_FUNCTION_CODE" + "ENCODING" + "EPOCH" + "ERROR" + "EXCEPTION" + "EXCLUDE" + "EXCLUDING" + "FINAL" + "FIRST" + "FOLLOWING" + "FORMAT" + "FORTRAN" + "FOUND" + "FRAC_SECOND" + "G" + "GENERAL" + "GENERATED" + "GEOMETRY" + "GO" + "GOTO" + "GRANTED" + "GROUP_CONCAT" + "HIERARCHY" + "HOP" + "HOURS" + "IGNORE" + "ILIKE" + "IMMEDIATE" + "IMMEDIATELY" + "IMPLEMENTATION" + "INCLUDE" + "INCLUDING" + "INCREMENT" + "INITIALLY" + "INPUT" + "INSTANCE" + "INSTANTIABLE" + "INVOKER" + "ISODOW" + "ISOLATION" + "ISOYEAR" + "JAVA" + "JSON" + "K" + "KEY" + "KEY_MEMBER" + "KEY_TYPE" + "LABEL" + "LAST" + "LENGTH" + "LEVEL" + "LIBRARY" + "LOCATOR" + "M" + "MAP" + "MATCHED" + "MAXVALUE" + "MESSAGE_LENGTH" + "MESSAGE_OCTET_LENGTH" + "MESSAGE_TEXT" + "MICROSECOND" + "MILLENNIUM" + "MILLISECOND" + "MINUTES" + "MINVALUE" + "MONTHS" + "MORE_" + "MUMPS" + "NAME" + "NAMES" + "NANOSECOND" + "NESTING" + "NORMALIZED" + "NULLABLE" + "NULLS" + "NUMBER" + "OBJECT" + "OCTETS" + "OPTION" + "OPTIONS" + "ORDERING" + "ORDINALITY" + "OTHERS" + "OUTPUT" + "OVERRIDING" + "PAD" + "PARAMETER_MODE" + "PARAMETER_NAME" + "PARAMETER_ORDINAL_POSITION" + "PARAMETER_SPECIFIC_CATALOG" + "PARAMETER_SPECIFIC_NAME" + "PARAMETER_SPECIFIC_SCHEMA" + "PARTIAL" + "PASCAL" + "PASSING" + "PASSTHROUGH" + "PAST" + "PATH" + "PIVOT" + "PLACING" + "PLAN" + "PLI" + "PRECEDING" + "PRESERVE" + "PRIOR" + "PRIVILEGES" + "PUBLIC" + "QUARTER" + "QUARTERS" + "READ" + "RELATIVE" + "REPEATABLE" + "REPLACE" + "RESPECT" + "RESTART" + "RESTRICT" + "RETURNED_CARDINALITY" + "RETURNED_LENGTH" + "RETURNED_OCTET_LENGTH" + "RETURNED_SQLSTATE" + "RETURNING" + "RLIKE" + "ROLE" + "ROUTINE" + "ROUTINE_CATALOG" + "ROUTINE_NAME" + "ROUTINE_SCHEMA" + "ROW_COUNT" + "SCALAR" + "SCALE" + "SCHEMA" + "SCHEMA_NAME" + "SCOPE_CATALOGS" + "SCOPE_NAME" + "SCOPE_SCHEMA" + "SECONDS" + "SECTION" + "SECURITY" + "SELF" + "SEPARATOR" + "SEQUENCE" + "SERIALIZABLE" + "SERVER" + "SERVER_NAME" + "SESSION" + "SETS" + "SIMPLE" + "SIZE" + "SOURCE" + "SPACE" + "SPECIFIC_NAME" + "SQL_BIGINT" + "SQL_BINARY" + "SQL_BIT" + "SQL_BLOB" + "SQL_BOOLEAN" + "SQL_CHAR" + "SQL_CLOB" + "SQL_DATE" + "SQL_DECIMAL" + "SQL_DOUBLE" + "SQL_FLOAT" + "SQL_INTEGER" + "SQL_INTERVAL_DAY" + "SQL_INTERVAL_DAY_TO_HOUR" + "SQL_INTERVAL_DAY_TO_MINUTE" + "SQL_INTERVAL_DAY_TO_SECOND" + 
"SQL_INTERVAL_HOUR" + "SQL_INTERVAL_HOUR_TO_MINUTE" + "SQL_INTERVAL_HOUR_TO_SECOND" + "SQL_INTERVAL_MINUTE" + "SQL_INTERVAL_MINUTE_TO_SECOND" + "SQL_INTERVAL_MONTH" + "SQL_INTERVAL_SECOND" + "SQL_INTERVAL_YEAR" + "SQL_INTERVAL_YEAR_TO_MONTH" + "SQL_LONGVARBINARY" + "SQL_LONGVARCHAR" + "SQL_LONGVARNCHAR" + "SQL_NCHAR" + "SQL_NCLOB" + "SQL_NUMERIC" + "SQL_NVARCHAR" + "SQL_REAL" + "SQL_SMALLINT" + "SQL_TIME" + "SQL_TIMESTAMP" + "SQL_TINYINT" + "SQL_TSI_DAY" + "SQL_TSI_FRAC_SECOND" + "SQL_TSI_HOUR" + "SQL_TSI_MICROSECOND" + "SQL_TSI_MINUTE" + "SQL_TSI_MONTH" + "SQL_TSI_QUARTER" + "SQL_TSI_SECOND" + "SQL_TSI_WEEK" + "SQL_TSI_YEAR" + "SQL_VARBINARY" + "SQL_VARCHAR" + "STATE" + "STATEMENT" + "STRING_AGG" + "STRUCTURE" + "STYLE" + "SUBCLASS_ORIGIN" + "SUBSTITUTE" + "TABLE_NAME" + "TEMPORARY" + "TIES" + "TIME_DIFF" + "TIME_TRUNC" + "TIMESTAMPADD" + "TIMESTAMPDIFF" + "TIMESTAMP_DIFF" + "TIMESTAMP_TRUNC" + "TOP_LEVEL_COUNT" + "TRANSACTION" + "TRANSACTIONS_ACTIVE" + "TRANSACTIONS_COMMITTED" + "TRANSACTIONS_ROLLED_BACK" + "TRANSFORM" + "TRANSFORMS" + "TRIGGER_CATALOG" + "TRIGGER_NAME" + "TRIGGER_SCHEMA" + "TUMBLE" + "TYPE" + "UNBOUNDED" + "UNCOMMITTED" + "UNCONDITIONAL" + "UNDER" + "UNPIVOT" + "UNNAMED" + "USAGE" + "USER_DEFINED_TYPE_CATALOG" + "USER_DEFINED_TYPE_CODE" + "USER_DEFINED_TYPE_NAME" + "USER_DEFINED_TYPE_SCHEMA" + "UTF16" + "UTF32" + "UTF8" + "VERSION" + "VIEW" + "WEEK" + "WEEKS" + "WORK" + "WRAPPER" + "WRITE" + "XML" + "YEARS" + "ZONE" + ] + + # List of non-reserved keywords to add; + # items in this list become non-reserved. + nonReservedKeywordsToAdd: [ + ] + + # List of non-reserved keywords to remove; + # items in this list become reserved. + nonReservedKeywordsToRemove: [ + ] + + # List of additional join types. Each is a method with no arguments. + # Example: "LeftSemiJoin". + joinTypes: [ + ] + + # List of methods for parsing custom SQL statements. + # Return type of method implementation should be 'SqlNode'. + # Example: "SqlShowDatabases()", "SqlShowTables()". + statementParserMethods: [ + ] + + # List of methods for parsing custom literals. + # Return type of method implementation should be "SqlNode". + # Example: ParseJsonLiteral(). + literalParserMethods: [ + ] + + # List of methods for parsing custom data types. + # Return type of method implementation should be "SqlTypeNameSpec". + # Example: SqlParseTimeStampZ(). + dataTypeParserMethods: [ + ] + + # List of methods for parsing builtin function calls. + # Return type of method implementation should be "SqlNode". + # Example: "DateTimeConstructorCall()". + builtinFunctionCallMethods: [ + ] + + # List of methods for parsing extensions to "ALTER " calls. + # Each must accept arguments "(SqlParserPos pos, String scope)". + # Example: "SqlAlterTable". + alterStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "CREATE [OR REPLACE]" calls. + # Each must accept arguments "(SqlParserPos pos, boolean replace)". + # Example: "SqlCreateForeignSchema". + createStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "DROP" calls. + # Each must accept arguments "(SqlParserPos pos)". + # Example: "SqlDropSchema". + dropStatementParserMethods: [ + ] + + # List of methods for parsing extensions to "TRUNCATE" calls. + # Each must accept arguments "(SqlParserPos pos)". + # Example: "SqlTruncate". + truncateStatementParserMethods: [ + ] + + # Binary operators tokens. + # Example: "< INFIX_CAST: \"::\" >". + binaryOperatorsTokens: [ + ] + + # Binary operators initialization. + # Example: "InfixCast". 
+ extraBinaryExpressions: [ + ] + + # List of files in @includes directory that have parser method + # implementations for parsing custom SQL statements, literals or types + # given as part of "statementParserMethods", "literalParserMethods" or + # "dataTypeParserMethods". + # Example: "parserImpls.ftl". + implementationFiles: [ + ] + + # Custom identifier token. + # Example: "< IDENTIFIER: (|)+ >". + customIdentifierToken: "" + + includePosixOperators: false + includeCompoundIdentifier: true + includeBraces: true + includeAdditionalDeclarations: false + includeParsingStringLiteralAsArrayLiteral: false +} diff --git a/pinot-common/src/main/codegen/includes/parserImpls.ftl b/pinot-common/src/main/codegen/includes/parserImpls.ftl index 989894dd5db..6e1283b075b 100644 --- a/pinot-common/src/main/codegen/includes/parserImpls.ftl +++ b/pinot-common/src/main/codegen/includes/parserImpls.ftl @@ -33,7 +33,7 @@ private void DataFileDef(List list) : SqlNodeList DataFileDefList() : { SqlParserPos pos; - List list = Lists.newArrayList(); + List list = new ArrayList(); } { { pos = getPos(); } @@ -73,36 +73,6 @@ SqlInsertFromFile SqlInsertFromFile() : } } -/** - * define the rest of the sql into SqlStmtList - */ -private void SqlStatementList(SqlNodeList list) : -{ -} -{ - { - list.add(SqlStmt()); - } -} - -SqlNodeList SqlStmtsEof() : -{ - SqlParserPos pos; - SqlNodeList stmts; -} -{ - { - pos = getPos(); - stmts = new SqlNodeList(pos); - stmts.add(SqlStmt()); - } - ( LOOKAHEAD(2, SqlStmt()) SqlStatementList(stmts) )* - [ ] - { - return stmts; - } -} - void SqlAtTimeZone(List list, ExprContext exprContext, Span s) : { List list2; diff --git a/pinot-common/src/main/codegen/templates/Parser.jj b/pinot-common/src/main/codegen/templates/Parser.jj new file mode 100644 index 00000000000..1e86c8c3943 --- /dev/null +++ b/pinot-common/src/main/codegen/templates/Parser.jj @@ -0,0 +1,9131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Copied from Calcite 1.36.0 and modified for Pinot syntax. Update this file when upgrading Calcite version. +// Modified parts are marked with "PINOT CUSTOMIZATION START/END". 
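+// Note on the generation flow (an assumption inferred from the surrounding codegen layout, not stated in this
+// patch): FMPP expands this template with the data in config.fmpp, which layers on top of default_config.fmpp,
+// emitting javacc/Parser.jj; JavaCC then compiles that grammar into the parser class named by
+// parser.package and parser.class.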
+ +<@pp.dropOutputFile /> + +<@pp.changeOutputFile name="javacc/Parser.jj" /> + +options { + STATIC = false; + IGNORE_CASE = true; + UNICODE_INPUT = true; +} + + +PARSER_BEGIN(${parser.class}) + +package ${parser.package}; + +<#list (parser.imports!default.parser.imports) as importStr> +import ${importStr}; + + +import org.apache.calcite.avatica.util.Casing; +import org.apache.calcite.avatica.util.TimeUnit; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.runtime.CalciteContextException; +import org.apache.calcite.sql.JoinConditionType; +import org.apache.calcite.sql.JoinType; +import org.apache.calcite.sql.SqlAlter; +import org.apache.calcite.sql.SqlBasicTypeNameSpec; +import org.apache.calcite.sql.SqlBinaryOperator; +import org.apache.calcite.sql.SqlCall; +import org.apache.calcite.sql.SqlCharStringLiteral; +import org.apache.calcite.sql.SqlCollation; +import org.apache.calcite.sql.SqlCollectionTypeNameSpec; +import org.apache.calcite.sql.SqlDataTypeSpec; +import org.apache.calcite.sql.SqlDelete; +import org.apache.calcite.sql.SqlDescribeSchema; +import org.apache.calcite.sql.SqlDescribeTable; +import org.apache.calcite.sql.SqlDynamicParam; +import org.apache.calcite.sql.SqlExplain; +import org.apache.calcite.sql.SqlExplainFormat; +import org.apache.calcite.sql.SqlExplainLevel; +import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; +import org.apache.calcite.sql.SqlHint; +import org.apache.calcite.sql.SqlIdentifier; +import org.apache.calcite.sql.SqlInsert; +import org.apache.calcite.sql.SqlInsertKeyword; +import org.apache.calcite.sql.SqlIntervalQualifier; +import org.apache.calcite.sql.SqlJdbcDataTypeName; +import org.apache.calcite.sql.SqlJdbcFunctionCall; +import org.apache.calcite.sql.SqlJoin; +import org.apache.calcite.sql.SqlJsonConstructorNullClause; +import org.apache.calcite.sql.SqlJsonEncoding; +import org.apache.calcite.sql.SqlJsonExistsErrorBehavior; +import org.apache.calcite.sql.SqlJsonEmptyOrError; +import org.apache.calcite.sql.SqlJsonQueryEmptyOrErrorBehavior; +import org.apache.calcite.sql.SqlJsonQueryWrapperBehavior; +import org.apache.calcite.sql.SqlJsonValueEmptyOrErrorBehavior; +import org.apache.calcite.sql.SqlJsonValueReturning; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.SqlLiteral; +import org.apache.calcite.sql.SqlMatchRecognize; +import org.apache.calcite.sql.SqlMerge; +import org.apache.calcite.sql.SqlMapTypeNameSpec; +import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.SqlNodeList; +import org.apache.calcite.sql.SqlNumericLiteral; +import org.apache.calcite.sql.SqlOperator; +import org.apache.calcite.sql.SqlOrderBy; +import org.apache.calcite.sql.SqlPivot; +import org.apache.calcite.sql.SqlPostfixOperator; +import org.apache.calcite.sql.SqlPrefixOperator; +import org.apache.calcite.sql.SqlRowTypeNameSpec; +import org.apache.calcite.sql.SqlSampleSpec; +import org.apache.calcite.sql.SqlSelect; +import org.apache.calcite.sql.SqlSelectKeyword; +import org.apache.calcite.sql.SqlSetOption; +import org.apache.calcite.sql.SqlSnapshot; +import org.apache.calcite.sql.SqlTableRef; +import org.apache.calcite.sql.SqlTypeNameSpec; +import org.apache.calcite.sql.SqlUnnestOperator; +import org.apache.calcite.sql.SqlUnpivot; +import org.apache.calcite.sql.SqlUpdate; +import org.apache.calcite.sql.SqlUserDefinedTypeNameSpec; +import org.apache.calcite.sql.SqlUtil; +import org.apache.calcite.sql.SqlWindow; +import org.apache.calcite.sql.SqlWith; +import 
org.apache.calcite.sql.SqlWithItem; +import org.apache.calcite.sql.fun.SqlCase; +import org.apache.calcite.sql.fun.SqlInternalOperators; +import org.apache.calcite.sql.fun.SqlLibraryOperators; +import org.apache.calcite.sql.fun.SqlStdOperatorTable; +import org.apache.calcite.sql.fun.SqlTrimFunction; +import org.apache.calcite.sql.parser.Span; +import org.apache.calcite.sql.parser.SqlAbstractParserImpl; +import org.apache.calcite.sql.parser.SqlParseException; +import org.apache.calcite.sql.parser.SqlParser; +import org.apache.calcite.sql.parser.SqlParserImplFactory; +import org.apache.calcite.sql.parser.SqlParserPos; +import org.apache.calcite.sql.parser.SqlParserUtil; +import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.sql.validate.SqlConformance; +import org.apache.calcite.sql.validate.SqlConformanceEnum; +import org.apache.calcite.util.Glossary; +import org.apache.calcite.util.Pair; +import org.apache.calcite.util.SourceStringReader; +import org.apache.calcite.util.Util; +import org.apache.calcite.util.trace.CalciteTrace; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import org.slf4j.Logger; + +import java.io.Reader; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import static org.apache.calcite.util.Static.RESOURCE; + +/** + * SQL parser, generated from Parser.jj by JavaCC. + * + *
The public wrapper for this parser is {@link SqlParser}. + */ +public class ${parser.class} extends SqlAbstractParserImpl +{ + private static final Logger LOGGER = CalciteTrace.getParserTracer(); + + // Can't use quoted literal because of a bug in how JavaCC translates + // backslash-backslash. + private static final char BACKSLASH = 0x5c; + private static final char DOUBLE_QUOTE = 0x22; + private static final String DQ = DOUBLE_QUOTE + ""; + private static final String DQDQ = DQ + DQ; + private static final SqlLiteral LITERAL_ZERO = + SqlLiteral.createExactNumeric("0", SqlParserPos.ZERO); + private static final SqlLiteral LITERAL_ONE = + SqlLiteral.createExactNumeric("1", SqlParserPos.ZERO); + private static final SqlLiteral LITERAL_MINUS_ONE = + SqlLiteral.createExactNumeric("-1", SqlParserPos.ZERO); + private static final BigDecimal ONE_HUNDRED = BigDecimal.valueOf(100L); + + private static Metadata metadata; + + private Casing unquotedCasing; + private Casing quotedCasing; + private int identifierMaxLength; + private SqlConformance conformance; + + /** + * {@link SqlParserImplFactory} implementation for creating parser. + */ + public static final SqlParserImplFactory FACTORY = new SqlParserImplFactory() { + public SqlAbstractParserImpl getParser(Reader reader) { + final ${parser.class} parser = new ${parser.class}(reader); + if (reader instanceof SourceStringReader) { + final String sql = + ((SourceStringReader) reader).getSourceString(); + parser.setOriginalSql(sql); + } + return parser; + } + }; + + public SqlParseException normalizeException(Throwable ex) { + try { + if (ex instanceof ParseException) { + ex = cleanupParseException((ParseException) ex); + } + return convertException(ex); + } catch (ParseException e) { + throw new AssertionError(e); + } + } + + public Metadata getMetadata() { + synchronized (${parser.class}.class) { + if (metadata == null) { + metadata = new MetadataImpl( + new ${parser.class}(new java.io.StringReader(""))); + } + return metadata; + } + } + + public void setTabSize(int tabSize) { + jj_input_stream.setTabSize(tabSize); + } + + public void switchTo(SqlAbstractParserImpl.LexicalState state) { + final int stateOrdinal = + Arrays.asList(${parser.class}TokenManager.lexStateNames) + .indexOf(state.name()); + token_source.SwitchTo(stateOrdinal); + } + + public void setQuotedCasing(Casing quotedCasing) { + this.quotedCasing = quotedCasing; + } + + public void setUnquotedCasing(Casing unquotedCasing) { + this.unquotedCasing = unquotedCasing; + } + + public void setIdentifierMaxLength(int identifierMaxLength) { + this.identifierMaxLength = identifierMaxLength; + } + + public void setConformance(SqlConformance conformance) { + this.conformance = conformance; + } + + public SqlNode parseSqlExpressionEof() throws Exception { + return SqlExpressionEof(); + } + + public SqlNode parseSqlStmtEof() throws Exception { + return SqlStmtEof(); + } + + public SqlNodeList parseSqlStmtList() throws Exception { + return SqlStmtList(); + } + + public SqlNode parseArray() throws SqlParseException { + switchTo(LexicalState.BQID); + try { + return ArrayLiteral(); + } catch (ParseException ex) { + throw normalizeException(ex); + } catch (TokenMgrError ex) { + throw normalizeException(ex); + } + } + + private SqlNode extend(SqlNode table, SqlNodeList extendList) { + return SqlStdOperatorTable.EXTEND.createCall( + Span.of(table, extendList).pos(), table, extendList); + } + + /** Adds a warning that a token such as "HOURS" was used, + * whereas the SQL standard only allows "HOUR". 
+ * + *
Currently, we silently add an exception to a list of warnings. In + * future, we may have better compliance checking, for example a strict + * compliance mode that throws if any non-standard features are used. */ + private TimeUnit warn(TimeUnit timeUnit) throws ParseException { + final String token = getToken(0).image.toUpperCase(Locale.ROOT); + warnings.add( + SqlUtil.newContextException(getPos(), + RESOURCE.nonStandardFeatureUsed(token))); + return timeUnit; + } +} + +PARSER_END(${parser.class}) + + +/*************************************** + * Utility Codes for Semantic Analysis * + ***************************************/ + +/* For Debug */ +JAVACODE +void debug_message1() { + LOGGER.info("{} , {}", getToken(0).image, getToken(1).image); +} + +JAVACODE String unquotedIdentifier() { + return SqlParserUtil.toCase(getToken(0).image, unquotedCasing); +} + +/** + * Allows parser to be extended with new types of table references. The + * default implementation of this production is empty. + */ +SqlNode ExtendedTableRef() : +{ +} +{ + UnusedExtension() + { + return null; + } +} + +/** + * Allows an OVER clause following a table expression as an extension to + * standard SQL syntax. The default implementation of this production is empty. + */ +SqlNode TableOverOpt() : +{ +} +{ + { + return null; + } +} + +/* + * Parses dialect-specific keywords immediately following the SELECT keyword. + */ +void SqlSelectKeywords(List keywords) : +{} +{ + E() +} + +/* + * Parses dialect-specific keywords immediately following the INSERT keyword. + */ +void SqlInsertKeywords(List keywords) : +{} +{ + E() +} + +/* +* Parse Floor/Ceil function parameters +*/ +SqlNode FloorCeilOptions(Span s, boolean floorFlag) : +{ + SqlNode node; +} +{ + node = StandardFloorCeilOptions(s, floorFlag) { + return node; + } +} + +/* +// This file contains the heart of a parser for SQL SELECT statements. +// code can be shared between various parsers (for example, a DDL parser and a +// DML parser) but is not a standalone JavaCC file. You need to prepend a +// parser declaration (such as that in Parser.jj). +*/ + +/* Epsilon */ +JAVACODE +void E() {} + +/** @Deprecated */ +JAVACODE List startList(Object o) +{ + List list = new ArrayList(); + list.add(o); + return list; +} + +/* + * NOTE jvs 6-Feb-2004: The straightforward way to implement the SQL grammar is + * to keep query expressions (SELECT, UNION, etc) separate from row expressions + * (+, LIKE, etc). However, this is not possible with an LL(k) parser, because + * both kinds of expressions allow parenthesization, so no fixed amount of left + * context is ever good enough. A sub-query can be a leaf in a row expression, + * and can include operators like UNION, so it's not even possible to use a + * syntactic lookahead rule like "look past an indefinite number of parentheses + * until you see SELECT, VALUES, or TABLE" (since at that point we still + * don't know whether we're parsing a sub-query like ((select ...) + x) + * vs. (select ... union select ...). + * + * The somewhat messy solution is to unify the two kinds of expression, + * and to enforce syntax rules using parameterized context. This + * is the purpose of the ExprContext parameter. It is passed to + * most expression productions, which check the expressions encountered + * against the context for correctness. When a query + * element like SELECT is encountered, the production calls + * checkQueryExpression, which will throw an exception if + * a row expression was expected instead. 
When a row expression like + * IN is encountered, the production calls checkNonQueryExpression + * instead. It is very important to understand how this works + * when modifying the grammar. + * + * The commingling of expressions results in some bogus ambiguities which are + * resolved with LOOKAHEAD hints. The worst example is comma. SQL allows both + * (WHERE x IN (1,2)) and (WHERE x IN (select ...)). This means when we parse + * the right-hand-side of an IN, we have to allow any kind of expression inside + * the parentheses. Now consider the expression "WHERE x IN(SELECT a FROM b + * GROUP BY c,d)". When the parser gets to "c,d" it doesn't know whether the + * comma indicates the end of the GROUP BY or the end of one item in an IN + * list. Luckily, we know that select and comma-list are mutually exclusive + * within IN, so we use maximal munch for the GROUP BY comma. However, this + * usage of hints could easily mask unintended ambiguities resulting from + * future changes to the grammar, making it very brittle. + */ + +JAVACODE protected SqlParserPos getPos() +{ + return new SqlParserPos( + token.beginLine, + token.beginColumn, + token.endLine, + token.endColumn); +} + +/** Starts a span at the current position. */ +JAVACODE Span span() +{ + return Span.of(getPos()); +} + +JAVACODE void checkQueryExpression(ExprContext exprContext) +{ + switch (exprContext) { + case ACCEPT_NON_QUERY: + case ACCEPT_SUB_QUERY: + case ACCEPT_CURSOR: + throw SqlUtil.newContextException(getPos(), + RESOURCE.illegalQueryExpression()); + } +} + +JAVACODE void checkNonQueryExpression(ExprContext exprContext) +{ + switch (exprContext) { + case ACCEPT_QUERY: + throw SqlUtil.newContextException(getPos(), + RESOURCE.illegalNonQueryExpression()); + } +} + +JAVACODE SqlNode checkNotJoin(SqlNode e) +{ + if (e instanceof SqlJoin) { + throw SqlUtil.newContextException(e.getParserPosition(), + RESOURCE.illegalJoinExpression()); + } + return e; +} + +/** + * Converts a ParseException (local to this particular instantiation + * of the parser) into a SqlParseException (common to all parsers). + */ +JAVACODE SqlParseException convertException(Throwable ex) +{ + if (ex instanceof SqlParseException) { + return (SqlParseException) ex; + } + SqlParserPos pos = null; + int[][] expectedTokenSequences = null; + String[] tokenImage = null; + if (ex instanceof ParseException) { + ParseException pex = (ParseException) ex; + expectedTokenSequences = pex.expectedTokenSequences; + tokenImage = pex.tokenImage; + if (pex.currentToken != null) { + final Token token = pex.currentToken.next; + // Checks token.image.equals("1") to avoid recursive call. + // The SqlAbstractParserImpl#MetadataImpl constructor uses constant "1" to + // throw intentionally to collect the expected tokens. + if (!token.image.equals("1") + && getMetadata().isKeyword(token.image) + && SqlParserUtil.allowsIdentifier(tokenImage, expectedTokenSequences)) { + // If the next token is a keyword, reformat the error message as: + + // Incorrect syntax near the keyword '{keyword}' at line {line_number}, + // column {column_number}. + final String expecting = ex.getMessage() + .substring(ex.getMessage().indexOf("Was expecting")); + final String errorMsg = String.format("Incorrect syntax near the keyword '%s' " + + "at line %d, column %d.\n%s", + token.image, + token.beginLine, + token.beginColumn, + expecting); + // Replace the ParseException with explicit error message. 
+ ex = new ParseException(errorMsg); + } + pos = new SqlParserPos( + token.beginLine, + token.beginColumn, + token.endLine, + token.endColumn); + } + } else if (ex instanceof TokenMgrError) { + expectedTokenSequences = null; + tokenImage = null; + // Example: + // Lexical error at line 3, column 24. Encountered "#" after "a". + final java.util.regex.Pattern pattern = java.util.regex.Pattern.compile( + "(?s)Lexical error at line ([0-9]+), column ([0-9]+).*"); + java.util.regex.Matcher matcher = pattern.matcher(ex.getMessage()); + if (matcher.matches()) { + int line = Integer.parseInt(matcher.group(1)); + int column = Integer.parseInt(matcher.group(2)); + pos = new SqlParserPos(line, column, line, column); + } + } else if (ex instanceof CalciteContextException) { + // CalciteContextException is the standard wrapper for exceptions + // produced by the validator, but in the parser, the standard is + // SqlParseException; so, strip it away. In case you were wondering, + // the CalciteContextException appears because the parser + // occasionally calls into validator-style code such as + // SqlSpecialOperator.reduceExpr. + CalciteContextException ece = + (CalciteContextException) ex; + pos = new SqlParserPos( + ece.getPosLine(), + ece.getPosColumn(), + ece.getEndPosLine(), + ece.getEndPosColumn()); + ex = ece.getCause(); + } + + return new SqlParseException( + ex.getMessage(), pos, expectedTokenSequences, tokenImage, ex); +} + +/** + * Removes or transforms misleading information from a parse exception. + * + * @param e dirty excn + * + * @return clean excn + */ +JAVACODE ParseException cleanupParseException(ParseException ex) +{ + if (ex.expectedTokenSequences == null) { + return ex; + } + int iIdentifier = Arrays.asList(ex.tokenImage).indexOf(""); + + // Find all sequences in the error which contain identifier. For + // example, + // {} + // {A} + // {B, C} + // {D, } + // {D, A} + // {D, B} + // + // would yield + // {} + // {D} + final List prefixList = new ArrayList(); + for (int i = 0; i < ex.expectedTokenSequences.length; ++i) { + int[] seq = ex.expectedTokenSequences[i]; + int j = seq.length - 1; + int i1 = seq[j]; + if (i1 == iIdentifier) { + int[] prefix = new int[j]; + System.arraycopy(seq, 0, prefix, 0, j); + prefixList.add(prefix); + } + } + + if (prefixList.isEmpty()) { + return ex; + } + + int[][] prefixes = (int[][]) + prefixList.toArray(new int[prefixList.size()][]); + + // Since was one of the possible productions, + // we know that the parser will also have included all + // of the non-reserved keywords (which are treated as + // identifiers in non-keyword contexts). So, now we need + // to clean those out, since they're totally irrelevant. 
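+    // Illustration (not from the original source): if a prefix already ends
+    // with <IDENTIFIER>, then alternatives that end the same prefix with a
+    // non-reserved keyword (one that may also be used as an identifier) add
+    // no information for the user, so the loop below drops them; alternatives
+    // ending with reserved keywords are kept.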
+ + final List list = new ArrayList(); + Metadata metadata = getMetadata(); + for (int i = 0; i < ex.expectedTokenSequences.length; ++i) { + int [] seq = ex.expectedTokenSequences[i]; + String tokenImage = ex.tokenImage[seq[seq.length - 1]]; + String token = SqlParserUtil.getTokenVal(tokenImage); + if (token == null || !metadata.isNonReservedKeyword(token)) { + list.add(seq); + continue; + } + boolean match = matchesPrefix(seq, prefixes); + if (!match) { + list.add(seq); + } + } + + ex.expectedTokenSequences = + (int [][]) list.toArray(new int [list.size()][]); + return ex; +} + +JAVACODE boolean matchesPrefix(int[] seq, int[][] prefixes) +{ + nextPrefix: + for (int[] prefix : prefixes) { + if (seq.length == prefix.length + 1) { + for (int k = 0; k < prefix.length; k++) { + if (prefix[k] != seq[k]) { + continue nextPrefix; + } + } + return true; + } + } + return false; +} + +/***************************************** + * Syntactical Descriptions * + *****************************************/ + +SqlNode ExprOrJoinOrOrderedQuery(ExprContext exprContext) : +{ + SqlNode e; + final List list = new ArrayList(); +} +{ + // Lookhead to distinguish between "TABLE emp" (which will be + // matched by ExplicitTable() via Query()) + // and "TABLE fun(args)" (which will be matched by TableRef()) + ( + LOOKAHEAD(2) + e = Query(exprContext) + e = OrderByLimitOpt(e) + { return e; } + | + e = TableRef1(ExprContext.ACCEPT_QUERY_OR_JOIN) + ( e = JoinTable(e) )* + { list.add(e); } + ( AddSetOpQuery(list, exprContext) )* + { return SqlParserUtil.toTree(list); } + ) +} + +/** + * Parses either a row expression or a query expression with an optional + * ORDER BY. + * + *

+ * <p>Postgres syntax for limit:
+ *
+ * <blockquote><pre>
+ *    [ LIMIT { count | ALL } ]
+ *    [ OFFSET start ]
+ * </pre></blockquote>
+ *
+ * <p>Trino syntax for limit:
+ *
+ * <blockquote><pre>
+ *    [ OFFSET start ]
+ *    [ LIMIT { count | ALL } ]
+ * </pre></blockquote>
+ *
+ * <p>MySQL syntax for limit:
+ *
+ * <blockquote><pre>
+ *    [ LIMIT { count | start, count } ]
+ * </pre></blockquote>
+ *
+ * <p>SQL:2008 syntax for limit:
+ *
+ * <blockquote><pre>
+ *    [ OFFSET start { ROW | ROWS } ]
+ *    [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ONLY ]
+ * </pre></blockquote>
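+ *
+ * <p>For example (an illustrative query over a hypothetical {@code emp}
+ * table), the following all skip 10 rows and return the next 5 in the
+ * dialects listed above:
+ *
+ * <blockquote><pre>
+ *    SELECT * FROM emp ORDER BY sal LIMIT 5 OFFSET 10         -- Postgres
+ *    SELECT * FROM emp ORDER BY sal OFFSET 10 LIMIT 5         -- Trino
+ *    SELECT * FROM emp ORDER BY sal LIMIT 10, 5               -- MySQL
+ *    SELECT * FROM emp ORDER BY sal
+ *        OFFSET 10 ROWS FETCH NEXT 5 ROWS ONLY                -- SQL:2008
+ * </pre></blockquote>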
+ */ +SqlNode OrderedQueryOrExpr(ExprContext exprContext) : +{ + SqlNode e; +} +{ + e = QueryOrExpr(exprContext) + e = OrderByLimitOpt(e) + { return e; } +} + +/** Reads optional "ORDER BY", "LIMIT", "OFFSET", "FETCH" following a query, + * {@code e}. If any of them are present, adds them to the query; + * otherwise returns the query unchanged. + * Throws if they are present and {@code e} is not a query. */ +SqlNode OrderByLimitOpt(SqlNode e) : +{ + final SqlNodeList orderBy; + final Span s = Span.of(); + SqlNode[] offsetFetch = {null, null}; +} +{ + ( + // use the syntactic type of the expression we just parsed + // to decide whether ORDER BY makes sense + orderBy = OrderBy(e.isA(SqlKind.QUERY)) + | { orderBy = null; } + ) + [ + LimitClause(s, offsetFetch) + [ OffsetClause(s, offsetFetch) ] + | + OffsetClause(s, offsetFetch) + [ + LimitClause(s, offsetFetch) { + if (!this.conformance.isOffsetLimitAllowed()) { + throw SqlUtil.newContextException(s.end(this), + RESOURCE.offsetLimitNotAllowed()); + } + } + | + FetchClause(offsetFetch) + ] + | + FetchClause(offsetFetch) + ] + { + if (orderBy != null || offsetFetch[0] != null || offsetFetch[1] != null) { + return new SqlOrderBy(getPos(), e, + Util.first(orderBy, SqlNodeList.EMPTY), + offsetFetch[0], offsetFetch[1]); + } + return e; + } +} + +/** + * Parses an OFFSET clause in an ORDER BY expression. + */ +void OffsetClause(Span s, SqlNode[] offsetFetch) : +{ +} +{ + // ROW or ROWS is required in SQL:2008 but we make it optional + // because it is not present in Postgres-style syntax. + { s.add(this); } + offsetFetch[0] = UnsignedNumericLiteralOrParam() + [ | ] +} + +/** + * Parses a FETCH clause in an ORDER BY expression. + */ +void FetchClause(SqlNode[] offsetFetch) : +{ +} +{ + // SQL:2008-style syntax. "OFFSET ... FETCH ...". + // If you specify both LIMIT and FETCH, FETCH wins. + ( | ) offsetFetch[1] = UnsignedNumericLiteralOrParam() + ( | ) +} + +/** + * Parses a LIMIT clause in an ORDER BY expression. + */ +void LimitClause(Span s, SqlNode[] offsetFetch) : +{ + final String error; +} +{ + // Postgres-style syntax. "LIMIT ... OFFSET ..." + { s.add(this); } + ( + // MySQL-style syntax. "LIMIT start, count" or "LIMIT start, ALL" + LOOKAHEAD(2) + offsetFetch[0] = UnsignedNumericLiteralOrParam() + + ( + offsetFetch[1] = UnsignedNumericLiteralOrParam() { + error = "count"; + } + | + { + error = "ALL"; + } + ) { + if (!this.conformance.isLimitStartCountAllowed()) { + throw SqlUtil.newContextException(s.end(this), + RESOURCE.limitStartCountOrAllNotAllowed(error)); + } + } + | + offsetFetch[1] = UnsignedNumericLiteralOrParam() + | + + ) +} + +/** + * Parses a leaf in a query expression (SELECT, VALUES or TABLE). + */ +SqlNode LeafQuery(ExprContext exprContext) : +{ + SqlNode e; +} +{ + { + // ensure a query is legal in this context + checkQueryExpression(exprContext); + } + e = SqlSelect() { return e; } +| + e = TableConstructor() { return e; } +| + e = ExplicitTable(getPos()) { return e; } +} + +/** + * Parses a parenthesized query or single row expression. + * Depending on {@code exprContext}, may also accept a join. 
+ */ +SqlNode ParenthesizedExpression(ExprContext exprContext) : +{ + SqlNode e; +} +{ + + { + // we've now seen left paren, so queries inside should + // be allowed as sub-queries + switch (exprContext) { + case ACCEPT_SUB_QUERY: + exprContext = ExprContext.ACCEPT_NONCURSOR; + break; + case ACCEPT_CURSOR: + exprContext = ExprContext.ACCEPT_ALL; + break; + } + } + e = ExprOrJoinOrOrderedQuery(exprContext) + + { + exprContext.throwIfNotCompatible(e); + return e; + } +} + +/** + * Parses a parenthesized query or comma-list of row expressions. + * + *
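+ * <p>For example (identifiers are illustrative only), in a context that
+ * accepts sub-queries this production matches both {@code (1, 2, 3)} and
+ * {@code (SELECT deptno FROM dept)}; the {@code exprContext} argument decides
+ * which of those forms is legal at the call site.
+ *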

+ * <p>REVIEW jvs 8-Feb-2004: There's a small hole in this production. It can be
+ * used to construct something like
+ *
+ * <blockquote><pre>
+ * WHERE x IN (select count(*) from t where c=d,5)
+ * </pre></blockquote>
+ *
+ * <p>which should be illegal. The above is interpreted as equivalent to
+ *
+ * <blockquote><pre>
+ * WHERE x IN ((select count(*) from t where c=d),5)
+ * </pre></blockquote>
+ *
+ * <p>
which is a legal use of a sub-query. The only way to fix the hole is to + * be able to remember whether a subexpression was parenthesized or not, which + * means preserving parentheses in the SqlNode tree. This is probably + * desirable anyway for use in purely syntactic parsing applications (e.g. SQL + * pretty-printer). However, if this is done, it's important to also make + * isA() on the paren node call down to its operand so that we can + * always correctly discriminate a query from a row expression. + */ +SqlNodeList ParenthesizedQueryOrCommaList( + ExprContext exprContext) : +{ + SqlNode e; + final List list = new ArrayList(); + ExprContext firstExprContext = exprContext; + final Span s; +} +{ + + { + // we've now seen left paren, so a query by itself should + // be interpreted as a sub-query + s = span(); + switch (exprContext) { + case ACCEPT_SUB_QUERY: + firstExprContext = ExprContext.ACCEPT_NONCURSOR; + break; + case ACCEPT_CURSOR: + firstExprContext = ExprContext.ACCEPT_ALL; + break; + } + } + e = OrderedQueryOrExpr(firstExprContext) { list.add(e); } + ( + + { + // a comma-list can't appear where only a query is expected + checkNonQueryExpression(exprContext); + } + AddExpression(list, exprContext) + )* + + { + return new SqlNodeList(list, s.end(this)); + } +} + +/** As ParenthesizedQueryOrCommaList, but allows DEFAULT + * in place of any of the expressions. For example, + * {@code (x, DEFAULT, null, DEFAULT)}. */ +SqlNodeList ParenthesizedQueryOrCommaListWithDefault( + ExprContext exprContext) : +{ + SqlNode e; + final List list = new ArrayList(); + ExprContext firstExprContext = exprContext; + final Span s; +} +{ + + { + // we've now seen left paren, so a query by itself should + // be interpreted as a sub-query + s = span(); + switch (exprContext) { + case ACCEPT_SUB_QUERY: + firstExprContext = ExprContext.ACCEPT_NONCURSOR; + break; + case ACCEPT_CURSOR: + firstExprContext = ExprContext.ACCEPT_ALL; + break; + } + } + ( + e = OrderedQueryOrExpr(firstExprContext) { list.add(e); } + | + e = Default() { list.add(e); } + ) + ( + + { + // a comma-list can't appear where only a query is expected + checkNonQueryExpression(exprContext); + } + ( + e = Expression(exprContext) { list.add(e); } + | + e = Default() { list.add(e); } + ) + )* + + { + return new SqlNodeList(list, s.end(this)); + } +} + +/** + * Parses function parameter lists. + * If the list starts with DISTINCT or ALL, it is discarded. + */ +List UnquantifiedFunctionParameterList(ExprContext exprContext) : +{ + final List args; +} +{ + args = FunctionParameterList(exprContext) { + args.remove(0); // remove DISTINCT or ALL, if present + return args; + } +} + +/** + * Parses function parameter lists including DISTINCT keyword recognition, + * DEFAULT, and named argument assignment. 
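+ *
+ * <p>For example (the function name is hypothetical, and Calcite's usual
+ * {@code =>} named-argument syntax is assumed), this production accepts calls
+ * such as {@code ramp(10)}, {@code ramp(DISTINCT 10)} and
+ * {@code ramp(rows => 10, width => DEFAULT)}.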
+ */ +List FunctionParameterList(ExprContext exprContext) : +{ + final SqlLiteral qualifier; + final List list = new ArrayList(); +} +{ + + ( + qualifier = AllOrDistinct() { list.add(qualifier); } + | + { list.add(null); } + ) + AddArg0(list, exprContext) + ( + { + // a comma-list can't appear where only a query is expected + checkNonQueryExpression(exprContext); + } + AddArg(list, exprContext) + )* + + { + return list; + } +} + +SqlLiteral AllOrDistinct() : +{ +} +{ + { return SqlSelectKeyword.DISTINCT.symbol(getPos()); } +| + { return SqlSelectKeyword.ALL.symbol(getPos()); } +} + +void AddArg0(List list, ExprContext exprContext) : +{ + final SqlIdentifier name; + SqlNode e; + final ExprContext firstExprContext; + { + // we've now seen left paren, so queries inside should + // be allowed as sub-queries + switch (exprContext) { + case ACCEPT_SUB_QUERY: + firstExprContext = ExprContext.ACCEPT_NONCURSOR; + break; + case ACCEPT_CURSOR: + firstExprContext = ExprContext.ACCEPT_ALL; + break; + default: + firstExprContext = exprContext; + break; + } + } +} +{ + ( + LOOKAHEAD(2) name = SimpleIdentifier() + | { name = null; } + ) + ( + e = Default() + | + LOOKAHEAD(3) + e = TableParam() + | + e = PartitionedQueryOrQueryOrExpr(firstExprContext) + ) + { + if (name != null) { + e = SqlStdOperatorTable.ARGUMENT_ASSIGNMENT.createCall( + Span.of(name, e).pos(), e, name); + } + list.add(e); + } +} + +void AddArg(List list, ExprContext exprContext) : +{ + final SqlIdentifier name; + SqlNode e; +} +{ + ( + LOOKAHEAD(2) name = SimpleIdentifier() + | { name = null; } + ) + ( + e = Default() + | + e = Expression(exprContext) + | + e = TableParam() + ) + { + if (name != null) { + e = SqlStdOperatorTable.ARGUMENT_ASSIGNMENT.createCall( + Span.of(name, e).pos(), e, name); + } + list.add(e); + } +} + +SqlNode Default() : {} +{ + { + return SqlStdOperatorTable.DEFAULT.createCall(getPos()); + } +} + +/** + * Parses a query (SELECT, UNION, INTERSECT, EXCEPT, VALUES, TABLE) followed by + * the end-of-file symbol. + */ +SqlNode SqlQueryEof() : +{ + SqlNode query; +} +{ + query = OrderedQueryOrExpr(ExprContext.ACCEPT_QUERY) + + { return query; } +} + +/** + * Parses a list of SQL statements separated by semicolon. + * The semicolon is required between statements, but is + * optional at the end. + */ +SqlNodeList SqlStmtList() : +{ + final List stmtList = new ArrayList(); + SqlNode stmt; +} +{ + stmt = SqlStmt() { + stmtList.add(stmt); + } + ( + + [ + stmt = SqlStmt() { + stmtList.add(stmt); + } + ] + )* + + { + return new SqlNodeList(stmtList, Span.of(stmtList).pos()); + } +} + +/** + * Parses an SQL statement. 
+ */ +SqlNode SqlStmt() : +{ + SqlNode stmt; +} +{ + ( +<#-- Add methods to parse additional statements here --> +<#list (parser.statementParserMethods!default.parser.statementParserMethods) as method> + LOOKAHEAD(2) stmt = ${method} + | + + stmt = SqlSetOption(Span.of(), null) + | + stmt = SqlAlter() + | +<#if (parser.createStatementParserMethods!default.parser.createStatementParserMethods)?size != 0> + stmt = SqlCreate() + | + +<#if (parser.dropStatementParserMethods!default.parser.dropStatementParserMethods)?size != 0> + stmt = SqlDrop() + | + +<#if (parser.truncateStatementParserMethods!default.parser.truncateStatementParserMethods)?size != 0> + LOOKAHEAD(2) + stmt = SqlTruncate() + | + + stmt = OrderedQueryOrExpr(ExprContext.ACCEPT_QUERY) + | + stmt = SqlExplain() + | + stmt = SqlDescribe() + | + stmt = SqlInsert() + | + stmt = SqlDelete() + | + stmt = SqlUpdate() + | + stmt = SqlMerge() + | + stmt = SqlProcedureCall() + ) + { + return stmt; + } +} + +/** + * Parses an SQL statement followed by the end-of-file symbol. + */ +SqlNode SqlStmtEof() : +{ + SqlNode stmt; +} +{ + stmt = SqlStmt() + { + return stmt; + } +} + +<#-- Add implementations of additional parser statement calls here --> +<#list (parser.implementationFiles!default.parser.implementationFiles) as file> + <#include "/@includes/"+file /> + + +SqlNodeList ParenthesizedKeyValueOptionCommaList() : +{ + final Span s; + final List list = new ArrayList(); +} +{ + { s = span(); } + + AddKeyValueOption(list) + ( + + AddKeyValueOption(list) + )* + { + return new SqlNodeList(list, s.end(this)); + } +} + +/** +* Parses an option with format key=val whose key is a simple identifier or string literal +* and value is a string literal. +*/ +void AddKeyValueOption(List list) : +{ + final SqlNode key; + final SqlNode value; +} +{ + ( + key = SimpleIdentifier() + | + key = StringLiteral() + ) + + value = StringLiteral() { + list.add(key); + list.add(value); + } +} + +/** Parses an option value (either a string or a numeric) and adds to a list. */ +void AddOptionValue(List list) : +{ + final SqlNode value; +} +{ + ( + value = NumericLiteral() { list.add(value); } + | + value = StringLiteral() { list.add(value); } + ) +} + +/** + * Parses a literal list separated by comma. The literal is either a string or a numeric. + */ +SqlNodeList ParenthesizedLiteralOptionCommaList() : +{ + final Span s; + final List list = new ArrayList(); +} +{ + { s = span(); } + + AddOptionValue(list) ( AddOptionValue(list) )* + { + return new SqlNodeList(list, s.end(this)); + } +} + +void AddHint(List hints) : +{ + final SqlIdentifier hintName; + final SqlNodeList hintOptions; + final SqlHint.HintOptionFormat optionFormat; +} +{ + hintName = SimpleIdentifier() + ( + LOOKAHEAD(5) + hintOptions = ParenthesizedKeyValueOptionCommaList() { + optionFormat = SqlHint.HintOptionFormat.KV_LIST; + } + | + LOOKAHEAD(3) + hintOptions = ParenthesizedSimpleIdentifierList() { + optionFormat = SqlHint.HintOptionFormat.ID_LIST; + } + | + LOOKAHEAD(3) + hintOptions = ParenthesizedLiteralOptionCommaList() { + optionFormat = SqlHint.HintOptionFormat.LITERAL_LIST; + } + | + LOOKAHEAD(2) + [ ] + { + hintOptions = SqlNodeList.EMPTY; + optionFormat = SqlHint.HintOptionFormat.EMPTY; + } + ) + { + hints.add( + new SqlHint(Span.of(hintOptions).end(this), hintName, hintOptions, + optionFormat)); + } +} + +/** Parses hints following a table reference, + * and returns the wrapped table reference. 
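+ *
+ * <p>For example (hint names are illustrative only), in
+ * {@code FROM emp /&#42;+ INDEX(idx1), PROPERTIES(k1='v1') &#42;/} the
+ * returned node wraps {@code emp} together with the two hints that follow it.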
*/ +SqlNode TableHints(SqlIdentifier tableName) : +{ + final List hints = new ArrayList(); +} +{ + AddHint(hints) ( AddHint(hints) )* { + final SqlParserPos pos = Span.of(tableName).addAll(hints).end(this); + final SqlNodeList hintList = new SqlNodeList(hints, pos); + return new SqlTableRef(pos, tableName, hintList); + } +} + +/** + * Parses a leaf SELECT expression without ORDER BY. + */ +SqlSelect SqlSelect() : +{ + final List keywords = new ArrayList(); + final SqlLiteral keyword; + final SqlNodeList keywordList; + final List selectList = new ArrayList(); + final SqlNode fromClause; + final SqlNode where; + final SqlNodeList groupBy; + final SqlNode having; + final SqlNodeList windowDecls; + final SqlNode qualify; + final List hints = new ArrayList(); + final Span s; +} +{ +