Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet;

import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.parquet.VersionParser.ParsedVersion;
import org.apache.parquet.VersionParser.VersionParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Not all parquet writers populate the int96 statistics correctly. For example: arrow-rs
* https://github.com/apache/arrow-rs/blob/3ed9aedabc9e5a90170e43ff818f24a29eafb35b/parquet/src/file/statistics.rs#L212-L215
* This class is used to detect whether a file was written with a version that has correct int96 statistics.
*/
public class ValidInt96Stats {
private static final AtomicBoolean alreadyLogged = new AtomicBoolean(false);

private static final Logger LOG = LoggerFactory.getLogger(ValidInt96Stats.class);

/**
* Decides if the statistics from a file created by createdBy (the created_by field from parquet format)
* should be trusted for INT96 columns.
*
* @param createdBy the created-by string from a file footer
* @return true if the statistics are valid and can be trusted, false otherwise
*/
public static boolean hasValidInt96Stats(String createdBy) {
if (Strings.isNullOrEmpty(createdBy)) {
warnOnce("Cannot verify INT96 statistics because created_by is null or empty");
return false;
}

try {
ParsedVersion version = VersionParser.parse(createdBy);
if ("parquet-mr".equals(version.application)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recall that @rdblue has an opinion in maintaining an allow-list here.

return version.version != null && version.version.compareTo("1.16.0") > 0;
}
if ("parquet-mr compatible Photon".equals(version.application)) {
return true;
}
} catch (RuntimeException | VersionParseException e) {
warnParseErrorOnce(createdBy, e);
}
return false;
}

private static void warnParseErrorOnce(String createdBy, Throwable e) {
if (!alreadyLogged.getAndSet(true)) {
LOG.warn("Cannot verify INT96 statistics because created_by could not be parsed: " + createdBy, e);
}
}

private static void warnOnce(String message) {
if (!alreadyLogged.getAndSet(true)) {
LOG.warn(message);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,34 @@ public String toString() {
}
};

/*
* This comparator is for comparing two timestamps represented as INT96 binary values.
* It is a two-level comparison: first by
* days (the last 4 bytes, compared as an unsigned little-endian int32), then by
* nanoseconds (the first 8 bytes, compared as an unsigned little-endian int64).
*/
static final PrimitiveComparator<Binary> BINARY_AS_INT96_TIMESTAMP_COMPARATOR = new BinaryComparator() {
@Override
int compareBinary(Binary b1, Binary b2) {
ByteBuffer bb1 = b1.toByteBuffer().slice();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check if their lengths are exactly 12 before anything? I recall that BINARY_AS_FLOAT16_COMPARATOR did this.

ByteBuffer bb2 = b2.toByteBuffer().slice();
bb1.order(java.nio.ByteOrder.LITTLE_ENDIAN);
bb2.order(java.nio.ByteOrder.LITTLE_ENDIAN);
int jd1 = bb1.getInt(8);
int jd2 = bb2.getInt(8);
if (jd1 != jd2) return Integer.compareUnsigned(jd1, jd2) < 0 ? -1 : 1;
long s1 = bb1.getLong(0);
long s2 = bb2.getLong(0);
if (s1 != s2) return Long.compareUnsigned(s1, s2) < 0 ? -1 : 1;
return 0;
}

@Override
public String toString() {
return "BINARY_AS_INT96_TIMESTAMP_COMPARATOR";
}
};

/*
* This comparator is for comparing two signed decimal values represented in twos-complement binary. In case of the
* binary length of one value is shorter than the other it will be padded by the corresponding prefix (0xFF for
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.ColumnOrder.ColumnOrderName;
import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation;

/**
Expand Down Expand Up @@ -363,7 +362,7 @@ public <T, E extends Exception> T convert(PrimitiveTypeNameConverter<T, E> conve

@Override
PrimitiveComparator<?> comparator(LogicalTypeAnnotation logicalType) {
return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR;
return PrimitiveComparator.BINARY_AS_INT96_TIMESTAMP_COMPARATOR;
}
},
FIXED_LEN_BYTE_ARRAY("getBinary", Binary.class) {
Expand Down Expand Up @@ -542,9 +541,7 @@ public PrimitiveType(
this.decimalMeta = decimalMeta;

if (columnOrder == null) {
columnOrder = primitive == PrimitiveTypeName.INT96 || originalType == OriginalType.INTERVAL
? ColumnOrder.undefined()
: ColumnOrder.typeDefined();
columnOrder = originalType == OriginalType.INTERVAL ? ColumnOrder.undefined() : ColumnOrder.typeDefined();
}
this.columnOrder = requireValidColumnOrder(columnOrder);
}
Expand Down Expand Up @@ -587,21 +584,14 @@ public PrimitiveType(
}

if (columnOrder == null) {
columnOrder = primitive == PrimitiveTypeName.INT96
|| logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation
columnOrder = logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation
? ColumnOrder.undefined()
: ColumnOrder.typeDefined();
}
this.columnOrder = requireValidColumnOrder(columnOrder);
}

private ColumnOrder requireValidColumnOrder(ColumnOrder columnOrder) {
if (primitive == PrimitiveTypeName.INT96) {
Preconditions.checkArgument(
columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED,
"The column order %s is not supported by INT96",
columnOrder);
}
if (getLogicalTypeAnnotation() != null) {
Preconditions.checkArgument(
getLogicalTypeAnnotation().isValidColumnOrder(columnOrder),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,8 +413,8 @@ public THIS scale(int scale) {
/**
* Adds the column order for the primitive type.
* <p>
* In case of not set the default column order is {@link ColumnOrderName#TYPE_DEFINED_ORDER} except the type
* {@link PrimitiveTypeName#INT96} and the types annotated by {@link OriginalType#INTERVAL} where the default column
* In case of not set the default column order is {@link ColumnOrderName#TYPE_DEFINED_ORDER} except the types
* annotated by {@link OriginalType#INTERVAL} where the default column
* order is {@link ColumnOrderName#UNDEFINED}.
*
* @param columnOrder the column order for the primitive type
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import org.junit.Test;

/**
 * Tests for {@link ValidInt96Stats#hasValidInt96Stats(String)}: which created_by strings lead to
 * INT96 statistics being trusted.
 */
public class ValidInt96StatsTest {

  @Test
  public void testNullAndEmpty() {
    assertUntrusted(null, "");
  }

  @Test
  public void testParquetMrValid() {
    // Versions > 1.15.0 should be valid
    assertTrusted(
        "parquet-mr version 1.16.0",
        "parquet-mr version 1.15.1",
        "parquet-mr version 2.0.0",
        "parquet-mr version 1.16.0 (build abcd)",
        "parquet-mr version 1.15.1-SNAPSHOT");
  }

  @Test
  public void testParquetMrInvalid() {
    // Versions <= 1.15.0 should be invalid
    assertUntrusted(
        "parquet-mr version 1.15.0",
        "parquet-mr version 1.12.3",
        "parquet-mr version 1.14.0",
        "parquet-mr version 1.12.3 (build abcd)",
        "parquet-mr version 1.12.3-SNAPSHOT",
        "parquet-mr version 1.12.3rc1",
        "parquet-mr version 1.12.3rc1-SNAPSHOT");
  }

  @Test
  public void testParquetMrCompatiblePhotonValid() {
    assertTrusted(
        "parquet-mr compatible Photon version 1.0.0",
        "parquet-mr compatible Photon version 1.0.0 (build abcd)",
        "parquet-mr compatible Photon version 1.0.0-SNAPSHOT",
        "parquet-mr compatible Photon version 1.0.0rc1",
        "parquet-mr compatible Photon version 1.0.0rc1-SNAPSHOT");
  }

  @Test
  public void testInvalidApplications() {
    assertUntrusted("arrow-rs version 0.1.0", "impala version 1.6.0");
  }

  /** Asserts that every given created_by string is accepted, naming the offender on failure. */
  private static void assertTrusted(String... createdByValues) {
    for (String createdBy : createdByValues) {
      assertTrue(
          "Expected INT96 statistics to be trusted for created_by: " + createdBy,
          ValidInt96Stats.hasValidInt96Stats(createdBy));
    }
  }

  /** Asserts that every given created_by string is rejected, naming the offender on failure. */
  private static void assertUntrusted(String... createdByValues) {
    for (String createdBy : createdByValues) {
      assertFalse(
          "Expected INT96 statistics to be rejected for created_by: " + createdBy,
          ValidInt96Stats.hasValidInt96Stats(createdBy));
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
Expand Down Expand Up @@ -93,7 +95,7 @@ public void testContractNonStringTypes() {
testTruncator(
Types.required(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("test_fixed_interval"), false);
testTruncator(Types.required(BINARY).as(DECIMAL).precision(10).scale(2).named("test_binary_decimal"), false);
testTruncator(Types.required(INT96).named("test_int96"), false);
testInt96Truncator(Types.required(INT96).named("test_int96"), false);
}

@Test
Expand Down Expand Up @@ -157,6 +159,21 @@ public void testContractStringTypes() {
testTruncator(Types.required(FIXED_LEN_BYTE_ARRAY).length(5).named("test_fixed"), true);
}

/**
 * Builds a 12-byte INT96 test value: the nanoseconds as a little-endian long followed by the
 * Julian day as a little-endian int.
 */
private Binary createInt96Value(long nanoseconds, int julianDay) {
  ByteBuffer encoded = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
  encoded.putLong(nanoseconds);
  encoded.putInt(julianDay);
  return Binary.fromConstantByteArray(encoded.array());
}

/**
 * Runs the truncator contract check for the given type against two sample INT96 values, using the
 * type's own comparator. The {@code strict} flag is passed on for both flag arguments of
 * {@code checkContract}.
 */
private void testInt96Truncator(PrimitiveType type, boolean strict) {
  BinaryTruncator truncator = BinaryTruncator.getTruncator(type);
  Comparator<Binary> comparator = type.comparator();
  Binary[] samples = {createInt96Value(0, 2458849), createInt96Value(100, 128849)};
  for (Binary sample : samples) {
    checkContract(truncator, comparator, sample, strict, strict);
  }
}

private void testTruncator(PrimitiveType type, boolean strict) {
BinaryTruncator truncator = BinaryTruncator.getTruncator(type);
Comparator<Binary> comparator = type.comparator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.parquet.schema;

import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR;
import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_INT96_TIMESTAMP_COMPARATOR;
import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR;
import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR;
import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR;
Expand All @@ -33,8 +34,10 @@

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.example.data.simple.NanoTime;
import org.apache.parquet.io.api.Binary;
import org.junit.Test;

Expand Down Expand Up @@ -274,6 +277,61 @@ public void testBinaryAsSignedIntegerComparatorWithEquals() {
}
}

/**
 * Parses an ISO local date-time string and encodes it as an INT96 binary via {@link NanoTime}.
 *
 * @param timestamp a string accepted by {@link LocalDateTime#parse(CharSequence)}
 * @return the INT96 representation (Julian day plus nanoseconds of the day)
 */
private Binary timestampToInt96(String timestamp) {
  LocalDateTime parsed = LocalDateTime.parse(timestamp);
  // 2440588 shifts the epoch-day count to the Julian Day numbering.
  int julianDay = (int) (parsed.toLocalDate().toEpochDay() + 2440588);
  long nanosOfDay = parsed.toLocalTime().toNanoOfDay();
  return new NanoTime(julianDay, nanosOfDay).toBinary();
}

/**
 * Checks that BINARY_AS_INT96_TIMESTAMP_COMPARATOR orders a list of ascending timestamps
 * correctly for every pair, and that the result does not depend on how the Binary is backed
 * (original instance, reused array, constant array, or a slice of a larger padded array).
 */
@Test
public void testInt96Comparator() {
  Binary[] valuesInAscendingOrder = {
    timestampToInt96("2020-01-01T00:00:00.000"),
    timestampToInt96("2020-01-01T10:00:00.000"),
    timestampToInt96("2020-02-29T23:59:59.999"),
    timestampToInt96("2020-12-31T23:59:59.999"),
    timestampToInt96("2021-01-01T00:00:00.000"),
    timestampToInt96("2023-06-15T12:30:45.500"),
    timestampToInt96("2024-02-29T15:45:30.750"),
    timestampToInt96("2024-12-25T07:00:00.000"),
    timestampToInt96("2025-01-01T00:00:00.000"),
    timestampToInt96("2025-07-04T20:00:00.000"),
    timestampToInt96("2025-07-04T20:50:00.000"),
    timestampToInt96("2025-12-31T23:59:59.999")
  };

  // A typed list avoids the raw generic array creation (and its unchecked conversion).
  List<java.util.function.Function<Binary, Binary>> perturbations = new ArrayList<>();
  perturbations.add(b -> b);
  perturbations.add(b -> Binary.fromReusedByteArray(b.getBytes()));
  perturbations.add(b -> Binary.fromConstantByteArray(b.getBytes()));
  perturbations.add(b -> {
    // Embed the value at a non-zero offset inside a buffer filled with unrelated bytes, so that
    // a comparator ignoring the Binary's offset/length would read garbage.
    byte[] originalBytes = b.getBytes();
    byte[] paddedBuffer = new byte[originalBytes.length + 20];
    int offset = 10;
    for (int i = 0; i < paddedBuffer.length; i++) {
      paddedBuffer[i] = (byte) (0xAA + (i % 5));
    }
    System.arraycopy(originalBytes, 0, paddedBuffer, offset, originalBytes.length);
    return Binary.fromReusedByteArray(paddedBuffer, offset, originalBytes.length);
  });

  for (int i = 0; i < valuesInAscendingOrder.length; ++i) {
    for (int j = 0; j < valuesInAscendingOrder.length; ++j) {
      Binary bi = valuesInAscendingOrder[i];
      Binary bj = valuesInAscendingOrder[j];
      for (java.util.function.Function<Binary, Binary> fi : perturbations) {
        for (java.util.function.Function<Binary, Binary> fj : perturbations) {
          Binary perturbedBi = fi.apply(bi);
          Binary perturbedBj = fj.apply(bj);
          // The array is ascending, so the expected ordering is that of the indices.
          assertEquals(
              Integer.compare(i, j), BINARY_AS_INT96_TIMESTAMP_COMPARATOR.compare(perturbedBi, perturbedBj));
        }
      }
    }
  }
}

@Test
public void testFloat16Comparator() {
Binary[] valuesInAscendingOrder = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1334,7 +1334,7 @@ public void testTypeConstructionWithUndefinedColumnOrder() {
@Test
public void testTypeConstructionWithTypeDefinedColumnOrder() {
PrimitiveTypeName[] types =
new PrimitiveTypeName[] {BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY};
new PrimitiveTypeName[] {BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY};
for (PrimitiveTypeName type : types) {
String name = type.toString() + "_";
int len = type == FIXED_LEN_BYTE_ARRAY ? 42 : 0;
Expand All @@ -1350,8 +1350,6 @@ public void testTypeConstructionWithTypeDefinedColumnOrder() {

@Test
public void testTypeConstructionWithUnsupportedColumnOrder() {
assertThrows(null, IllegalArgumentException.class, (Callable<PrimitiveType>) () ->
Types.optional(INT96).columnOrder(ColumnOrder.typeDefined()).named("int96_unsupported"));
assertThrows(null, IllegalArgumentException.class, (Callable<PrimitiveType>)
() -> Types.optional(FIXED_LEN_BYTE_ARRAY)
.length(12)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,6 @@ public void testIntervalAnnotationRejectsNonFixed12() {

@Test
public void testTypeConstructionWithUnsupportedColumnOrder() {
assertThrows(null, IllegalArgumentException.class, () -> Types.optional(INT96)
.columnOrder(ColumnOrder.typeDefined())
.named("int96_unsupported"));
assertThrows(null, IllegalArgumentException.class, () -> Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
.length(12)
.as(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance())
Expand Down
Loading
Loading