Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,6 @@ public Statistics<?> build() {

// Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
private static class Float16Builder extends Builder {
private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});

public Float16Builder(PrimitiveType type) {
super(type);
assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
Expand All @@ -162,15 +158,17 @@ public Statistics<?> build() {
short max = bMax.get2BytesLittleEndian();
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
if (Float16.isNaN(min) || Float16.isNaN(max)) {
stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
if (min == (short) 0x0000) {
stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
stats.setMinMax(bMin, bMax);
}
if (max == (short) 0x8000) {
stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
stats.setMinMax(bMin, bMax);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import java.util.List;
import org.apache.parquet.filter2.predicate.Statistics;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.Float16;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveComparator;
import org.apache.parquet.schema.PrimitiveType;

Expand Down Expand Up @@ -82,6 +84,8 @@ int compareValueToMax(int arrayIndex) {
private final List<Binary> maxValues = new ArrayList<>();
private final BinaryTruncator truncator;
private final int truncateLength;
private final boolean isFloat16;
private boolean invalid;

private static Binary convert(ByteBuffer buffer) {
return Binary.fromReusedByteBuffer(buffer);
Expand All @@ -94,6 +98,7 @@ private static ByteBuffer convert(Binary value) {
BinaryColumnIndexBuilder(PrimitiveType type, int truncateLength) {
truncator = BinaryTruncator.getTruncator(type);
this.truncateLength = truncateLength;
this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
}

@Override
Expand All @@ -104,12 +109,43 @@ void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {

@Override
void addMinMax(Object min, Object max) {
minValues.add(min == null ? null : truncator.truncateMin((Binary) min, truncateLength));
maxValues.add(max == null ? null : truncator.truncateMax((Binary) max, truncateLength));
Binary bMin = (Binary) min;
Binary bMax = (Binary) max;

if (isFloat16 && bMin != null && bMax != null) {
if (bMin.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES
|| bMax.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES) {
// Should not happen for Float16
invalid = true;
} else {
short sMin = bMin.get2BytesLittleEndian();
short sMax = bMax.get2BytesLittleEndian();

if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) {
invalid = true;
}

// Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to
// ensure that no 0.0 values are skipped
// +0.0 is 0x0000, -0.0 is 0x8000 (little endian: 00 00, 00 80)
if (sMin == (short) 0x0000) {
bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
}
if (sMax == (short) 0x8000) {
bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
}
}
}

minValues.add(bMin == null ? null : truncator.truncateMin(bMin, truncateLength));
maxValues.add(bMax == null ? null : truncator.truncateMax(bMax, truncateLength));
}

@Override
ColumnIndexBase<Binary> createColumnIndex(PrimitiveType type) {
if (invalid) {
return null;
}
BinaryColumnIndex columnIndex = new BinaryColumnIndex(type);
columnIndex.minValues = minValues.toArray(new Binary[0]);
columnIndex.maxValues = maxValues.toArray(new Binary[0]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@
* Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
*/
public class Float16 {
// Positive zero of type half-precision float.
public static final Binary POSITIVE_ZERO_LITTLE_ENDIAN =
Binary.fromConstantByteArray(new byte[] {0x00, 0x00}, 0, 2);
// Negative zero of type half-precision float.
public static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}, 0, 2);

// Positive infinity of type half-precision float.
private static final short POSITIVE_INFINITY = (short) 0x7c00;
// A Not-a-Number representation of a half-precision float.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -122,37 +123,25 @@ public class TestFloat16ReadWriteRoundTrip {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7c})
}; // Infinity

private Binary[] valuesAllPositiveZeroMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
private Binary[] valuesAllZeroMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
}; // +0

private Binary[] valuesAllNegativeZeroMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80}), // -0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
}; // -0

private Binary[] valuesWithNaNMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0xc0}), // -2.0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x7e})
}; // NaN

@Test
public void testFloat16ColumnIndex() throws IOException {
List<Binary[]> testValues = List.of(
valuesInAscendingOrder,
valuesInDescendingOrder,
valuesUndefinedOrder,
valuesAllPositiveZero,
valuesAllNegativeZero,
valuesWithNaN);
valuesAllNegativeZero);
List<Binary[]> expectedValues = List.of(
valuesInAscendingOrderMinMax,
valuesInDescendingOrderMinMax,
valuesUndefinedOrderMinMax,
valuesAllPositiveZeroMinMax,
valuesAllNegativeZeroMinMax,
valuesWithNaNMinMax);
valuesAllZeroMinMax,
valuesAllZeroMinMax);

for (int i = 0; i < testValues.size(); i++) {
MessageType schema = Types.buildMessage()
Expand Down Expand Up @@ -187,6 +176,37 @@ public void testFloat16ColumnIndex() throws IOException {
}
}

@Test
public void testFloat16NanColumnIndex() throws IOException {
MessageType schema = Types.buildMessage()
.required(FIXED_LEN_BYTE_ARRAY)
.as(float16Type())
.length(2)
.named("col_float16")
.named("msg");

Configuration conf = new Configuration();
GroupWriteSupport.setSchema(schema, conf);
GroupFactory factory = new SimpleGroupFactory(schema);
Path path = newTempPath();
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
.withConf(conf)
.withDictionaryEncoding(false)
.build()) {

for (Binary value : valuesWithNaN) {
writer.write(factory.newGroup().append("col_float16", value));
}
}

try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
ColumnChunkMetaData column =
reader.getFooter().getBlocks().get(0).getColumns().get(0);
ColumnIndex index = reader.readColumnIndex(column);
assertNull(index);
}
}

private Path newTempPath() throws IOException {
File file = temp.newFile();
Preconditions.checkArgument(file.delete(), "Could not remove temp file");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ public class TestFloat16Statistics {
// Float16Builder: Drop min/max values in case of NaN as the sorting order of values is undefined
private Binary[] valuesWithNaNStatsMinMax = {
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00}), // +0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x80})
}; // -0
Binary.fromConstantByteArray(new byte[] {(byte) 0x00, (byte) 0x00})
}; // +0

@Test
public void testFloat16StatisticsMultipleCases() throws IOException {
Expand Down