Flink: Add Orc value reader, writer implementations #1255
@@ -88,7 +88,7 @@ public static OrcValueWriter<UUID> uuids() {
     return UUIDWriter.INSTANCE;
   }

-  public static OrcValueWriter<byte[]> fixed() {
+  public static OrcValueWriter<byte[]> bytes() {
     return FixedWriter.INSTANCE;
   }
@@ -337,7 +337,7 @@ public void nonNullWrite(int rowId, BigDecimal data, ColumnVector output) {
         "Cannot write value as decimal(%s,%s), invalid precision: %s", precision, scale, data);

       ((DecimalColumnVector) output).vector[rowId]
-          .setFromLongAndScale(data.unscaledValue().longValueExact(), scale);
+          .setFromLongAndScale(data.unscaledValue().longValueExact(), data.scale());
     }
   }

Review comment: What about the TODO to check that the scale matches the column's scale? As long as we're updating this, does it make sense to fix that, since we just had a decimal scale problem?

Reply: Actually, we don't need to change this now, because this merged patch has fixed it: 6f96b36#diff-b1b07b15f036000a3f2bed76fdd9f961R334
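For context, the scale check discussed in this thread could look roughly like the following. This is a minimal sketch only, assuming Guava-style Preconditions; the real fix landed in the merged commit 6f96b36, and the class and method names here are illustrative.

```java
// Illustrative sketch of validating a BigDecimal against the column's
// declared precision and scale before writing. Not the merged code; see
// commit 6f96b36 for the actual fix.
import java.math.BigDecimal;
import com.google.common.base.Preconditions;

class DecimalScaleCheckSketch {
  static void checkPrecisionAndScale(BigDecimal data, int precision, int scale) {
    Preconditions.checkArgument(data.scale() == scale,
        "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data);
    Preconditions.checkArgument(data.precision() <= precision,
        "Cannot write value as decimal(%s,%s), invalid precision: %s", precision, scale, data);
  }
}
```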
@@ -99,7 +99,7 @@ public OrcValueWriter<?> primitive(Type.PrimitiveType iPrimitive, LogicalType fl
       case DOUBLE:
         return GenericOrcWriters.doubles();
       case DATE:
-        return GenericOrcWriters.dates();
+        return FlinkOrcWriters.dates();
       case TIME:
         return FlinkOrcWriters.times();
       case TIMESTAMP:
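The switch from GenericOrcWriters.dates() to FlinkOrcWriters.dates() reflects a difference in in-memory representation: Flink's RowData stores a DATE as an int of epoch days, while the generic records carry java.time.LocalDate. A minimal sketch of the contrast, with assumed class and package names (Iceberg's build may relocate the ORC vector classes under a different package):

```java
// Sketch only: contrasts the generic and Flink date representations when
// filling an ORC LongColumnVector. Class and package names are assumptions.
import java.time.LocalDate;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

class DateWriterSketch {
  // Generic path: records carry LocalDate, so convert to epoch days first.
  void writeGenericDate(int rowId, LocalDate date, ColumnVector output) {
    ((LongColumnVector) output).vector[rowId] = date.toEpochDay();
  }

  // Flink path: RowData already stores DATE as int epoch days.
  void writeFlinkDate(int rowId, int epochDays, ColumnVector output) {
    ((LongColumnVector) output).vector[rowId] = epochDays;
  }
}
```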
@@ -114,9 +114,8 @@ public OrcValueWriter<?> primitive(Type.PrimitiveType iPrimitive, LogicalType fl
       case UUID:
         return GenericOrcWriters.uuids();
       case FIXED:
-        return GenericOrcWriters.fixed();
       case BINARY:
-        return GenericOrcWriters.byteBuffers();
+        return GenericOrcWriters.bytes();
       case DECIMAL:
         Types.DecimalType decimalType = (Types.DecimalType) iPrimitive;
         return FlinkOrcWriters.decimals(decimalType.scale(), decimalType.precision());

Review comment: For the UUID type, Flink should return …
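A note on why FIXED and BINARY can fall through to a single writer after this change: Flink's RowData exposes both fixed- and variable-length binary columns as byte[], so one byte[]-based writer covers both cases. A tiny sketch (the class name is illustrative):

```java
// Sketch: RowData returns byte[] for binary columns of either kind, which is
// why one byte[] writer can back both the FIXED and BINARY cases.
import org.apache.flink.table.data.RowData;

class BinaryAccessSketch {
  byte[] readBinary(RowData row, int pos) {
    return row.getBinary(pos); // byte[] for both fixed and variable length
  }
}
```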
@@ -22,11 +22,16 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.List;
+import org.apache.commons.compress.utils.Lists;
 import org.apache.flink.table.data.RowData;
 import org.apache.flink.table.types.logical.RowType;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.data.DataTest;
+import org.apache.iceberg.data.RandomGenericData;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.data.orc.GenericOrcWriter;
 import org.apache.iceberg.flink.FlinkSchemaUtil;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileAppender;
@@ -43,7 +48,25 @@ public class TestFlinkOrcReaderWriter extends DataTest {

   @Override
   protected void writeAndValidate(Schema schema) throws IOException {
-    Iterable<RowData> iterable = RandomData.generateRowData(schema, NUM_RECORDS, 1990L);
+    List<Record> records = RandomGenericData.generate(schema, NUM_RECORDS, 1990L);
+
+    File recordsFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", recordsFile.delete());
+
+    try (FileAppender<Record> writer = ORC.write(Files.localOutput(recordsFile))
+        .schema(schema)
+        .createWriterFunc(GenericOrcWriter::buildWriter)
+        .build()) {
+      writer.addAll(records);
+    }
+
+    List<RowData> rowDataList = Lists.newArrayList();
+    try (CloseableIterable<RowData> reader = ORC.read(Files.localInput(recordsFile))
+        .project(schema)
+        .createReaderFunc(type -> FlinkOrcReader.buildReader(schema, type))
+        .build()) {
+      reader.forEach(rowDataList::add);
+    }

     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());

Review comment: It looks like this validates records read by the reader against records written by the reader and subsequently read by the reader. I think it should validate the reader and writer separately. I think it should have two parts: … That way, we're always comparing results against the generics that were originally generated. I think we already have the …

Reply: Your testing method is correct, but we don't have …

Reply: @openinx, sorry to block you for so long. Now it is merged. You might want to take a look.

Reply: Since your …
@@ -53,20 +76,21 @@ protected void writeAndValidate(Schema schema) throws IOException {
         .schema(schema)
         .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema))
         .build()) {
-      writer.addAll(iterable);
+      writer.addAll(rowDataList);
     }

     try (CloseableIterable<RowData> reader = ORC.read(Files.localInput(testFile))
         .project(schema)
         .createReaderFunc(type -> FlinkOrcReader.buildReader(schema, type))
         .build()) {
-      Iterator<RowData> expected = iterable.iterator();
+      Iterator<RowData> expected = rowDataList.iterator();
       Iterator<RowData> rows = reader.iterator();
       for (int i = 0; i < NUM_RECORDS; i += 1) {
         Assert.assertTrue("Should have expected number of rows", rows.hasNext());
         Assert.assertEquals(expected.next(), rows.next());
       }
       Assert.assertFalse("Should not have extra rows", rows.hasNext());
+      Assert.assertFalse("Should not have extra rows", expected.hasNext());
     }
   }
 }
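The review thread above notes that a ready-made helper for comparing generic Records against Flink RowData is not available in this code path. A heavily simplified sketch of what such a helper could look like, covering only a few primitive types (all names here are hypothetical, not the project's API):

```java
// Hypothetical sketch of a Record-vs-RowData assertion helper. A real
// version would need to cover every Iceberg type, plus structs, lists,
// and maps; this handles just three primitives to show the shape.
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Type;
import org.junit.Assert;

class RecordRowDataAssertSketch {
  static void assertRowEquals(Schema schema, Record expected, RowData actual) {
    for (int i = 0; i < schema.columns().size(); i += 1) {
      Type type = schema.columns().get(i).type();
      switch (type.typeId()) {
        case INTEGER:
          Assert.assertEquals(expected.get(i, Integer.class), (Integer) actual.getInt(i));
          break;
        case LONG:
          Assert.assertEquals(expected.get(i, Long.class), (Long) actual.getLong(i));
          break;
        case STRING:
          Assert.assertEquals(expected.get(i, String.class), actual.getString(i).toString());
          break;
        default:
          throw new UnsupportedOperationException("Sketch does not handle: " + type);
      }
    }
  }
}
```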
Review comment: Should the FixedWriter class also be renamed to BytesWriter?

Reply: Sounds good.
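For reference, the renamed writer might look roughly like this, following the usual ORC BytesColumnVector copy-by-reference pattern. This is a sketch under assumed names, not the PR's code, and the vector classes may be relocated under a different package in Iceberg's build.

```java
// Sketch of a BytesWriter: stores a byte[] into ORC's BytesColumnVector by
// reference. Package and class names are assumptions.
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;

class BytesWriterSketch {
  void nonNullWrite(int rowId, byte[] data, ColumnVector output) {
    // setRef avoids copying: ORC reads length bytes from data at offset 0
    ((BytesColumnVector) output).setRef(rowId, data, 0, data.length);
  }
}
```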