-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More accurate estimate on parquet row groups size #11258
base: main
Are you sure you want to change the base?
Changes from 1 commit
07f6b97
5854ccb
c83198e
5e64668
742eaac
07fe927
98ecfac
a6e9ef3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,9 +35,8 @@ | |
import java.nio.file.Path; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Random; | ||
import java.util.UUID; | ||
import java.util.function.Function; | ||
import java.util.stream.IntStream; | ||
import org.apache.avro.generic.GenericData; | ||
import org.apache.avro.generic.GenericRecord; | ||
import org.apache.avro.generic.GenericRecordBuilder; | ||
|
@@ -223,20 +222,18 @@ public void testTwoLevelList() throws IOException { | |
assertThat(recordRead.get("topbytes")).isEqualTo(expectedBinary); | ||
} | ||
|
||
|
||
@Test | ||
public void testParquetRowGroupSize() throws IOException { | ||
// verify parquet row group size should be close to configured size | ||
ImmutableList.Builder<Types.NestedField> columnsBuilder = ImmutableList.builder(); | ||
|
||
for (int i = 1; i <= 50; i++) { | ||
columnsBuilder.add(optional(i, "stringCol" + i, Types.StringType.get())); | ||
} | ||
int recordCount = 100000; | ||
int columnCount = 50; | ||
|
||
List<Types.NestedField> columns = columnsBuilder.build(); | ||
List<Types.NestedField> columns = | ||
IntStream.rangeClosed(1, columnCount) | ||
.mapToObj(i -> optional(i, "stringCol" + i, Types.StringType.get())) | ||
.collect(ImmutableList.toImmutableList()); | ||
Schema schema = new Schema(columns); | ||
|
||
int recordCount = 100000; | ||
File file = createTempFile(temp); | ||
|
||
List<GenericData.Record> records = Lists.newArrayListWithCapacity(recordCount); | ||
|
@@ -252,19 +249,22 @@ public void testParquetRowGroupSize() throws IOException { | |
} | ||
|
||
long actualSize = | ||
write( | ||
file, | ||
schema, | ||
ImmutableMap.of("write.parquet.row-group-size-bytes", "1048576"), | ||
ParquetAvroWriter::buildWriter, | ||
records.toArray(new GenericData.Record[] {})); | ||
write( | ||
file, | ||
schema, | ||
ImmutableMap.of("write.parquet.row-group-size-bytes", "1048576"), | ||
ParquetAvroWriter::buildWriter, | ||
records.toArray(new GenericData.Record[] {})); | ||
|
||
try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(file)))) { | ||
ParquetMetadata footer = reader.getFooter(); | ||
for (int i = 1; i < footer.getBlocks().size() - 1; i++) { | ||
BlockMetaData blockMetaData = footer.getBlocks().get(i); | ||
System.out.println("Block " + i + " compressed size: " + blockMetaData.getCompressedSize()); | ||
assertThat(footer.getBlocks().get(i).getCompressedSize()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it correct to look at the compressed size here instead of looking at the uncompressed size? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the Parquet row group size refers to the on-disk size, then it should be getCompressedSize(). |
||
.isBetween((long) 900 * 1024, (long) 1200 * 1024); | ||
} | ||
|
||
assertThat(footer.getBlocks().get(footer.getBlocks().size() - 1).getCompressedSize()) | ||
.isLessThan((long) 1200 * 1024); | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test skips the first row group because its size is not expected to be accurate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a comment so that it is clear to a reader why the first one is skipped?