Skip to content
This repository has been archived by the owner on Nov 11, 2022. It is now read-only.

Commit

Permalink
Fix HadoopFileSource’s split size estimate (#534)
Browse files Browse the repository at this point in the history
* Fix HadoopFileSource’s split size estimate

* Properly set interrupted state
  • Loading branch information
igorbernstein2 authored and dhalperi committed Jan 26, 2017
1 parent 2e57ab1 commit efd33cc
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,21 @@ private <T> Coder<T> getDefaultCoder(Class<T> c) {
public long getEstimatedSizeBytes(PipelineOptions options) {
long size = 0;
try {
// If this source represents a split from splitIntoBundles, then return the size of the split,
// rather than the entire input
if (serializableSplit != null) {
return serializableSplit.getSplit().getLength();
}

Job job = Job.getInstance(); // new instance
for (FileStatus st : listStatus(createFormat(job), job)) {
size += st.getLen();
}
} catch (IOException | NoSuchMethodException | InvocationTargetException
| IllegalAccessException | InstantiationException e) {
| IllegalAccessException | InstantiationException) {
// ignore, and return 0
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
// ignore, and return 0
}
return size;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,29 @@ public void testSplits() throws Exception {
assertTrue(nonEmptySplits > 2);
}

/**
 * Checks that the estimated size reported by an unsplit {@code HadoopFileSource} equals the sum
 * of the estimated sizes reported by the sources returned from {@code splitIntoBundles}.
 */
@Test
public void testSplitEstimatedSize() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Build the input file from generated records and wrap it in a source.
  List<KV<IntWritable, Text>> records = createRandomRecords(3, 10000, 0);
  File dataFile = createFileWithData("tmp.avro", records);
  HadoopFileSource<IntWritable, Text> wholeSource =
      HadoopFileSource.from(
          dataFile.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class);

  // Take the whole-source estimate before splitting.
  long wholeEstimate = wholeSource.getEstimatedSizeBytes(options);

  List<? extends BoundedSource<KV<IntWritable, Text>>> bundles =
      wholeSource.splitIntoBundles(SequenceFile.SYNC_INTERVAL, options);

  // Accumulate the per-bundle estimates.
  long summedEstimate = 0;
  for (BoundedSource<KV<IntWritable, Text>> bundle : bundles) {
    summedEstimate += bundle.getEstimatedSizeBytes(options);
  }

  // The estimate for the whole must match the total over its parts.
  assertEquals(wholeEstimate, summedEstimate);
}

private File createFileWithData(String filename, List<KV<IntWritable, Text>> records)
throws IOException {
File tmpFile = tmpFolder.newFile(filename);
Expand Down

0 comments on commit efd33cc

Please sign in to comment.