This repository was archived by the owner on Nov 11, 2022. It is now read-only.

Fix HadoopFileSource’s split size estimate #534

Merged
@@ -239,12 +239,21 @@ private <T> Coder<T> getDefaultCoder(Class<T> c) {
  public long getEstimatedSizeBytes(PipelineOptions options) {
    long size = 0;
    try {
      // If this source represents a split from splitIntoBundles, then return
      // the size of the split, rather than the entire input.
      if (serializableSplit != null) {
        return serializableSplit.getSplit().getLength();
      }

      Job job = Job.getInstance(); // new instance
      for (FileStatus st : listStatus(createFormat(job), job)) {
        size += st.getLen();
      }
    } catch (IOException | NoSuchMethodException | InvocationTargetException
        | IllegalAccessException | InstantiationException e) {
      // ignore, and return 0
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      // ignore, and return 0
Contributor

This change does not look right. At the very least, you should restore the thread's interrupted state when catching an InterruptedException.

Maybe move that to its own catch block and handle it specifically?

    }
    return size;
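
The final version above follows the reviewer's suggestion: InterruptedException is moved out of the multi-catch into its own block that re-sets the interrupted flag before falling through to the zero-size fallback. A minimal standalone sketch of that pattern (not part of the diff; the class and method names are hypothetical), relying only on Hadoop's InputSplit.getLength(), which declares both IOException and InterruptedException:

import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;

// Illustrative sketch only; SplitSizeHelper and splitLengthOrZero are
// hypothetical names, not part of the PR.
final class SplitSizeHelper {
  static long splitLengthOrZero(InputSplit split) {
    try {
      // InputSplit.getLength() declares IOException and InterruptedException.
      return split.getLength();
    } catch (IOException e) {
      return 0; // same "ignore, and return 0" policy as getEstimatedSizeBytes
    } catch (InterruptedException e) {
      // Swallowing the interruption silently would hide it from callers;
      // restoring the flag keeps it observable further up the stack.
      Thread.currentThread().interrupt();
      return 0;
    }
  }
}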
@@ -152,6 +152,29 @@ public void testSplits() throws Exception {
    assertTrue(nonEmptySplits > 2);
  }

  @Test
  public void testSplitEstimatedSize() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();

    List<KV<IntWritable, Text>> expectedResults = createRandomRecords(3, 10000, 0);
    File file = createFileWithData("tmp.avro", expectedResults);

    HadoopFileSource<IntWritable, Text> source = HadoopFileSource.from(
        file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class
    );

    long originalSize = source.getEstimatedSizeBytes(options);
    long splitTotalSize = 0;
    List<? extends BoundedSource<KV<IntWritable, Text>>> splits = source.splitIntoBundles(
        SequenceFile.SYNC_INTERVAL, options
    );
    for (BoundedSource<KV<IntWritable, Text>> splitSource : splits) {
      splitTotalSize += splitSource.getEstimatedSizeBytes(options);
    }
    // Assert that the estimated size of the whole is the sum of its parts.
    assertEquals(originalSize, splitTotalSize);
  }

  private File createFileWithData(String filename, List<KV<IntWritable, Text>> records)
      throws IOException {
    File tmpFile = tmpFolder.newFile(filename);
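
For context on what the new test pins down: before this change, getEstimatedSizeBytes ignored serializableSplit and always re-listed the entire input, so every split source reported the size of the whole input rather than its own share. A rough usage sketch of the invariant (not part of the diff; it reuses source and options from the test above):

// After the fix, the per-split estimates sum to the estimate for the whole.
long whole = source.getEstimatedSizeBytes(options);
long sum = 0;
for (BoundedSource<KV<IntWritable, Text>> split
    : source.splitIntoBundles(SequenceFile.SYNC_INTERVAL, options)) {
  sum += split.getEstimatedSizeBytes(options); // now: the split's own length
}
// sum == whole. Before the fix, each term would have been ~whole, so the
// sum would have over-reported by roughly the number of splits.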