GoogleCloudPlatform · dhalperi · Apr 19, 2017 · Apr 19, 2017
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java
@@ -1211,6 +1211,8 @@ private abstract static class BigQuerySourceBase extends BoundedSource<TableRow>
     protected final BigQueryServices bqServices;
     protected final ValueProvider<String> executingProject;
 
+    private transient List<BoundedSource<TableRow>> cachedSplitResult; // per-instance cache
+
     private BigQuerySourceBase(
         String jobIdToken,
         String extractDestinationDir,
@@ -1225,19 +1227,30 @@ private BigQuerySourceBase(
     @Override
     public List<BoundedSource<TableRow>> splitIntoBundles(
         long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
-      BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
-      TableReference tableToExtract = getTableToExtract(bqOptions);
-      JobService jobService = bqServices.getJobService(bqOptions);
-      String extractJobId = getExtractJobId(jobIdToken);
-      List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
-
-      TableSchema tableSchema = bqServices.getDatasetService(bqOptions).getTable(
-          tableToExtract.getProjectId(),
-          tableToExtract.getDatasetId(),
-          tableToExtract.getTableId()).getSchema();
-
-      cleanupTempResource(bqOptions);
-      return createSources(tempFiles, tableSchema);
+      // splitIntoBundles() may be called more than once on the same instance: for example, the
+      // Dataflow runner may retry with a different desiredBundleSizeBytes when a call produces too
+      // many sources. desiredBundleSizeBytes is not used here, so the first result is cached and
+      // returned for repeated calls rather than starting another BigQuery extract job.
+      if (cachedSplitResult == null) {
+        BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
+        TableReference tableToExtract = getTableToExtract(bqOptions);
+        JobService jobService = bqServices.getJobService(bqOptions);
+        String extractJobId = getExtractJobId(jobIdToken);
+        List<String> tempFiles = executeExtract(extractJobId, tableToExtract, jobService);
+
+        TableSchema tableSchema =
+            bqServices
+                .getDatasetService(bqOptions)
+                .getTable(
+                    tableToExtract.getProjectId(),
+                    tableToExtract.getDatasetId(),
+                    tableToExtract.getTableId())
+                .getSchema();
+
+        cleanupTempResource(bqOptions);
+        cachedSplitResult = createSources(tempFiles, tableSchema);
+      }
+      return cachedSplitResult;
     }
 
     protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception;

diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java
@@ -30,6 +30,7 @@
 import static org.mockito.Matchers.eq;
 import static org.mockito.Mockito.doNothing;
 import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.when;
 
 import com.google.api.client.util.Data;
@@ -1130,10 +1131,14 @@ public void testBigQueryTableSourceInitSplit() throws Exception {
 
     List<? extends BoundedSource<TableRow>> sources = bqSource.splitIntoBundles(100, options);
     assertEquals(1, sources.size());
+    // Simulate a repeated call to splitIntoBundles(), like a Dataflow worker will sometimes do.
+    sources = bqSource.splitIntoBundles(200, options);
+    assertEquals(1, sources.size());
     BoundedSource<TableRow> actual = sources.get(0);
     assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
 
-    Mockito.verify(mockJobService)
+    // A repeated call to splitIntoBundles() should not have caused a duplicate extract job.
+    Mockito.verify(mockJobService, times(1))
         .startExtractJob(Mockito.<JobReference>any(), Mockito.<JobConfigurationExtract>any());
   }