
Commit 12be5db

dhalperi authored and davorbonaci committed
DataflowPipelineRunner: retry source splitting when too many bundles
The Cloud Dataflow API limits the number of bytes it will accept in a request or response payload -- today, 20 MB. One common place this limit is reached is when a custom source is split into many bundles, typically because the initial size estimate is wildly inaccurate and the Dataflow service therefore requests bundles containing only a small amount of work. Today, messages hitting this API limit during initial splitting cause the job to fail.

This change adds a one-time step that increases the desired bundle size and retries splitting the source, letting jobs affected by inaccurate size estimation succeed in some cases. It also cleans up related code, generalizing and extracting some utility functions into common utility classes and reducing memory usage in pathological cases to prevent out-of-memory errors.

----Release Notes----
[]
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=113557076
1 parent ef71d47 commit 12be5db
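To make the retry arithmetic concrete, here is a standalone worked example. The 20 MB limit and 64 MB default match the constants in the diff below, but the 50 MB response size, the class name, and the variable names are hypothetical; this is only a sketch of the heuristic, not SDK code.

public class SplitRetryExpansionExample {
  public static void main(String[] args) {
    // Mirrors DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES and DEFAULT_DESIRED_BUNDLE_SIZE_BYTES.
    long apiLimitBytes = 20L * (1 << 20);          // 20 MB split-response limit
    long desiredBundleSizeBytes = 64L * (1 << 20); // 64 MB default desired bundle size

    // Hypothetical outcome of the first split attempt: a 50 MB serialized response.
    long serializedSizeBytes = 50L * (1 << 20);

    if (serializedSizeBytes > apiLimitBytes) {
      // Scale the bundle size so the retried response lands near half the API limit:
      // expansion = 2 * 50 MB / 20 MB = 5, so the retry asks for 320 MB bundles.
      double expansion = 2 * (double) serializedSizeBytes / apiLimitBytes;
      long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);
      System.out.println("Retry splitting with desiredBundleSizeBytes = " + expandedBundleSizeBytes);
    }
  }
}

Assuming the serialized response shrinks roughly in proportion to the number of bundles, the retried split should then come in near 10 MB, comfortably under the limit.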

6 files changed (+211 additions, -118 deletions)


sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/dataflow/CustomSources.java

Lines changed: 71 additions & 65 deletions
@@ -25,6 +25,7 @@
 import static com.google.cloud.dataflow.sdk.util.Structs.addStringList;
 import static com.google.cloud.dataflow.sdk.util.Structs.getString;
 import static com.google.cloud.dataflow.sdk.util.Structs.getStrings;
+import static com.google.common.base.Preconditions.checkArgument;
 
 import com.google.api.client.util.BackOff;
 import com.google.api.client.util.Base64;
@@ -33,7 +34,6 @@
 import com.google.api.services.dataflow.model.DerivedSource;
 import com.google.api.services.dataflow.model.DynamicSourceSplit;
 import com.google.api.services.dataflow.model.SourceMetadata;
-import com.google.api.services.dataflow.model.SourceOperationRequest;
 import com.google.api.services.dataflow.model.SourceOperationResponse;
 import com.google.api.services.dataflow.model.SourceSplitOptions;
 import com.google.api.services.dataflow.model.SourceSplitRequest;
@@ -47,6 +47,7 @@
 import com.google.cloud.dataflow.sdk.options.PipelineOptions;
 import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator;
 import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
+import com.google.cloud.dataflow.sdk.runners.worker.DataflowApiUtils;
 import com.google.cloud.dataflow.sdk.runners.worker.ReaderFactory;
 import com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils;
 import com.google.cloud.dataflow.sdk.runners.worker.StreamingModeExecutionContext;
@@ -61,7 +62,6 @@
 import com.google.cloud.dataflow.sdk.util.common.worker.NativeReader;
 import com.google.cloud.dataflow.sdk.values.PValue;
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
 import com.google.protobuf.ByteString;
 
 import org.joda.time.Duration;
@@ -87,15 +87,11 @@ public class CustomSources {
   private static final String SERIALIZED_SOURCE = "serialized_source";
   @VisibleForTesting static final String SERIALIZED_SOURCE_SPLITS = "serialized_source_splits";
   private static final long DEFAULT_DESIRED_BUNDLE_SIZE_BYTES = 64 * (1 << 20);
-
-  public static final String TOO_MANY_SOURCE_SPLITS_ERROR =
-      "Total number of Source objects generated by splitIntoBundles() operation, %d, is"
-      + " larger than the allowable limit, %d. For more information, please check the corresponding"
-      + " FAQ entry at:\n"
-      + "https://cloud.google.com/dataflow/faq";
-
-  // Maximum number of custom source splits currently supported by Dataflow.
-  private static final int MAX_NUMBER_OF_SPLITS = 16000;
+  /**
+   * The current limit on the size of a ReportWorkItemStatus RPC to Google Cloud Dataflow, which
+   * includes the initial splits, is 20 MB.
+   */
+  public static final long DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES = 20 * (1 << 20);
 
   private static final Logger LOG = LoggerFactory.getLogger(CustomSources.class);
 
@@ -144,17 +140,47 @@ public static DynamicSourceSplit toSourceSplit(
    * Executes a protocol-level split {@code SourceOperationRequest} for bounded sources
    * by deserializing its source to a {@code BoundedSource}, splitting it, and
    * serializing results back.
+   *
+   * <p>When the splits produced by this function are too large to be serialized to the Dataflow
+   * API, splitting is retried once with an increase in the desired bundle size. This change aims
+   * to work around API limitations on split size.
    */
-  public static SourceOperationResponse performSourceOperation(
-      SourceOperationRequest request, PipelineOptions options) throws Exception {
-    SourceOperationResponse response = new SourceOperationResponse();
-    if (request.getSplit() != null) {
-      response.setSplit(performSplit(request.getSplit(), options));
-    } else {
-      throw new UnsupportedOperationException(
-          "Unsupported source operation request: " + request);
-    }
-    return response;
+  public static SourceOperationResponse performSplit(
+      SourceSplitRequest request, PipelineOptions options) throws Exception {
+    Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
+    checkArgument(
+        anySource instanceof BoundedSource, "Cannot split a non-Bounded source: %s", anySource);
+    BoundedSource<?> source = (BoundedSource<?>) anySource;
+
+    // Compute the desired bundle size given by the service, or default if none was provided.
+    long desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
+    SourceSplitOptions splitOptions = request.getOptions();
+    if (splitOptions != null && splitOptions.getDesiredBundleSizeBytes() != null) {
+      desiredBundleSizeBytes = splitOptions.getDesiredBundleSizeBytes();
+    }
+
+    // Try generating initial splits normally.
+    SourceSplitResponse splits = performSplit(source, options, desiredBundleSizeBytes);
+    long serializedSize = DataflowApiUtils.computeSerializedSizeBytes(splits);
+
+    // If split response is too large, scale desired size for expected DATAFLOW_API_SIZE_BYTES/2.
+    if (serializedSize > DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES) {
+      double expansion = 2 * (double) serializedSize / DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES;
+      long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);
+      LOG.warn(
+          "Splitting source {} into bundles of estimated size {} bytes produced {} bundles, which"
+              + " have total serialized size {} bytes. As this is too large for the Google Cloud"
+              + " Dataflow API, retrying splitting once with increased desiredBundleSizeBytes {}"
+              + " to reduce the number of splits.",
+          source,
+          desiredBundleSizeBytes,
+          splits.getBundles().size(),
+          serializedSize,
+          expandedBundleSizeBytes);
+      splits = performSplit(source, options, expandedBundleSizeBytes);
+    }
+
+    return new SourceOperationResponse().setSplit(splits);
   }
 
   /**
@@ -270,9 +296,8 @@ private UnboundedSource<T, UnboundedSource.CheckpointMark> parseSource(int index
       } catch (Exception e) {
         throw new RuntimeException("Parsing serialized source splits failed: ", e);
       }
-      Preconditions.checkArgument(
-          serializedSplits != null, "UnboundedSource object did not contain splits");
-      Preconditions.checkArgument(
+      checkArgument(serializedSplits != null, "UnboundedSource object did not contain splits");
+      checkArgument(
           index < serializedSplits.size(),
           "UnboundedSource splits contained too few splits. Requested index was %s, size was %s",
           index,
@@ -287,66 +312,48 @@ private UnboundedSource<T, UnboundedSource.CheckpointMark> parseSource(int index
   }
 
   private static SourceSplitResponse performSplit(
-      SourceSplitRequest request, PipelineOptions options)
+      BoundedSource<?> source, PipelineOptions options, long desiredBundleSizeBytes)
       throws Exception {
-    Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
-    if (!(anySource instanceof BoundedSource)) {
-      throw new UnsupportedOperationException("Cannot split a non-Bounded source: " + anySource);
-    }
-    BoundedSource<?> source = (BoundedSource<?>) anySource;
-    LOG.debug("Splitting source: {}", source);
+    LOG.debug("Splitting source {} into bundles of size {}", source, desiredBundleSizeBytes);
 
-    // Produce simple independent, unsplittable bundles with no metadata attached.
-    SourceSplitResponse response = new SourceSplitResponse();
-    response.setBundles(new ArrayList<DerivedSource>());
-    SourceSplitOptions splitOptions = request.getOptions();
-    Long desiredBundleSizeBytes =
-        (splitOptions == null) ? null : splitOptions.getDesiredBundleSizeBytes();
-    if (desiredBundleSizeBytes == null) {
-      desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
-    }
     List<? extends BoundedSource<?>> bundles =
-        source.splitIntoBundles(desiredBundleSizeBytes, options);
-
-    if (bundles.size() > MAX_NUMBER_OF_SPLITS) {
-      throw new IOException(
-          String.format(TOO_MANY_SOURCE_SPLITS_ERROR, bundles.size(), MAX_NUMBER_OF_SPLITS));
-    }
+        ((BoundedSource<?>) source).splitIntoBundles(desiredBundleSizeBytes, options);
+    List<DerivedSource> splits = new ArrayList<>(bundles.size());
 
+    // Produce simple independent, unsplittable bundles with no metadata attached.
     LOG.debug("Splitting produced {} bundles", bundles.size());
     for (BoundedSource<?> split : bundles) {
       try {
         split.validate();
       } catch (Exception e) {
         throw new IllegalArgumentException(
-            "Splitting a valid source produced an invalid bundle. "
-                + "\nOriginal source: "
-                + source
-                + "\nInvalid bundle: "
-                + split,
+            String.format(
+                "Splitting a valid source produced an invalid source."
+                    + "\nOriginal source: %s\nInvalid source: %s",
+                source,
+                split),
             e);
       }
-      DerivedSource bundle = new DerivedSource();
 
-      com.google.api.services.dataflow.model.Source cloudSource =
-          serializeToCloudSource(split, options);
-      cloudSource.setDoesNotNeedSplitting(true);
-
-      bundle.setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT");
-      bundle.setSource(cloudSource);
-      response.getBundles().add(bundle);
+      splits.add(
+          new DerivedSource()
+              .setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT")
+              .setSource(serializeToCloudSource(split, options).setDoesNotNeedSplitting(true)));
     }
-    response.setOutcome("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED");
-    return response;
+
+    // Return all the splits in the SourceSplitResponse.
+    return new SourceSplitResponse()
+        .setBundles(splits)
+        .setOutcome("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED");
   }
 
-  public static Source<?> deserializeFromCloudSource(Map<String, Object> spec) throws Exception {
+  private static Source<?> deserializeFromCloudSource(Map<String, Object> spec) throws Exception {
     Source<?> source = (Source<?>) deserializeFromByteArray(
         Base64.decodeBase64(getString(spec, SERIALIZED_SOURCE)), "Source");
     try {
       source.validate();
     } catch (Exception e) {
-      LOG.error("Invalid source: " + source, e);
+      LOG.error("Invalid source: {}", source, e);
       throw e;
     }
     return source;
@@ -396,8 +403,7 @@ public static com.google.api.services.dataflow.model.Source serializeToCloudSour
           unboundedSource.generateInitialSplits(desiredNumSplits, options)) {
         encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
       }
-      Preconditions.checkArgument(
-          !encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
+      checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
       addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
     } else {
       throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/DataflowApiUtils.java

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+/*******************************************************************************
+ * Copyright (C) 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ ******************************************************************************/
+package com.google.cloud.dataflow.sdk.runners.worker;
+
+import com.google.api.client.json.GenericJson;
+import com.google.api.client.json.JsonFactory;
+import com.google.api.client.json.JsonGenerator;
+import com.google.cloud.dataflow.sdk.util.Transport;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.CountingOutputStream;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * A utility class for generic interactions with the Google Cloud Dataflow API.
+ */
+public final class DataflowApiUtils {
+  /**
+   * Determines the serialized size (in bytes) of the {@link GenericJson} object that will be
+   * serialized and sent to the Google Cloud Dataflow service API.
+   *
+   * <p>Uses only constant memory.
+   */
+  public static long computeSerializedSizeBytes(GenericJson object) throws IOException {
+    JsonFactory factory = object.getFactory();
+    if (factory == null) {
+      factory = Transport.getJsonFactory();
+    }
+
+    CountingOutputStream stream = new CountingOutputStream(ByteStreams.nullOutputStream());
+    JsonGenerator generator = null;
+    try {
+      generator = factory.createJsonGenerator(stream, StandardCharsets.UTF_8);
+      generator.serialize(object);
+      generator.close(); // also closes the stream.
+    } finally {
+      if (generator != null) {
+        generator.close();
+      }
+    }
+    return stream.getCount();
+  }
+
+  // Prevent construction of utility class.
+  private DataflowApiUtils() {}
+}
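
For reference, a minimal usage sketch of the new utility. The wrapper class, the toy response contents, and the printed message are hypothetical; only DataflowApiUtils.computeSerializedSizeBytes and the Dataflow model setters already shown in the diffs above are from this commit.

import com.google.api.services.dataflow.model.DerivedSource;
import com.google.api.services.dataflow.model.SourceSplitResponse;
import com.google.cloud.dataflow.sdk.runners.worker.DataflowApiUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class SerializedSizeExample {
  public static void main(String[] args) throws IOException {
    // Build a tiny split response and measure how large it would be on the wire,
    // without buffering the serialized JSON in memory.
    List<DerivedSource> bundles = new ArrayList<>();
    bundles.add(new DerivedSource().setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT"));
    SourceSplitResponse response = new SourceSplitResponse()
        .setBundles(bundles)
        .setOutcome("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED");

    long sizeBytes = DataflowApiUtils.computeSerializedSizeBytes(response);
    System.out.println("Split response would serialize to " + sizeBytes + " bytes");
  }
}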

sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/DataflowWorker.java

Lines changed: 12 additions & 5 deletions
@@ -41,6 +41,7 @@
 import com.google.cloud.dataflow.sdk.util.PCollectionViewWindow;
 import com.google.cloud.dataflow.sdk.util.SideInputReader;
 import com.google.cloud.dataflow.sdk.util.UserCodeException;
+import com.google.cloud.dataflow.sdk.util.Weighted;
 import com.google.cloud.dataflow.sdk.util.WeightedValue;
 import com.google.cloud.dataflow.sdk.util.common.Counter;
 import com.google.cloud.dataflow.sdk.util.common.CounterSet;
@@ -106,7 +107,7 @@ public class DataflowWorker {
   private final UserCodeTimeTracker userCodeTimeTracker = new UserCodeTimeTracker();
 
   /**
-   * A weight in "bytes" for the overhead of a {@link Sized} wrapper in the cache. It is just an
+   * A weight in "bytes" for the overhead of a {@link Weighted} wrapper in the cache. It is just an
    * approximation so it is OK for it to be fairly arbitrary as long as it is nonzero.
    */
   private static final int OVERHEAD_WEIGHT = 8;
@@ -154,7 +155,7 @@ private boolean doWork(WorkItem workItem) throws IOException {
     // Populate PipelineOptions with data from work unit.
     options.setProject(workItem.getProjectId());
 
-    DataflowExecutionContext executionContext =
+    DataflowExecutionContext<?> executionContext =
         new DataflowWorkerExecutionContext(sideInputCache, options);
 
     CounterSet counters = new CounterSet();
@@ -268,9 +269,15 @@ private void handleWorkError(WorkItem workItem, WorkExecutor worker, long nextRe
     // TODO: Attach the stack trace as exception details, not to the message.
     error.setMessage(DataflowWorkerLoggingHandler.formatException(t));
 
-    reportStatus(options, "Failure", workItem, worker == null ? null : worker.getOutputCounters(),
-        worker == null ? null : worker.getOutputMetrics(), null/*sourceOperationResponse*/,
-        error == null ? null : Collections.singletonList(error), nextReportIndex);
+    reportStatus(
+        options,
+        "Failure",
+        workItem,
+        worker == null ? null : worker.getOutputCounters(),
+        worker == null ? null : worker.getOutputMetrics(),
+        null /*sourceOperationResponse*/,
+        Collections.singletonList(error),
+        nextReportIndex);
   }
 
   private void reportStatus(DataflowWorkerHarnessOptions options, String status, WorkItem workItem,
