import static com.google.cloud.dataflow.sdk.util.Structs.addStringList;
import static com.google.cloud.dataflow.sdk.util.Structs.getString;
import static com.google.cloud.dataflow.sdk.util.Structs.getStrings;
+ import static com.google.common.base.Preconditions.checkArgument;

import com.google.api.client.util.BackOff;
import com.google.api.client.util.Base64;
import com.google.api.services.dataflow.model.DerivedSource;
import com.google.api.services.dataflow.model.DynamicSourceSplit;
import com.google.api.services.dataflow.model.SourceMetadata;
- import com.google.api.services.dataflow.model.SourceOperationRequest;
import com.google.api.services.dataflow.model.SourceOperationResponse;
import com.google.api.services.dataflow.model.SourceSplitOptions;
import com.google.api.services.dataflow.model.SourceSplitRequest;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineTranslator;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
+ import com.google.cloud.dataflow.sdk.runners.worker.DataflowApiUtils;
import com.google.cloud.dataflow.sdk.runners.worker.ReaderFactory;
import com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils;
import com.google.cloud.dataflow.sdk.runners.worker.StreamingModeExecutionContext;
import com.google.cloud.dataflow.sdk.util.common.worker.NativeReader;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.common.annotations.VisibleForTesting;
- import com.google.common.base.Preconditions;
import com.google.protobuf.ByteString;

import org.joda.time.Duration;
@@ -87,15 +87,11 @@ public class CustomSources {
  private static final String SERIALIZED_SOURCE = "serialized_source";
  @VisibleForTesting static final String SERIALIZED_SOURCE_SPLITS = "serialized_source_splits";
  private static final long DEFAULT_DESIRED_BUNDLE_SIZE_BYTES = 64 * (1 << 20);
-
-   public static final String TOO_MANY_SOURCE_SPLITS_ERROR =
-       "Total number of Source objects generated by splitIntoBundles() operation, %d, is"
-       + " larger than the allowable limit, %d. For more information, please check the corresponding"
-       + " FAQ entry at:\n"
-       + "https://cloud.google.com/dataflow/faq";
-
-   // Maximum number of custom source splits currently supported by Dataflow.
-   private static final int MAX_NUMBER_OF_SPLITS = 16000;
+   /**
+    * The current limit on the size of a ReportWorkItemStatus RPC to Google Cloud Dataflow, which
+    * includes the initial splits, is 20 MB.
+    */
+   public static final long DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES = 20 * (1 << 20);

  private static final Logger LOG = LoggerFactory.getLogger(CustomSources.class);
@@ -144,17 +140,47 @@ public static DynamicSourceSplit toSourceSplit(
   * Executes a protocol-level split {@code SourceOperationRequest} for bounded sources
   * by deserializing its source to a {@code BoundedSource}, splitting it, and
   * serializing results back.
+  *
+  * <p>When the splits produced by this function are too large to be serialized to the Dataflow
+  * API, splitting is retried once with a larger desired bundle size, to reduce the number of
+  * splits and work around the API limit on the size of the split response.
   */
-  public static SourceOperationResponse performSourceOperation(
-      SourceOperationRequest request, PipelineOptions options) throws Exception {
-    SourceOperationResponse response = new SourceOperationResponse();
-    if (request.getSplit() != null) {
-      response.setSplit(performSplit(request.getSplit(), options));
-    } else {
-      throw new UnsupportedOperationException(
-          "Unsupported source operation request: " + request);
-    }
-    return response;
+  public static SourceOperationResponse performSplit(
+      SourceSplitRequest request, PipelineOptions options) throws Exception {
+    Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
+    checkArgument(
+        anySource instanceof BoundedSource, "Cannot split a non-Bounded source: %s", anySource);
+    BoundedSource<?> source = (BoundedSource<?>) anySource;
+
+    // Compute the desired bundle size given by the service, or default if none was provided.
+    long desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
+    SourceSplitOptions splitOptions = request.getOptions();
+    if (splitOptions != null && splitOptions.getDesiredBundleSizeBytes() != null) {
+      desiredBundleSizeBytes = splitOptions.getDesiredBundleSizeBytes();
+    }
+
+    // Try generating initial splits normally.
+    SourceSplitResponse splits = performSplit(source, options, desiredBundleSizeBytes);
+    long serializedSize = DataflowApiUtils.computeSerializedSizeBytes(splits);
+
+    // If the split response is too large, scale up the desired bundle size so that the retried
+    // response is expected to be about half of DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES.
+    if (serializedSize > DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES) {
+      double expansion = 2 * (double) serializedSize / DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES;
+      long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion);
+      LOG.warn(
+          "Splitting source {} into bundles of estimated size {} bytes produced {} bundles, which"
+              + " have total serialized size {} bytes. As this is too large for the Google Cloud"
+              + " Dataflow API, retrying splitting once with increased desiredBundleSizeBytes {}"
+              + " to reduce the number of splits.",
+          source,
+          desiredBundleSizeBytes,
+          splits.getBundles().size(),
+          serializedSize,
+          expandedBundleSizeBytes);
+      splits = performSplit(source, options, expandedBundleSizeBytes);
+    }
+
+    return new SourceOperationResponse().setSplit(splits);
  }
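
Note (illustration, not part of this change): the retry above sizes the second split attempt from the first one. The factor 2 * serializedSize / DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES aims the retried response at roughly half the 20 MB limit, assuming the response size shrinks in proportion to the number of bundles. A minimal standalone sketch of that arithmetic; the class name and the sample sizes are illustrative, and LIMIT_BYTES stands in for DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES:

    public class SplitExpansionSketch {
      // Mirrors DATAFLOW_SPLIT_RESPONSE_API_SIZE_BYTES from the diff: 20 MB.
      static final long LIMIT_BYTES = 20 * (1 << 20);

      public static void main(String[] args) {
        long desiredBundleSizeBytes = 64 * (1 << 20);  // default bundle size from the diff
        long serializedSize = 50 * (1 << 20);          // hypothetical oversized split response

        if (serializedSize > LIMIT_BYTES) {
          // Same formula as the diff: aim the retried response at roughly LIMIT_BYTES / 2,
          // assuming response size shrinks proportionally to the number of bundles.
          double expansion = 2 * (double) serializedSize / LIMIT_BYTES;              // 5.0
          long expandedBundleSizeBytes = (long) (desiredBundleSizeBytes * expansion); // 320 MB
          System.out.printf("expansion=%.1f, expandedBundleSizeBytes=%d%n",
              expansion, expandedBundleSizeBytes);
        }
      }
    }

With a 50 MB first response against the 20 MB limit, the bundle size grows 5x (64 MB to 320 MB), so the retried response is expected to come in around 10 MB.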
/**
@@ -270,9 +296,8 @@ private UnboundedSource<T, UnboundedSource.CheckpointMark> parseSource(int index
    } catch (Exception e) {
      throw new RuntimeException("Parsing serialized source splits failed: ", e);
    }
-   Preconditions.checkArgument(
-       serializedSplits != null, "UnboundedSource object did not contain splits");
-   Preconditions.checkArgument(
+   checkArgument(serializedSplits != null, "UnboundedSource object did not contain splits");
+   checkArgument(
        index < serializedSplits.size(),
        "UnboundedSource splits contained too few splits. Requested index was %s, size was %s",
        index,
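
Note (illustration, not part of this change): the replacement above statically imports Guava's checkArgument and passes %s message templates instead of pre-built strings; Guava formats the template only when the check fails. A small self-contained sketch of the same pattern, with illustrative class and method names:

    import static com.google.common.base.Preconditions.checkArgument;

    import java.util.Arrays;
    import java.util.List;

    public class CheckArgumentSketch {
      static void requireIndex(List<String> splits, int index) {
        checkArgument(splits != null, "UnboundedSource object did not contain splits");
        // %s placeholders are substituted by Guava only if the condition is false.
        checkArgument(
            index < splits.size(),
            "Requested index was %s, size was %s",
            index,
            splits.size());
      }

      public static void main(String[] args) {
        requireIndex(Arrays.asList("split-0", "split-1"), 1);  // passes
      }
    }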
@@ -287,66 +312,48 @@ private UnboundedSource<T, UnboundedSource.CheckpointMark> parseSource(int index
  }

  private static SourceSplitResponse performSplit(
-     SourceSplitRequest request, PipelineOptions options)
+     BoundedSource<?> source, PipelineOptions options, long desiredBundleSizeBytes)
      throws Exception {
-   Source<?> anySource = deserializeFromCloudSource(request.getSource().getSpec());
-   if (!(anySource instanceof BoundedSource)) {
-     throw new UnsupportedOperationException("Cannot split a non-Bounded source: " + anySource);
-   }
-   BoundedSource<?> source = (BoundedSource<?>) anySource;
-   LOG.debug("Splitting source: {}", source);
+   LOG.debug("Splitting source {} into bundles of size {}", source, desiredBundleSizeBytes);

-   // Produce simple independent, unsplittable bundles with no metadata attached.
-   SourceSplitResponse response = new SourceSplitResponse();
-   response.setBundles(new ArrayList<DerivedSource>());
-   SourceSplitOptions splitOptions = request.getOptions();
-   Long desiredBundleSizeBytes =
-       (splitOptions == null) ? null : splitOptions.getDesiredBundleSizeBytes();
-   if (desiredBundleSizeBytes == null) {
-     desiredBundleSizeBytes = DEFAULT_DESIRED_BUNDLE_SIZE_BYTES;
-   }
    List<? extends BoundedSource<?>> bundles =
-       source.splitIntoBundles(desiredBundleSizeBytes, options);
-
-   if (bundles.size() > MAX_NUMBER_OF_SPLITS) {
-     throw new IOException(
-         String.format(TOO_MANY_SOURCE_SPLITS_ERROR, bundles.size(), MAX_NUMBER_OF_SPLITS));
-   }
+       ((BoundedSource<?>) source).splitIntoBundles(desiredBundleSizeBytes, options);
+   List<DerivedSource> splits = new ArrayList<>(bundles.size());

+   // Produce simple independent, unsplittable bundles with no metadata attached.
    LOG.debug("Splitting produced {} bundles", bundles.size());
    for (BoundedSource<?> split : bundles) {
      try {
        split.validate();
      } catch (Exception e) {
        throw new IllegalArgumentException(
-           "Splitting a valid source produced an invalid bundle. "
-               + "\nOriginal source: "
-               + source
-               + "\nInvalid bundle: "
-               + split,
+           String.format(
+               "Splitting a valid source produced an invalid source. "
+                   + "\nOriginal source: %s\nInvalid source: %s",
+               source,
+               split),
            e);
      }
-     DerivedSource bundle = new DerivedSource();
-     com.google.api.services.dataflow.model.Source cloudSource =
-         serializeToCloudSource(split, options);
-     cloudSource.setDoesNotNeedSplitting(true);
-
-     bundle.setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT");
-     bundle.setSource(cloudSource);
-     response.getBundles().add(bundle);
+     splits.add(
+         new DerivedSource()
+             .setDerivationMode("SOURCE_DERIVATION_MODE_INDEPENDENT")
+             .setSource(serializeToCloudSource(split, options).setDoesNotNeedSplitting(true)));
    }
-   response.setOutcome("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED");
-   return response;
+
+   // Return all the splits in the SourceSplitResponse.
+   return new SourceSplitResponse()
+       .setBundles(splits)
+       .setOutcome("SOURCE_SPLIT_OUTCOME_SPLITTING_HAPPENED");
  }

- public static Source<?> deserializeFromCloudSource(Map<String, Object> spec) throws Exception {
+ private static Source<?> deserializeFromCloudSource(Map<String, Object> spec) throws Exception {
    Source<?> source = (Source<?>) deserializeFromByteArray(
        Base64.decodeBase64(getString(spec, SERIALIZED_SOURCE)), "Source");
    try {
      source.validate();
    } catch (Exception e) {
-     LOG.error("Invalid source: " + source, e);
+     LOG.error("Invalid source: {}", source, e);
      throw e;
    }
    return source;
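
Note (illustration, not part of this change): the LOG.error change above switches from string concatenation to SLF4J parameterized logging; in SLF4J 1.6+ a trailing Throwable argument is logged with its stack trace and does not consume a {} placeholder. A small self-contained sketch, with illustrative class and method names:

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class LoggingSketch {
      private static final Logger LOG = LoggerFactory.getLogger(LoggingSketch.class);

      static void report(Object source, Exception e) {
        // No eager concatenation; the trailing exception is logged with its stack trace.
        LOG.error("Invalid source: {}", source, e);
      }

      public static void main(String[] args) {
        report("my-source", new IllegalStateException("validation failed"));
      }
    }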
@@ -396,8 +403,7 @@ public static com.google.api.services.dataflow.model.Source serializeToCloudSour
          unboundedSource.generateInitialSplits(desiredNumSplits, options)) {
        encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
      }
-     Preconditions.checkArgument(
-         !encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
+     checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
      addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
    } else {
      throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
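
Note (illustration, not part of this change): in the context lines above, each initial split of the UnboundedSource is serialized, base64-encoded, and stored as a string list under SERIALIZED_SOURCE_SPLITS; parseSource later decodes the entry at the requested index. A minimal sketch of that encode/decode round trip using plain Java serialization and java.util.Base64; the SDK uses its own serializeToByteArray/encodeBase64String helpers, so this only illustrates the pattern, and all names here are illustrative:

    import java.io.*;
    import java.util.ArrayList;
    import java.util.Base64;
    import java.util.List;

    public class SplitEncodingSketch {
      static String encodeSplit(Serializable split) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
          out.writeObject(split);
        }
        return Base64.getEncoder().encodeToString(bytes.toByteArray());
      }

      static Object decodeSplit(String encoded) throws IOException, ClassNotFoundException {
        byte[] bytes = Base64.getDecoder().decode(encoded);
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
          return in.readObject();
        }
      }

      public static void main(String[] args) throws Exception {
        // Stand-ins for serialized UnboundedSource splits.
        List<String> encodedSplits = new ArrayList<>();
        for (String split : new String[] {"split-0", "split-1"}) {
          encodedSplits.add(encodeSplit(split));
        }
        System.out.println(decodeSplit(encodedSplits.get(1)));  // prints "split-1"
      }
    }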