@@ -251,6 +251,21 @@ def benchmark_xla_1_gpu_fp16(self):
     FLAGS.batch_size = 256
     self._run_and_report_benchmark()

+  def benchmark_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
+    self._run_and_report_benchmark()
+
   def benchmark_xla_1_gpu_fp16_dynamic(self):
     """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
     self._setup()
@@ -313,6 +328,23 @@ def benchmark_graph_xla_1_gpu_fp16(self):
     FLAGS.batch_size = 256
     self._run_and_report_benchmark()

+  def benchmark_graph_xla_1_gpu_fp16_tweaked(self):
+    """Test Keras model in legacy graph mode with 1 GPU, fp16, XLA, and manual
+    config tuning.
+    """
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = False
+    FLAGS.enable_xla = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_xla_1_gpu_fp16_tweaked')
+    FLAGS.dtype = 'fp16'
+    FLAGS.batch_size = 256
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    self._run_and_report_benchmark()
+
   def benchmark_8_gpu(self):
     """Test Keras model with 8 GPUs."""
     self._setup()
@@ -334,6 +366,7 @@ def benchmark_8_gpu_tweaked(self):
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
     FLAGS.batch_size = 128 * 8  # 8 GPUs
     FLAGS.datasets_num_private_threads = 14
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_xla_8_gpu(self):
@@ -371,6 +404,7 @@ def benchmark_8_gpu_fp16_tweaked(self):
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
     FLAGS.batch_size = 256 * 8  # 8 GPUs
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_8_gpu_fp16_dynamic_tweaked(self):
@@ -386,6 +420,7 @@ def benchmark_8_gpu_fp16_dynamic_tweaked(self):
     FLAGS.batch_size = 256 * 8  # 8 GPUs
     FLAGS.loss_scale = 'dynamic'
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_xla_8_gpu_fp16(self):
@@ -412,7 +447,8 @@ def benchmark_xla_8_gpu_fp16_tweaked(self):
     FLAGS.distribution_strategy = 'default'
     FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
     FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    # FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
@@ -429,6 +465,7 @@ def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
     FLAGS.batch_size = 256 * 8  # 8 GPUs
     FLAGS.loss_scale = 'dynamic'
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_xla_8_gpu_fp16_tensorboard_tweaked(self):
@@ -444,6 +481,7 @@ def benchmark_xla_8_gpu_fp16_tensorboard_tweaked(self):
         'benchmark_xla_8_gpu_fp16_tensorboard_tweaked')
     FLAGS.batch_size = 256 * 8  # 8 GPUs
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     FLAGS.enable_tensorboard = True
     self._run_and_report_benchmark()

@@ -636,6 +674,7 @@ def benchmark_8_gpu_tweaked(self):
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
     FLAGS.batch_size = 256 * 8
     FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.data_prefetch_with_slack = True
     self._run_and_report_benchmark()

   def benchmark_graph_8_gpu(self):
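Note on the new flag: the `data_prefetch_with_slack` setting enabled throughout this diff is defined and consumed outside the lines shown here. As a rough sketch only, assuming the flag maps to tf.data's `experimental_slack` option (the helper name below is hypothetical, not part of this change):

import tensorflow as tf


def maybe_enable_prefetch_slack(dataset, data_prefetch_with_slack):
  """Hypothetical helper: apply prefetch-with-slack when the flag is set."""
  if data_prefetch_with_slack:
    options = tf.data.Options()
    # Adds slack to the terminal prefetch so the host input pipeline is less
    # likely to contend with accelerator compute.
    options.experimental_slack = True
    dataset = dataset.with_options(options)
  return dataset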