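Update multi-GPU training: create MirroredStrategy only when more than one GPU is used, pass the strategy into fit_one_epoch, and replace fit_generator with fit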

bubbliiiing · bubbliiiing · commit c16cd5e012a3 · 2022-04-14T19:52:16.000+08:00
diff --git a/README.md b/README.md
@@ -13,6 +13,8 @@
 9. [参考资料 Reference](#Reference)
 
 ## Top News
+**`2022-04`**:**支持多GPU训练，新增各个种类目标数量计算，新增heatmap。**  
+
 **`2022-03`**:**进行了大幅度的更新，修改了loss组成，使得分类、目标、回归loss的比例合适、支持step、cos学习率下降法、支持adam、sgd优化器选择、支持学习率根据batch_size自适应调整、新增图片裁剪。**  
 BiliBili视频中的原仓库地址为：https://github.com/bubbliiiing/yolov4-tiny-tf2/tree/bilibili
 
diff --git a/train.py b/train.py
@@ -230,8 +230,11 @@
     for gpu in gpus:
         tf.config.experimental.set_memory_growth(gpu, True)
         
-    strategy = tf.distribute.MirroredStrategy()
-    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
+    if ngpus_per_node > 1:
+        strategy = tf.distribute.MirroredStrategy()
+    else:
+        strategy = None
+    print('Number of devices: {}'.format(ngpus_per_node))
 
     #----------------------------------------------------#
     #   获取classes和anchor
@@ -386,7 +389,7 @@
                 K.set_value(optimizer.lr, lr)
 
                 fit_one_epoch(model_body, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, 
-                            end_epoch, input_shape, anchors, anchors_mask, num_classes, label_smoothing, save_period, save_dir)
+                            end_epoch, input_shape, anchors, anchors_mask, num_classes, label_smoothing, save_period, save_dir, strategy)
 
                 train_dataloader.on_epoch_end()
                 val_dataloader.on_epoch_end()
@@ -418,8 +421,8 @@
 
             if start_epoch < end_epoch:
                 print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
-                model.fit_generator(
-                    generator           = train_dataloader,
+                model.fit(
+                    x                   = train_dataloader,
                     steps_per_epoch     = epoch_step,
                     validation_data     = val_dataloader,
                     validation_steps    = epoch_step_val,
@@ -471,8 +474,8 @@
                 val_dataloader.batch_size      = Unfreeze_batch_size
 
                 print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
-                model.fit_generator(
-                    generator           = train_dataloader,
+                model.fit(
+                    x                   = train_dataloader,
                     steps_per_epoch     = epoch_step,
                     validation_data     = val_dataloader,
                     validation_steps    = epoch_step_val,
diff --git a/utils/utils_fit.py b/utils/utils_fit.py
@@ -8,7 +8,7 @@
 #------------------------------#
 #   防止bug
 #------------------------------#
-def get_train_step_fn(input_shape, anchors, anchors_mask, num_classes, label_smoothing):
+def get_train_step_fn(input_shape, anchors, anchors_mask, num_classes, label_smoothing, strategy):
     @tf.function
     def train_step(imgs, targets, net, optimizer):
         with tf.GradientTape() as tape:
@@ -32,11 +32,23 @@ def train_step(imgs, targets, net, optimizer):
         grads = tape.gradient(loss_value, net.trainable_variables)
         optimizer.apply_gradients(zip(grads, net.trainable_variables))
         return loss_value
-    return train_step
+    
+    if strategy == None:
+        return train_step
+    else:
+        #----------------------#
+        #   Multi-GPU training: run train_step on every replica and sum the per-replica losses
+        #----------------------#
+        @tf.function
+        def distributed_train_step(images, targets, net, optimizer):
+            per_replica_losses = strategy.run(train_step, args=(images, targets, net, optimizer,))
+            return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
+                                    axis=None)
+        return distributed_train_step
 
 def fit_one_epoch(net, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, 
-            input_shape, anchors, anchors_mask, num_classes, label_smoothing, save_period, save_dir):
-    train_step  = get_train_step_fn(input_shape, anchors, anchors_mask, num_classes, label_smoothing)
+            input_shape, anchors, anchors_mask, num_classes, label_smoothing, save_period, save_dir, strategy):
+    train_step  = get_train_step_fn(input_shape, anchors, anchors_mask, num_classes, label_smoothing, strategy)
     loss        = 0
     val_loss    = 0
     print('Start Train')