
Commit 4df90eb

Fixed CTA rates update in DDP
- added main_fully_supervised.py for debugging DDP vs DP
1 parent: 7dc64cc

File tree: 7 files changed, +774 −48 lines

README.md

Lines changed: 9 additions & 1 deletion

@@ -16,6 +16,11 @@ pip install --upgrade --pre pytorch-ignite
 python -u main_fixmatch.py
 # or python -u main_fixmatch.py --params "data_path=/path/to/cifar10"
 ```
+### DDP
+
+```bash
+python -u -m torch.distributed.launch --nproc_per_node=2 main_fixmatch.py --params="dist_backend='nccl'"
+```
 
 ## TODO
 
@@ -26,7 +31,10 @@ BUGS:
 * [x] save/load CTA
 * [x] save ema model
 
-* [ ] DDP: Synchronize CTA across processes
+* [x] DDP: Synchronize CTA across processes
+
+* [ ] Bug: DDP performances are worse than DP on the first epochs
+* [ ] Increase batch_size -> batch_size * WS => LR, epoch_length
 
 * [ ] Logging to online platform: NeptuneML or Trains or W&B
 
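
The last TODO item is shorthand for the usual DDP bookkeeping: with WS processes each drawing its own batch, the effective batch size becomes batch_size * WS, so the learning rate is typically scaled up and the per-process epoch_length scaled down to keep the schedule comparable. A minimal sketch of that adjustment (the config keys and helper name are illustrative, not the repository's actual ones):

```python
import torch.distributed as dist

def scale_config_for_ddp(config):
    # world size is 1 when not launched with torch.distributed
    ws = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
    config["learning_rate"] *= ws                                  # linear scaling rule for the larger effective batch
    config["epoch_length"] = max(1, config["epoch_length"] // ws)  # keep samples seen per epoch roughly constant
    return config

# single process: unchanged; with 2 processes it would give lr=0.06, epoch_length=512
print(scale_config_for_ddp({"learning_rate": 0.03, "epoch_length": 1024}))
```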

base_train.py

Lines changed: 28 additions & 24 deletions

@@ -54,7 +54,7 @@ def run(trainer, config):
     unsup_criterion = nn.CrossEntropyLoss(reduction='none').to(utils.device)
 
     num_epochs = config["num_epochs"]
-    epoch_length = config["epoch_length"]
+    epoch_length = config["epoch_length"]
     total_num_iters = num_epochs * epoch_length
     lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_num_iters, eta_min=0.0)
 
@@ -64,15 +64,14 @@ def run(trainer, config):
         model=model, ema_model=ema_model, optimizer=optimizer,
         sup_criterion=sup_criterion, unsup_criterion=unsup_criterion,
         cta=cta,
-        device=utils.device
     )
 
     # Setup handler to prepare data batches
     @trainer.on(Events.ITERATION_STARTED)
     def prepare_batch(e):
         sup_batch = next(supervised_train_loader_iter)
         unsup_batch = next(unsupervised_train_loader_iter)
-        cta_probe_batch = next(cta_probe_loader_iter)
+        cta_probe_batch = next(cta_probe_loader_iter)
         e.state.batch = {
             "sup_batch": utils.sup_prepare_batch(sup_batch, utils.device, non_blocking=True),
             "unsup_batch": (
@@ -119,27 +118,30 @@ def update_ema_model(ema_decay):
             ema_param.data.mul_(ema_decay).add_(param.data, alpha=1.0 - ema_decay)
 
     # Setup handlers for debugging
-    if debug and rank == 0:
+    if debug:
 
         @trainer.on(Events.STARTED | Events.ITERATION_COMPLETED(every=100))
         def log_weights_norms(_):
-            wn = []
-            ema_wn = []
-            for ema_param, param in zip(ema_model.parameters(), model.parameters()):
-                wn.append(torch.mean(param.data))
-                ema_wn.append(torch.mean(ema_param.data))
-
-            print("\n\nWeights norms")
-            print("\n- Raw model: {}".format(utils.to_list_str(torch.tensor(wn[:10] + wn[-10:]))))
-            print("- EMA model: {}\n".format(utils.to_list_str(torch.tensor(ema_wn[:10] + ema_wn[-10:]))))
-
-        profiler = BasicTimeProfiler()
-        profiler.attach(trainer)
-
-        @trainer.on(Events.ITERATION_COMPLETED(every=200))
-        def log_profiling(_):
-            results = profiler.get_results()
-            profiler.print_results(results)
+
+            if rank == 0:
+                wn = []
+                ema_wn = []
+                for ema_param, param in zip(ema_model.parameters(), model.parameters()):
+                    wn.append(torch.mean(param.data))
+                    ema_wn.append(torch.mean(ema_param.data))
+
+                print("\n\nWeights norms")
+                print("\n- Raw model: {}".format(utils.to_list_str(torch.tensor(wn[:10] + wn[-10:]))))
+                print("- EMA model: {}\n".format(utils.to_list_str(torch.tensor(ema_wn[:10] + ema_wn[-10:]))))
+
+        if rank == 0:
+            profiler = BasicTimeProfiler()
+            profiler.attach(trainer)
+
+            @trainer.on(Events.ITERATION_COMPLETED(every=200))
+            def log_profiling(_):
+                results = profiler.get_results()
+                profiler.print_results(results)
 
     # Setup validation engine
     metrics = {
@@ -190,7 +192,7 @@ def run_evaluation():
     if config["display_iters"]:
         ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)
         ProgressBar(persist=False, desc="Test EMA evaluation").attach(ema_evaluator)
-
+
     data = list(range(epoch_length))
 
     resume_from = list(Path(config["output_path"]).rglob("training_checkpoint*.pt*"))
@@ -212,6 +214,8 @@ def run_evaluation():
     if rank == 0:
         tb_logger.close()
 
+    supervised_train_loader_iter = unsupervised_train_loader_iter = cta_probe_loader_iter = None
+
 
 def main(trainer, config):
     parser = argparse.ArgumentParser("Semi-Supervised Learning - FixMatch with CTA: Train WRN-28-2 on CIFAR10 dataset")
@@ -238,8 +242,8 @@ def main(trainer, config):
                 value = eval(value)
             config[key] = value
 
-    if config["local_rank"] == 0:
-        ds_id = "{}".format(config["num_train_samples_per_class"] * 10)
+    ds_id = "{}".format(config["num_train_samples_per_class"] * 10)
+    if config["local_rank"] == 0:
         print("SSL Training of {} on CIFAR10@{}".format(config["model"], ds_id))
         print("- PyTorch version: {}".format(torch.__version__))
        print("- Ignite version: {}".format(ignite.__version__))

ctaugment.py

Lines changed: 2 additions & 9 deletions

@@ -1,4 +1,5 @@
 # https://raw.githubusercontent.com/google-research/fixmatch/master/libml/ctaugment.py
+#
 # Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +20,7 @@
 import numpy as np
 from PIL import Image, ImageOps, ImageEnhance, ImageFilter
 
+
 OPS = {}
 OP = namedtuple('OP', ('f', 'bins'))
 Sample = namedtuple('Sample', ('train', 'probe'))
@@ -32,15 +34,6 @@ def wrap(f):
     return wrap
 
 
-def apply(x, ops):
-    if ops is None:
-        return x
-    y = Image.fromarray(np.round(127.5 * (1 + x)).clip(0, 255).astype('uint8'))
-    for op, args in ops:
-        y = OPS[op].f(y, *args)
-    return np.asarray(y).astype('f') / 127.5 - 1
-
-
 class CTAugment:
     def __init__(self, depth=2, th=0.85, decay=0.99):
         self.decay = decay

main_fixmatch.py

Lines changed: 53 additions & 9 deletions

@@ -1,10 +1,34 @@
+from collections import defaultdict
 
 import torch
+import torch.distributed as dist
 
 from ignite.engine import Events
 
 import utils
 from base_train import main, BaseTrainer, get_default_config
+from ctaugment import OPS
+
+
+sorted_op_names = sorted(list(OPS.keys()))
+
+
+def pack_as_tensor(k, bins, error, size=5, pad_value=-555.0):
+    out = torch.empty(size).fill_(pad_value).to(error)
+    out[0] = sorted_op_names.index(k)
+    le = len(bins)
+    out[1] = le
+    out[2:2 + le] = torch.tensor(bins).to(error)
+    out[2 + le] = error
+    return out
+
+
+def unpack_from_tensor(t):
+    k_index = int(t[0].item())
+    le = int(t[1].item())
+    bins = t[2:2 + le].tolist()
+    error = t[2 + le].item()
+    return sorted_op_names[k_index], bins, error
 
 
 class FixMatchTrainer(BaseTrainer):
@@ -55,14 +79,15 @@ def train_step(self, engine, batch):
             "total_loss": total_loss.item(),
             "sup_loss": sup_loss.item(),
             "unsup_loss": unsup_loss.item(),
-            "mask": unsup_loss_mask.mean().item()
+            "mask": unsup_loss_mask.mean().item()  # this should not be averaged for DDP
         }
 
     def setup(self, **kwargs):
         super(FixMatchTrainer, self).setup(**kwargs)
         self.confidence_threshold = self.config["confidence_threshold"]
         self.lambda_u = self.config["lambda_u"]
-        self.add_event_handler(Events.ITERATION_COMPLETED, self.update_cta_rates)
+        # self.add_event_handler(Events.ITERATION_COMPLETED, self.update_cta_rates)
+        self.distributed = dist.is_available() and dist.is_initialized()
 
     def update_cta_rates(self):
         x, y, policies = self.state.batch["cta_probe_batch"]
@@ -71,13 +96,32 @@ def update_cta_rates(self):
             y_pred = self.ema_model(x)
             y_probas = torch.softmax(y_pred, dim=1)  # (N, C)
 
-            # for y_proba, t, policy_str in zip(y_probas, y, policies):
-            for y_proba, t, policy in zip(y_probas, y, policies):
-                error = y_proba
-                error[t] -= 1
-                error = torch.abs(error).sum()
-                self.cta.update_rates(policy, 1.0 - 0.5 * error.item())
-
+            if not self.distributed:
+                for y_proba, t, policy in zip(y_probas, y, policies):
+                    error = y_proba
+                    error[t] -= 1
+                    error = torch.abs(error).sum()
+                    self.cta.update_rates(policy, 1.0 - 0.5 * error.item())
+            else:
+                error_per_op = []
+                for y_proba, t, policy in zip(y_probas, y, policies):
+                    error = y_proba
+                    error[t] -= 1
+                    error = torch.abs(error).sum()
+                    for k, bins in policy:
+                        error_per_op.append(pack_as_tensor(k, bins, error))
+                error_per_op = torch.stack(error_per_op)
+                # all gather
+                tensor_list = [
+                    torch.empty_like(error_per_op)
+                    for _ in range(dist.get_world_size())
+                ]
+                dist.all_gather(tensor_list, error_per_op)
+                tensor_list = torch.cat(tensor_list, dim=0)
+                # update cta rates
+                for t in tensor_list:
+                    k, bins, error = unpack_from_tensor(t)
+                    self.cta.update_rates([(k, bins), ], 1.0 - 0.5 * error)
 
 if __name__ == "__main__":
     main(FixMatchTrainer(), get_default_config())
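
The DDP branch above works around the fact that `dist.all_gather` exchanges same-shaped tensors rather than Python objects: each `(op, bins, error)` triple from the CTA probe batch is packed into a fixed-size float tensor, gathered from all ranks, and unpacked so that every process applies every update and the CTA rates stay identical across workers. A self-contained, single-process round-trip check of that encoding (the toy op list stands in for `sorted(OPS.keys())`; everything else mirrors `pack_as_tensor`/`unpack_from_tensor`):

```python
import torch

toy_op_names = sorted(["autocontrast", "blur", "rotate"])

def pack(k, bins, error, size=5, pad_value=-555.0):
    # fixed-size layout: [op_index, n_bins, bin_0, ..., bin_{n-1}, error, padding...]
    out = torch.empty(size).fill_(pad_value).to(error)
    out[0] = toy_op_names.index(k)
    out[1] = len(bins)
    out[2:2 + len(bins)] = torch.tensor(bins).to(error)
    out[2 + len(bins)] = error
    return out

def unpack(t):
    k_index, n_bins = int(t[0].item()), int(t[1].item())
    return toy_op_names[k_index], t[2:2 + n_bins].tolist(), t[2 + n_bins].item()

packed = pack("rotate", [0.25, 0.75], torch.tensor(0.4))
print(unpack(packed))  # -> ('rotate', [0.25, 0.75], ~0.4)
```

The default `size=5` leaves room for the two header values, up to two bin values, and the error, which appears to match the at-most-two-parameter policies CTAugment samples here.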

main_fully_supervised.py

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+from collections import defaultdict
+
+import torch
+import torch.distributed as dist
+
+from ignite.engine import Events
+
+import utils
+from base_train import main, BaseTrainer, get_default_config
+
+
+class FullySupervisedTrainer(BaseTrainer):
+
+    output_names = ["sup_loss", ]
+
+    def train_step(self, engine, batch):
+        self.model.train()
+        self.optimizer.zero_grad()
+
+        x, y = batch["sup_batch"]
+
+        y_pred = self.model(x)
+
+        # supervised learning:
+        sup_loss = self.sup_criterion(y_pred, y)
+
+        if self.config["with_amp_level"] is not None:
+            from apex import amp
+            with amp.scale_loss(sup_loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            sup_loss.backward()
+
+        self.optimizer.step()
+
+        return {
+            "sup_loss": sup_loss.item(),
+        }
+
+    def setup(self, **kwargs):
+        super(FullySupervisedTrainer, self).setup(**kwargs)
+        self.distributed = dist.is_available() and dist.is_initialized()
+
+if __name__ == "__main__":
+    main(FullySupervisedTrainer(), get_default_config())
