11#!/usr/bin/env python
22# -*- encoding: utf-8 -*-
33
4- from typing import Optional
4+ from torch .nn import Module
5+ from torch .nn .modules .loss import _Loss
6+ from torch .optim import Optimizer
57
68from colossalai .builder import build_gradient_handler
79from colossalai .context import ParallelMode
810from colossalai .core import global_context as gpc
911from colossalai .logging import get_global_dist_logger
1012from colossalai .nn import (ZeroRedundancyOptimizer_Level_2 ,
1113 ZeroRedundancyOptimizer_Level_3 )
12- from torch .nn import Module
13- from torch .nn .modules .loss import _Loss
14- from torch .optim import Optimizer
15-
16- from .schedule import BaseSchedule , NoPipelineSchedule
14+ from .schedule import BaseSchedule
1715
1816
1917class Engine :
@@ -36,49 +34,80 @@ class Engine:
3634 def __init__ (self ,
3735 model : Module ,
3836 optimizer : Optimizer ,
39- step_schedule : BaseSchedule = None ,
37+ criterion : _Loss ,
38+ step_schedule : BaseSchedule ,
39+ gradient_handlers : list = None ,
4040 gradient_accumulation : int = 1 ,
41- gradient_clipping : float = 0.0 ):
42- self .schedule = step_schedule if step_schedule is not None \
43- else NoPipelineSchedule ()
44- self .schedule .initialize (model , optimizer )
45- self .grad_accum_size = gradient_accumulation
46- self .grad_accum_cur_step = 0
47- self .grad_clip = gradient_clipping
41+ gradient_clipping : float = 0.0 ,
42+ ):
43+ self ._model = model
44+ self ._optimizer = optimizer
45+ self ._criterion = criterion
46+ self ._schedule = step_schedule
47+
48+ # schedule initialize
49+ self ._schedule .initialize (model , optimizer )
50+
51+ # state
4852 self .training = True # default
53+
54+ # gradient accumulation
55+ assert gradient_accumulation > 0 , 'gradient accumulation size must be larger than 0'
56+ self ._grad_accum_size = gradient_accumulation
57+ self ._grad_clip = gradient_clipping
4958 self ._logger = get_global_dist_logger ()
5059
5160 # build gradient handler
5261 self ._gradient_handlers = []
53- gradient_handler_cfg = []
5462
55- if hasattr ( gpc . config , 'gradient_handler' ) :
56- assert isinstance (gpc . config . gradient_handler , list ), \
63+ if gradient_handlers is not None :
64+ assert isinstance (gradient_handlers , list ), \
5765 f'argument gradient_handler_cfg expected type list, ' \
58- f'but got type { type (gpc .config .gradient_handler )} '
59- gradient_handler_cfg = gpc .config .gradient_handler
66+ f'but got type { type (gradient_handlers )} '
6067 elif isinstance (optimizer , (ZeroRedundancyOptimizer_Level_2 ,
6168 ZeroRedundancyOptimizer_Level_3 )):
62- gradient_handler_cfg = [dict (type = 'ZeROGradientHandler' )]
69+ gradient_handlers = [dict (type = 'ZeROGradientHandler' )]
6370 self ._logger .info (
6471 "Training with zero is detected, ZeROGradientHandler is automatically "
6572 "added even though not specified in the configuration" ,
6673 ranks = [0 ])
6774 elif gpc .is_initialized (ParallelMode .DATA ) and gpc .get_world_size (
6875 ParallelMode .DATA ) > 1 :
69- gradient_handler_cfg = [dict (type = 'DataParallelGradientHandler' )]
76+ gradient_handlers = [dict (type = 'DataParallelGradientHandler' )]
7077 self ._logger .info (
7178 "Data parallel training is detected, DataParallelGradientHandler is automatically "
7279 "added even though not specified in the configuration" ,
7380 ranks = [0 ])
74- if len (gradient_handler_cfg ) == 0 :
81+
82+ if gradient_handlers is None :
7583 self ._logger .warning (
7684 "No gradient handler is set up, please make sure you do not need "
7785 "to all-reduce the gradients after a training step." ,
7886 ranks = [0 ])
79- for cfg in gradient_handler_cfg :
80- handler = build_gradient_handler (cfg , model , optimizer )
81- self ._gradient_handlers .append (handler )
87+ else :
88+ for cfg in gradient_handlers :
89+ handler = build_gradient_handler (cfg , model , optimizer )
90+ self ._gradient_handlers .append (handler )
91+
    @property
    def model(self):
        """Read-only access to the wrapped model."""
        return self._model

    @property
    def optimizer(self):
        """Read-only access to the optimizer."""
        return self._optimizer

    @property
    def criterion(self):
        """Read-only access to the loss function."""
        return self._criterion

    @property
    def schedule(self):
        """Read-only access to the step schedule."""
        return self._schedule

    @property
    def gradient_accumulation(self):
        """Number of micro-batches accumulated before one optimizer step."""
        return self._grad_accum_size
82111
83112 def handle_gradient (self ):
84113 """Handles all-reduce operations of gradients across different parallel groups.
@@ -90,57 +119,57 @@ def train(self):
90119 """Sets the model to training mode.
91120 """
92121 self .training = True
122+ self ._model .train ()
93123
94124 def eval (self ):
95125 """Sets the model to evaluation mode.
96126 """
97127 self .training = False
128+ self ._model .eval ()
98129
99130 def step (self ,
100131 data_iter ,
101- model : Module ,
102- criterion : _Loss ,
103- optimizer : Optimizer = None ,
104132 is_last_iteration : bool = False ,
105133 return_loss = True ):
106134 """A running step based on the schedule. Usually, it runs a training or
107135 evaluation over a batch of dataset.
108136
109137 :param data_iter: Data iterator of the dataset
110- :param model: The neural network model
111- :param criterion: Loss function used to calculate
112- :param optimizer: Optimizer for updating the parameters
113138 :param is_last_iteration: If True, this iteration is the last iteration in the epoch
114139 :param return_loss: loss will be returned if True
115140 :type data_iter: Iterator
116- :type model: Module
117- :type criterion: _Loss
118- :type optimizer: Optimizer, optional
119141 :type is_last_iteration: bool, optional
120142 :type return_loss: bool, optional
121143 :return: (output, lablel, loss)
122144 """
123- if self .training and self .grad_accum_cur_step == 0 :
124- optimizer .zero_grad ()
125-
126- output , label , loss = self .schedule .forward_backward_step (
127- data_iter , model , criterion , optimizer ,
128- forward_only = not self .training ,
129- grad_accum_size = self .grad_accum_size ,
130- return_loss = return_loss )
131-
132145 if self .training :
133- self .grad_accum_cur_step += 1
134- if self .grad_accum_cur_step == self .grad_accum_size :
135- # all reduce gradients
136- self .handle_gradient ()
137- self .schedule .optimizer_step (model , optimizer , self .grad_clip )
138- self .grad_accum_cur_step = 0
146+ self ._optimizer .zero_grad ()
139147
148+ # differentiate training and eval with grad accum
149+ if self .training :
150+ for i in range (self ._grad_accum_size ):
151+ output , label , loss = self ._schedule .forward_backward_step (
152+ data_iter , self ._model , self ._criterion , self ._optimizer ,
153+ forward_only = False ,
154+ grad_accum_size = self ._grad_accum_size ,
155+ return_loss = return_loss )
156+
157+ if i == self ._grad_accum_size - 1 :
158+ # all reduce gradients
159+ self .handle_gradient ()
160+ self ._schedule .optimizer_step (self ._model , self ._optimizer , self ._grad_clip )
161+ else :
162+ output , label , loss = self ._schedule .forward_backward_step (
163+ data_iter , self ._model , self ._criterion , self ._optimizer ,
164+ forward_only = True ,
165+ grad_accum_size = 1 ,
166+ return_loss = return_loss )
167+
168+ # consume the remaining dataset left out due to gradient accumulation
140169 if is_last_iteration :
141170 while True :
142171 try :
143- trash = next (data_iter )
172+ _ = next (data_iter )
144173 except StopIteration :
145174 break
146175
0 commit comments