Commit 169551e

sihyeong671, sadra-barikbin, and vfdev-5 authored
feat: Added warmup each cycle feature in CyclicalScheduler (#3064)
* feat: add per-cycle warm-up feature in cosine annealing
  - add _get_cycle_param method in CyclicalScheduler
  - add warmup_each_cycle, warmup_duration variables in CyclicalScheduler
  - add warmup phase in CosineAnnealingScheduler (issue #3036)
* fix: f-string
  - remove the f prefix from strings that do not interpolate variables
* refactor: add _get_cycle_param, get_param in CyclicalScheduler
  - rename get_param in LinearCyclicalScheduler, CosineAnnealingScheduler to _get_cycle_param
* fix: add total_cycle_size to _state_attrs
* fix: add docstring, change function to abstractmethod
  - fix typo
  - add docstring requested in PR review
  - change _get_cycle_param to abstractmethod
  - raise ValueError when warmup_each_cycle=True and warmup_duration is None
* feat: add test functions
  - add test_cyclical_scheduler_asserts
  - add test_cosine_annealing_scheduler_warmup
* fix: keep previous version tag, fix test wording
* fix: correct the TypeError match string
* feat: refactor _get_cycle_param to _get_param
  - change _get_cycle_param to _get_param
  - add _get_param in ParamScheduler
  - remove warmup_each_cycle variable
* docs: restore docstring
* feat: remove & fix
  - remove test_cyclical_scheduler
  - remove warmup_each_cycle variable
* feat: remove first-cycle warmup
* feat: fix lrs values
* A few improvements
* Update param_scheduler.py
* Update test_param_scheduler.py

---------

Co-authored-by: Sadra Barikbin <sadraqazvin1@yahoo.com>
Co-authored-by: vfdev <vfdev.5@gmail.com>
1 parent ea7cb1d commit 169551e

2 files changed: +79 additions, -106 deletions


ignite/handlers/param_scheduler.py

Lines changed: 46 additions & 4 deletions
@@ -193,7 +193,7 @@ def __init__(
         self._state_attrs += ["param_group_index"]
 
     def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None:
-        value = self.get_param()
+        value = self._get_param()
 
         if isinstance(value, list):
             if len(value) != len(self.optimizer_param_groups):
@@ -261,6 +261,11 @@ def simulate_values(cls, num_events: int, **scheduler_kwargs: Any) -> List[List[
             values.append([i, scheduler.optimizer_param_groups[0][scheduler.param_name]])
         return values
 
+    def _get_param(self) -> Union[List[float], float]:
+        # `ParamScheduler` does nothing special, only returning what child class returns.
+        # Intermediate child classes edit this method
+        return self.get_param()
+
 
 class CyclicalScheduler(ParamScheduler):
     """An abstract class for updating an optimizer's parameter value over a
@@ -279,6 +284,9 @@ class CyclicalScheduler(ParamScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -288,6 +296,9 @@ class CyclicalScheduler(ParamScheduler):
         usually be the number of batches in an epoch.
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def __init__(
@@ -300,6 +311,7 @@ def __init__(
         cycle_mult: float = 1.0,
         start_value_mult: float = 1.0,
         end_value_mult: float = 1.0,
+        warmup_duration: int = 0,
         save_history: bool = False,
         param_group_index: Optional[int] = None,
     ):
@@ -308,11 +320,13 @@ def __init__(
         )
         self.start_value = start_value
         self.end_value = end_value
-        self.cycle_size = int(cycle_size)  # Ensure cycle_size is integer
+        self.cycle_size = cycle_size
         self.cycle_mult = cycle_mult
         self.cycle = 0
         self.start_value_mult = start_value_mult
         self.end_value_mult = end_value_mult
+        self.warmup_duration = warmup_duration
+        self.total_cycle_size = self.warmup_duration + self.cycle_size
 
         if self.cycle_size < 2:
             raise ValueError(f"Argument cycle_size should be positive and larger than 1, but given {cycle_size}")
@@ -325,18 +339,33 @@ def __init__(
             "cycle",
             "start_value_mult",
             "end_value_mult",
+            "warmup_duration",
+            "total_cycle_size",
         ]
 
     def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None:
-        if self.event_index != 0 and self.event_index % self.cycle_size == 0:
+        if self.event_index != 0 and self.event_index == self.cycle_size:
+            self.start_value *= self.start_value_mult
+        if self.event_index != 0 and self.event_index == self.total_cycle_size:
             self.event_index = 0
             self.cycle_size = int(self.cycle_size * self.cycle_mult)
+            self.warmup_duration = int(self.warmup_duration * self.cycle_mult)
+            self.total_cycle_size = self.warmup_duration + self.cycle_size
             self.cycle += 1
-            self.start_value *= self.start_value_mult
             self.end_value *= self.end_value_mult
 
         return super(CyclicalScheduler, self).__call__(engine, name)
 
+    def _get_param(self) -> Union[List[float], float]:
+        """Applies warm-up if the scheduler is in the warm-up phase,
+        otherwise returns what is returned by `self.get_param()`
+        """
+        if self.event_index > self.cycle_size:
+            warmup_progress = (self.event_index - self.cycle_size) / self.warmup_duration
+            return self.end_value + (self.start_value - self.end_value) * warmup_progress
+
+        return self.get_param()
+
 
 class LinearCyclicalScheduler(CyclicalScheduler):
     """Linearly adjusts param value to 'end_value' for a half-cycle, then linearly
@@ -355,6 +384,9 @@ class LinearCyclicalScheduler(CyclicalScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -430,9 +462,13 @@ def print_lr():
             ...
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def get_param(self) -> float:
+        """Method to get current optimizer's parameter value"""
         cycle_progress = self.event_index / self.cycle_size
         return self.end_value + (self.start_value - self.end_value) * abs(cycle_progress - 0.5) * 2
 
@@ -456,6 +492,9 @@ class CosineAnnealingScheduler(CyclicalScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -534,6 +573,9 @@ def print_lr():
         Applications of Computer Vision (WACV), 2017 IEEE Winter Conference on. IEEE, 2017
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def get_param(self) -> float:
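
For orientation, here is a minimal usage sketch of the new ``warmup_duration`` argument. It is not part of this commit: the handler wiring mirrors the updated test below, and the concrete values (cycle_size=10, warmup_duration=2) are only illustrative.

    import torch
    from ignite.engine import Engine, Events
    from ignite.handlers.param_scheduler import CosineAnnealingScheduler

    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0.0)

    # One cosine cycle over 10 iterations, then a 2-iteration linear warm-up that
    # ramps the parameter from the cycle's end value back to the next cycle's start value.
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10, warmup_duration=2
    )

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # The schedule can also be previewed without an engine, as the updated test does:
    values = CosineAnnealingScheduler.simulate_values(
        num_events=24, param_name="lr", start_value=0.0, end_value=1.0, cycle_size=10, warmup_duration=2
    )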

tests/ignite/handlers/test_param_scheduler.py

Lines changed: 33 additions & 102 deletions
@@ -55,7 +55,7 @@ def test_param_scheduler_asserts():
         FakeParamScheduler({}, "lr")
 
 
-def test_linear_scheduler():
+def test_linear_scheduler_asserts():
     with pytest.raises(TypeError, match=r"Argument optimizer should be torch.optim.Optimizer"):
         LinearCyclicalScheduler({}, "lr", 1, 0, cycle_size=0)
 
@@ -68,6 +68,11 @@ def test_linear_scheduler():
     with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"):
         LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=1)
 
+
+def test_linear_scheduler():
+    tensor = torch.zeros([1], requires_grad=True)
+    optimizer = torch.optim.SGD([tensor], lr=0.0)
+
     scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10)
     state_dict = scheduler.state_dict()
 
@@ -77,38 +82,12 @@ def save_lr(engine):
     trainer = Engine(lambda engine, batch: None)
     trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
     trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
-
+    lr_values_in_cycle = [1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
     for _ in range(2):
         lrs = []
-        trainer.run([0] * 9, max_epochs=2)
+        trainer.run([0] * 10, max_epochs=2)
 
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    # Cycle 1
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.0,
-                    0.2,
-                    0.4,
-                    0.6,
-                    0.8,
-                    # Cycle 2
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.0,
-                    0.2,
-                    0.4,  # 0.6, 0.8,
-                ],
-            )
-        )
+        assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_cycle])
         scheduler.load_state_dict(state_dict)
 
     optimizer = torch.optim.SGD([tensor], lr=0)
@@ -164,49 +143,6 @@ def save_lr(engine):
         )
         scheduler.load_state_dict(state_dict)
 
-    # With float cycle_size
-    optimizer = torch.optim.SGD([tensor], lr=0)
-    scheduler = LinearCyclicalScheduler(
-        optimizer, "lr", start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0
-    )
-    state_dict = scheduler.state_dict()
-
-    trainer = Engine(lambda engine, batch: None)
-    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
-    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
-
-    for _ in range(2):
-        lrs = []
-        trainer.run([0] * 9, max_epochs=2)
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    # Cycle 1
-                    1.2,
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.4,
-                    0.6,
-                    0.8,
-                    1.0,
-                    # Cycle 2
-                    1.2,
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.4,
-                    0.6,  # 0.8, 1.0,
-                ],
-            )
-        )
-        scheduler.load_state_dict(state_dict)
 
 def test_linear_scheduler_cycle_size_two():
     tensor = torch.zeros([1], requires_grad=True)
@@ -239,17 +175,23 @@ def save_lr(engine):
     assert lrs == pytest.approx([v for i, v in simulated_values])
 
 
-def test_cosine_annealing_scheduler():
+@pytest.mark.parametrize("cyclic_warmup", [False, True])
+def test_cosine_annealing_scheduler(cyclic_warmup):
     tensor = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([tensor], lr=0)
 
-    scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10)
+    scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10, warmup_duration=2 if cyclic_warmup else 0)
     state_dict = scheduler.state_dict()
 
-    data = [0] * 9
+    data = [0] * (10 + int(cyclic_warmup))
     max_epochs = 2
     simulated_values = CosineAnnealingScheduler.simulate_values(
-        num_events=len(data) * max_epochs, param_name="lr", start_value=0, end_value=1, cycle_size=10
+        num_events=len(data) * max_epochs,
+        param_name="lr",
+        start_value=0,
+        end_value=1,
+        cycle_size=10,
+        warmup_duration=2 if cyclic_warmup else 0,
    )
 
     def save_lr(engine):
@@ -258,36 +200,25 @@ def save_lr(engine):
     trainer = Engine(lambda engine, batch: None)
     trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
     trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
+    lr_values_in_cycle = [
+        0.0,
+        0.02447174185242318,
+        0.09549150281252627,
+        0.20610737385376332,
+        0.3454915028125263,
+        0.5,
+        0.6545084971874737,
+        0.7938926261462365,
+        0.9045084971874737,
+        0.9755282581475768,
+    ]
+    lr_values_in_warmup = np.linspace(1.0, 0.0, 2 + 1)[:-1].tolist() if cyclic_warmup else []
 
     for _ in range(2):
         lrs = []
         trainer.run(data, max_epochs=max_epochs)
 
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    0.0,
-                    0.02447174185242318,
-                    0.09549150281252627,
-                    0.20610737385376332,
-                    0.3454915028125263,
-                    0.5,
-                    0.6545084971874737,
-                    0.7938926261462365,
-                    0.9045084971874737,
-                    0.9755282581475768,
-                    0.0,
-                    0.02447174185242318,
-                    0.09549150281252627,
-                    0.20610737385376332,
-                    0.3454915028125263,
-                    0.5,
-                    0.6545084971874737,
-                    0.7938926261462365,  # 0.9045084971874737, 0.9755282581475768
-                ],
-            )
-        )
+        assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_warmup, *lr_values_in_cycle])
         scheduler.load_state_dict(state_dict)
 
     assert lrs == pytest.approx([v for i, v in simulated_values])
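
As a side check (not part of the commit), the two warm-up values the parametrized cosine test expects follow directly from the warm-up formula added in ``_get_param``; with ``start_value=0``, ``end_value=1``, ``cycle_size=10`` and ``warmup_duration=2``:

    import numpy as np

    # event_index == 10 is not yet in the warm-up branch (10 > 10 is False), so the cosine
    # get_param() is evaluated at full cycle progress and returns the end value, 1.0.
    # event_index == 11 gives warmup_progress = (11 - 10) / 2 = 0.5 -> 1.0 + (0.0 - 1.0) * 0.5 = 0.5.
    # At event_index == 12 the counters reset, so the next value (0.0) already belongs to the next cycle.
    # The test encodes exactly these two warm-up points as:
    warmup_values = np.linspace(1.0, 0.0, 2 + 1)[:-1].tolist()
    print(warmup_values)  # [1.0, 0.5]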
