
Commit 1aafc31

[Cherry-pick] to Release/2.3, Improve MSRAInitializer (#43721)
* fix conflict
* improve the doc
1 parent 4dcfc6d commit 1aafc31

File tree

2 files changed: +69 -44 lines changed

python/paddle/fluid/initializer.py

Lines changed: 45 additions & 32 deletions
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):
 
     .. math::
 
-        x = \sqrt{\\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
 
     In case of Normal distribution, the mean is 0 and the standard deviation
     is
 
     .. math::
 
-        \sqrt{\\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{fan\_in}}
 
     Args:
         uniform (bool): whether to use uniform or normal distribution
-        fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\
-            inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+            If None, it will be inferred automatically. If you do not want to use\
+            the in_features of the Tensor, you can set 'fan_in' yourself. default is None.
         seed (int32): random seed
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): the non-linear function. default is relu.
 
     Note:
         It is recommended to set fan_in to None for most cases.
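With the default nonlinearity='relu' (or leaky_relu with negative_slope=0), calculate_gain returns sqrt(2), so gain * sqrt(3/fan_in) equals the old sqrt(6/fan_in) and gain / sqrt(fan_in) equals the old sqrt(2/fan_in); the defaults therefore preserve the previous behavior. A minimal check in plain Python (fan_in=64 is an arbitrary example value):

# Sketch: the new gain-based bounds reduce to the old formulas
# when gain = sqrt(2), the Kaiming gain for relu.
import math

fan_in = 64                   # arbitrary example value
gain = math.sqrt(2.0)         # calculate_gain('relu')

old_limit = math.sqrt(6.0 / fan_in)         # old uniform bound
new_limit = gain * math.sqrt(3.0 / fan_in)  # new uniform bound
old_std = math.sqrt(2.0 / fan_in)           # old normal std
new_std = gain / math.sqrt(fan_in)          # new normal std

assert math.isclose(old_limit, new_limit)
assert math.isclose(old_std, new_std)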
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):
 
     """
 
-    def __init__(self, uniform=True, fan_in=None, seed=0):
+    def __init__(self,
+                 uniform=True,
+                 fan_in=None,
+                 seed=0,
+                 negative_slope=0,
+                 nonlinearity='relu'):
         """Constructor for MSRAInitializer
         """
         assert uniform is not None
assert uniform is not None
@@ -718,6 +726,8 @@ def __init__(self, uniform=True, fan_in=None, seed=0):
         self._uniform = uniform
         self._fan_in = fan_in
         self._seed = seed
+        self._negative_slope = negative_slope
+        self._nonlinearity = nonlinearity
 
     def __call__(self, var, block=None):
         """Initialize the input tensor with MSRA initialization.
@@ -759,13 +769,16 @@ def __call__(self, var, block=None):
 
         if framework._non_static_mode():
             if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+
                 out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
                                                 -limit, 'max', limit, 'seed',
                                                 self._seed, 'dtype',
                                                 int(out_dtype))
             else:
-                std = math.sqrt(2.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
                 if in_dygraph_mode():
                     place = _current_expected_place()
                     out_var = _C_ops.final_state_gaussian_random(
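calculate_gain follows the usual Kaiming recipes: sqrt(2) for relu and sqrt(2 / (1 + a^2)) for leaky_relu with slope a. A quick check through the public paddle.nn.initializer.calculate_gain helper, assuming it matches the module-level calculate_gain used here:

# Sketch: expected gain values for the two nonlinearities this commit wires in.
import math
import paddle

assert math.isclose(paddle.nn.initializer.calculate_gain('relu'),
                    math.sqrt(2.0))
# leaky_relu gain is sqrt(2 / (1 + negative_slope**2)); a slope of 0 gives
# sqrt(2), so the default arguments reproduce the old relu behavior.
assert math.isclose(paddle.nn.initializer.calculate_gain('leaky_relu', 0.0),
                    math.sqrt(2.0))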
@@ -786,33 +799,33 @@ def __call__(self, var, block=None):
             return None
         else:
             if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
-                op = block.append_op(
-                    type="uniform_random",
-                    inputs={},
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "min": -limit,
-                        "max": limit,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+                op = block.append_op(type="uniform_random",
+                                     inputs={},
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "min": -limit,
+                                         "max": limit,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
             else:
-                std = np.sqrt(2.0 / float(fan_in))
-                op = block.append_op(
-                    type="gaussian_random",
-                    outputs={"Out": out_var},
-                    attrs={
-                        "shape": out_var.shape,
-                        "dtype": int(out_dtype),
-                        "mean": 0.0,
-                        "std": std,
-                        "seed": self._seed
-                    },
-                    stop_gradient=True)
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
+                op = block.append_op(type="gaussian_random",
+                                     outputs={"Out": out_var},
+                                     attrs={
+                                         "shape": out_var.shape,
+                                         "dtype": int(out_dtype),
+                                         "mean": 0.0,
+                                         "std": std,
+                                         "seed": self._seed
+                                     },
+                                     stop_gradient=True)
 
         if var.dtype == VarDesc.VarType.FP16 or (
                 var.dtype == VarDesc.VarType.BF16 and not self._uniform):
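The static-graph branch computes the same limit and std as the dynamic branch and only differs in how the op is appended. A pure-NumPy reference for the sampling it configures (a sketch of the math, not the operator implementation; shapes are arbitrary):

# Sketch: NumPy reference for the two sampling branches above.
import numpy as np

def msra_sample(shape, fan_in, gain, uniform=True, seed=0):
    rng = np.random.default_rng(seed)
    if uniform:
        limit = gain * np.sqrt(3.0 / fan_in)  # mirrors the uniform_random attrs
        return rng.uniform(-limit, limit, size=shape)
    std = gain / np.sqrt(fan_in)              # mirrors the gaussian_random attrs
    return rng.normal(0.0, std, size=shape)

w = msra_sample((64, 32), fan_in=64, gain=np.sqrt(2.0))
print(w.std())  # both branches give std ~ gain / sqrt(fan_in) ~ 0.177 here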

python/paddle/nn/initializer/kaiming.py

Lines changed: 24 additions & 12 deletions
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):
 
     .. math::
 
-        \sqrt{\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{fan\_in}}
 
     Args:
-        fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
-            inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+            If None, it will be inferred automatically. If you do not want to use\
+            the in_features of the Tensor, you can set 'fan_in' yourself. default is None.
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): the non-linear function. default is relu.
 
     Note:
         It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None):
-        super(KaimingNormal, self).__init__(
-            uniform=False, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingNormal, self).__init__(uniform=False,
+                                            fan_in=fan_in,
+                                            seed=0,
+                                            negative_slope=negative_slope,
+                                            nonlinearity=nonlinearity)
 
 
 class KaimingUniform(MSRAInitializer):
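With the new arguments, a layer feeding a leaky_relu activation can be initialized with the matching gain. A usage sketch (layer sizes and slope value are arbitrary):

# Sketch: KaimingNormal with the new leaky_relu arguments.
import paddle
import paddle.nn as nn

w_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingNormal(
    negative_slope=0.1, nonlinearity='leaky_relu'))
linear = nn.Linear(128, 64, weight_attr=w_attr)
act = nn.LeakyReLU(negative_slope=0.1)  # slope matches the initializer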
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):
 
     .. math::
 
-        x = \sqrt{\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}
 
     Args:
-        fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
-            inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+            If None, it will be inferred automatically. If you do not want to use\
+            the in_features of the Tensor, you can set 'fan_in' yourself. default is None.
+        negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): the non-linear function. default is relu.
 
     Note:
         It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None):
-        super(KaimingUniform, self).__init__(
-            uniform=True, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingUniform, self).__init__(uniform=True,
+                                             fan_in=fan_in,
+                                             seed=0,
+                                             negative_slope=negative_slope,
+                                             nonlinearity=nonlinearity)
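KaimingUniform takes the same two new arguments; a usage sketch for a convolution layer (channel counts and slope value are arbitrary):

# Sketch: KaimingUniform for a conv layer behind leaky_relu.
import paddle
import paddle.nn as nn

w_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(
    negative_slope=0.1, nonlinearity='leaky_relu'))
conv = nn.Conv2D(3, 16, kernel_size=3, weight_attr=w_attr)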
