diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 06d52369808a..e932f8fef63a 100755
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -32,40 +32,41 @@ def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropo
 
 
 class DeepSpeedTransformerConfig(TransformerConfig):
-    """Initialize the DeepSpeed Transformer Config.
-
-        Arguments:
-            batch_size: The maximum batch size used for running the kernel on each GPU
-            hidden_size: The hidden size of the transformer layer
-            intermediate_size: The intermediate size of the feed-forward part of transformer layer
-            heads: The number of heads in the self-attention of the transformer layer
-            attn_dropout_ratio: The ratio of dropout for the attention's output
-            hidden_dropout_ratio: The ratio of dropout for the transformer's output
-            num_hidden_layers: The number of transformer layers
-            initializer_range: BERT model's initializer range for initializing parameter data
-            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-                to use if the model already set the current device, otherwise need to set it
-                so that the transformer kernel can work on the right device
-            seed: The random seed for the dropout layers
-            fp16: Enable half-precision computation
-            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
-            normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
-                default is False
-            gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
-                default is False
-            adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
-                its self-attention output and layer output, False keeps the initializer_range no change.
-                See the adjustment below:
-                    output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
-            attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
-                default is False
-            stochastic_mode: Enable for high performance, please note that this flag has some level of
-                non-determinism and can produce different results on different runs. However, we have seen
-                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
-                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
-                to turn it off in order to be able to reproduce the same result through the regular kernel execution.
-            return_tuple: Enable if using the return_tuple interface style for sending out the forward results.
-            training: Enable for training rather than inference.
+    """
+    Initialize the DeepSpeed Transformer Config.
+
+    Arguments:
+        batch_size: The maximum batch size used for running the kernel on each GPU
+        hidden_size: The hidden size of the transformer layer
+        intermediate_size: The intermediate size of the feed-forward part of transformer layer
+        heads: The number of heads in the self-attention of the transformer layer
+        attn_dropout_ratio: The ratio of dropout for the attention's output
+        hidden_dropout_ratio: The ratio of dropout for the transformer's output
+        num_hidden_layers: The number of transformer layers
+        initializer_range: BERT model's initializer range for initializing parameter data
+        local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
+            to use if the model already set the current device, otherwise need to set it
+            so that the transformer kernel can work on the right device
+        seed: The random seed for the dropout layers
+        fp16: Enable half-precision computation
+        pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
+        normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
+            default is False
+        gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
+            default is False
+        adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
+            its self-attention output and layer output, False keeps the initializer_range no change.
+            See the adjustment below:
+                output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
+        attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
+            default is False
+        stochastic_mode: Enable for high performance, please note that this flag has some level of
+            non-determinism and can produce different results on different runs. However, we have seen
+            that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
+            a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
+            to turn it off in order to be able to reproduce the same result through the regular kernel execution.
+        return_tuple: Enable if using the return_tuple interface style for sending out the forward results.
+        training: Enable for training rather than inference.
     """
 
     def __init__(self,
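For context, here is a minimal sketch of how the config documented above might be constructed in user code. The keyword names simply mirror the argument names listed in the docstring, and the concrete values (a BERT-base sized layer on a single GPU) are illustrative assumptions, not part of this change:

```python
from deepspeed.ops.transformer import DeepSpeedTransformerConfig

# Illustrative values for a BERT-base sized layer; keyword names mirror the
# arguments documented in the DeepSpeedTransformerConfig docstring above.
config = DeepSpeedTransformerConfig(batch_size=8,
                                    hidden_size=768,
                                    intermediate_size=3072,
                                    heads=12,
                                    attn_dropout_ratio=0.1,
                                    hidden_dropout_ratio=0.1,
                                    num_hidden_layers=12,
                                    initializer_range=0.02,
                                    local_rank=0,
                                    seed=1234,
                                    fp16=True,
                                    pre_layer_norm=True,
                                    normalize_invertible=False,
                                    gelu_checkpoint=False,
                                    attn_dropout_checkpoint=False,
                                    stochastic_mode=False,
                                    training=True)
```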