diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py
index 06d52369808a..e932f8fef63a 100755
--- a/deepspeed/ops/transformer/transformer.py
+++ b/deepspeed/ops/transformer/transformer.py
@@ -32,40 +32,41 @@ def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropo
 
 
 class DeepSpeedTransformerConfig(TransformerConfig):
-    """Initialize the DeepSpeed Transformer Config.
-
-        Arguments:
-            batch_size: The maximum batch size used for running the kernel on each GPU
-            hidden_size: The hidden size of the transformer layer
-            intermediate_size: The intermediate size of the feed-forward part of transformer layer
-            heads: The number of heads in the self-attention of the transformer layer
-            attn_dropout_ratio: The ratio of dropout for the attention's output
-            hidden_dropout_ratio: The ratio of dropout for the transformer's output
-            num_hidden_layers: The number of transformer layers
-            initializer_range: BERT model's initializer range for initializing parameter data
-            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
-                to use if the model already set the current device, otherwise need to set it
-                so that the transformer kernel can work on the right device
-            seed: The random seed for the dropout layers
-            fp16: Enable half-precision computation
-            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
-            normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
-                default is False
-            gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
-                default is False
-            adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
-                its self-attention output and layer output, False keeps the initializer_range no change.
-                See the adjustment below:
-                    output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
-            attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
-                default is False
-            stochastic_mode: Enable for high performance, please note that this flag has some level of
-                non-determinism and can produce different results on different runs. However, we have seen
-                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
-                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
-                to turn it off in order to be able to reproduce the same result through the regular kernel execution.
-            return_tuple: Enable if using the return_tuple interface style for sending out the forward results.
-            training: Enable for training rather than inference.
+    """
+    Initialize the DeepSpeed Transformer Config.
+
+    Arguments:
+        batch_size: The maximum batch size used for running the kernel on each GPU
+        hidden_size: The hidden size of the transformer layer
+        intermediate_size: The intermediate size of the feed-forward part of transformer layer
+        heads: The number of heads in the self-attention of the transformer layer
+        attn_dropout_ratio: The ratio of dropout for the attention's output
+        hidden_dropout_ratio: The ratio of dropout for the transformer's output
+        num_hidden_layers: The number of transformer layers
+        initializer_range: BERT model's initializer range for initializing parameter data
+        local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
+            to use if the model already set the current device, otherwise need to set it
+            so that the transformer kernel can work on the right device
+        seed: The random seed for the dropout layers
+        fp16: Enable half-precision computation
+        pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
+        normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
+            default is False
+        gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
+            default is False
+        adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
+            its self-attention output and layer output, False keeps the initializer_range no change.
+            See the adjustment below:
+                output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
+        attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
+            default is False
+        stochastic_mode: Enable for high performance, please note that this flag has some level of
+            non-determinism and can produce different results on different runs. However, we have seen
+            that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
+            a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
+            to turn it off in order to be able to reproduce the same result through the regular kernel execution.
+        return_tuple: Enable if using the return_tuple interface style for sending out the forward results.
+        training: Enable for training rather than inference.
     """
 
     def __init__(self,
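For context, here is a minimal sketch of how the config documented above might be constructed in user code. The keyword names simply mirror the argument names listed in the docstring, and the concrete values (a BERT-base sized layer on a single GPU) are illustrative assumptions, not part of this change:

```python
from deepspeed.ops.transformer import DeepSpeedTransformerConfig

# Illustrative values for a BERT-base sized layer; keyword names mirror the
# arguments documented in the DeepSpeedTransformerConfig docstring above.
config = DeepSpeedTransformerConfig(batch_size=8,
                                    hidden_size=768,
                                    intermediate_size=3072,
                                    heads=12,
                                    attn_dropout_ratio=0.1,
                                    hidden_dropout_ratio=0.1,
                                    num_hidden_layers=12,
                                    initializer_range=0.02,
                                    local_rank=0,
                                    seed=1234,
                                    fp16=True,
                                    pre_layer_norm=True,
                                    normalize_invertible=False,
                                    gelu_checkpoint=False,
                                    attn_dropout_checkpoint=False,
                                    stochastic_mode=False,
                                    training=True)
```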