
Commit 83d73ae

fixed 1D and 2D convergence (#38)
* optimized 2D operations
* fixed 1D ViT convergence problem
1 parent d0cb7bb commit 83d73ae

13 files changed: 285 additions, 411 deletions


colossalai/context/parallel_context.py

Lines changed: 9 additions & 6 deletions
@@ -415,7 +415,7 @@ def set_seed(self):
         if hasattr(self.config, 'seed'):
             seed = getattr(self.config, 'seed')
         else:
-            seed = 2  # default seed
+            seed = 1024  # default seed

         random.seed(seed)
         np.random.seed(seed)
@@ -426,15 +426,18 @@ def set_seed(self):
         if torch.cuda.is_available():
             # create random seed for different parallel modes
             # data parallel seed are kept the same
-            tp_rank = self._local_ranks.get(ParallelMode.TENSOR, 0)
-            pp_rank = self._local_ranks.get(ParallelMode.PIPELINE, 0)
-            parallel_seed = seed + tp_rank + pp_rank * 1024
+            parallel_seed = seed
             add_seed(ParallelMode.DATA, parallel_seed)

+            # model parallel seeds are different across ranks
+            pipeline_offset = self._local_ranks.get(ParallelMode.PIPELINE, 0)
+
             # add seed for data parallel and tensor parallel only
             if self.is_initialized(ParallelMode.TENSOR):
-                dp_rank = self._local_ranks.get(ParallelMode.DATA, 0) + 1
-                tp_seed = parallel_seed + dp_rank * 128
+                tp_rank = self.get_local_rank(ParallelMode.TENSOR)
+                # 100 is only to increase the diff in seeds between pipeline stages
+                tp_rank_with_offset = tp_rank + pipeline_offset * 1024
+                tp_seed = seed + tp_rank_with_offset
                 add_seed(ParallelMode.TENSOR, tp_seed)

             set_mode(ParallelMode.DATA)
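
Previously the data-parallel seed itself depended on the tensor-parallel and pipeline ranks; after this change every data-parallel rank keeps the base seed, while each tensor-parallel rank on each pipeline stage gets its own offset. A minimal sketch of the resulting layout (plain Python, not part of the commit; the tensor_parallel_seed helper is made up here):

import itertools

def tensor_parallel_seed(seed, tp_rank, pp_rank):
    # mirrors the new logic: tp_seed = seed + tp_rank + pipeline_offset * 1024
    return seed + tp_rank + pp_rank * 1024

base_seed = 1024  # new default seed
for pp_rank, tp_rank in itertools.product(range(2), range(4)):
    print(f"pp={pp_rank} tp={tp_rank} -> data seed {base_seed}, "
          f"tensor seed {tensor_parallel_seed(base_seed, tp_rank, pp_rank)}")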

colossalai/nn/layer/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+from .fused_bias_gelu import bias_gelu_impl
 from .parallel_1d import *
 from .parallel_2d import *
 from .parallel_2p5d import *
colossalai/nn/layer/fused_bias_gelu.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# adapted from Megatron-LM
# https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/megatron/model/fused_bias_gelu.py

import torch


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, bias):
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        tmp = bias_gelu_back(grad_output, bias, input)
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply
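
As a rough usage sketch (not part of the commit), bias_gelu_impl is meant to be called on a matmul output with the bias kept separate, so the bias-add and the tanh-approximated GeLU run as a single JIT-scripted op; it should agree numerically with applying the bias and GeLU in two steps. The import path below assumes the new file lands at colossalai/nn/layer/fused_bias_gelu.py:

import torch
import torch.nn.functional as F
from colossalai.nn.layer.fused_bias_gelu import bias_gelu_impl  # assumed module path

x = torch.randn(8, 4096)   # e.g. output of a linear layer computed without its bias
bias = torch.randn(4096)   # the bias applied inside the fused op

fused = bias_gelu_impl(x, bias)
# reference: separate bias add followed by the same tanh approximation of GeLU
# (F.gelu(..., approximate='tanh') requires a reasonably recent PyTorch)
reference = F.gelu(x + bias, approximate='tanh')
print(torch.allclose(fused, reference, atol=1e-5))  # expected: True
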
colossalai/nn/layer/parallel_1d/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 from .layers import Linear1D_Col, Linear1D_Row
 from .layers import MixedFusedLayerNorm1D as LayerNorm1D
 from ._transformer import TransformerMLP1D, TransformerSelfAttention1D, TransformerLayer1D
-from ._vit import ViTMLP1D, ViTSelfAttention1D, ViTHead1D, ViTPatchEmbedding1D, ViTTokenFuser1D, ViTHeadNormal, ViTSelfAttention1DV2
+from ._vit import ViTMLP1D, ViTSelfAttention1D, ViTHead1D, ViTPatchEmbedding1D, ViTTokenFuser1D, ViTHeadNormal



 __all__ = [
     'Linear1D_Col', 'Linear1D_Row', 'ViTMLP1D', 'ViTSelfAttention1D', 'ViTHead1D', 'ViTPatchEmbedding1D', 'ViTTokenFuser1D',
-    'TransformerMLP1D', 'TransformerSelfAttention1D', 'TransformerLayer1D', 'LayerNorm1D', 'ViTHeadNormal', 'ViTSelfAttention1DV2'
+    'TransformerMLP1D', 'TransformerSelfAttention1D', 'TransformerLayer1D', 'LayerNorm1D', 'ViTHeadNormal'
 ]
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import torch

try:
    import fused_mix_prec_layer_norm_cuda
except:
    fused_mix_prec_layer_norm_cuda = None


class FusedLayerNormAffineFunction1D(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps):
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
            input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias \
            = fused_mix_prec_layer_norm_cuda.backward_affine(
                grad_output.contiguous(), mean, invvar,
                input_, ctx.normalized_shape,
                weight_, bias_, ctx.eps)

        return grad_input, grad_weight, grad_bias, None, None
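
For context, a module along the following lines could expose this autograd function as a drop-in layer norm. This is only a sketch under the assumption that fused_mix_prec_layer_norm_cuda is an optional CUDA extension and that the class sits in the same module as the code above; it is not the MixedFusedLayerNorm1D implementation referenced in parallel_1d/__init__.py. It falls back to the stock torch.nn.functional.layer_norm when the extension is unavailable:

import torch
import torch.nn.functional as F
from torch.nn import Parameter

# hypothetical wrapper, assumed to live next to FusedLayerNormAffineFunction1D
class SimpleFusedLayerNorm1D(torch.nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.weight = Parameter(torch.ones(self.normalized_shape))
        self.bias = Parameter(torch.zeros(self.normalized_shape))

    def forward(self, x):
        # use the fused CUDA kernel when available, otherwise the stock layer norm
        if fused_mix_prec_layer_norm_cuda is not None and x.is_cuda:
            return FusedLayerNormAffineFunction1D.apply(
                x, self.weight, self.bias, self.normalized_shape, self.eps)
        return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)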
