update kunlunxin glm config
dynamicheart committed Nov 3, 2023
1 parent c5058d9 commit 0179355
Showing 4 changed files with 35 additions and 67 deletions.
5 changes: 3 additions & 2 deletions training/kunlunxin/docker_image/pytorch/pytorch_install.sh
@@ -5,7 +5,8 @@ set -xe
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
 pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl
 
-pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
-pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install psutil==5.9.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install accelerate==0.20.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
+pip install tabulate==0.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
 
 python -m xacc.install
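
The hunk above pins three pure-Python dependencies from the Tsinghua PyPI mirror. A minimal sanity check (hypothetical, not part of this commit; it assumes the container's python3 is the cp38 interpreter the wheels target) could confirm the pins after the script runs:

# hypothetical post-install check, not in the repository
import importlib.metadata as md

for pkg, want in [("psutil", "5.9.5"), ("accelerate", "0.20.3"), ("tabulate", "0.9.0")]:
    got = md.version(pkg)
    assert got == want, f"{pkg}: expected {want}, got {got}"
print("pinned packages OK")
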
18 changes: 3 additions & 15 deletions training/kunlunxin/glm-pytorch/config/environment_variables.sh
@@ -1,21 +1,9 @@
 # =================================================
 # Export variables
 # =================================================
 
-export BKCL_PCIE_RING=1
-export BKCL_TIMEOUT=1800
-# when using tree allreduce, the number of nodes must be a multiple of 2
-export BKCL_SOCKET_FORCE_TREE=1
-
-export XMLIR_D_XPU_L3_SIZE=32505856
-
-export BKCL_CCIX_RING=1
-export BKCL_FORCE_SYNC=1
-
-export ALLREDUCE_ASYNC=false
-export ALLREDUCE_FUSION=0
-
-export XMLIR_F_XPU_FC_GEMM_MODE=float
-export XMLIR_F_FAST_INDEX_PUT=true
+export XMLIR_D_XPU_L3_SIZE=66060288
 
 export XACC_ENABLE=1
+export XACC=1
+export XACC_ARGS="-L O0"
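
The trimmed configuration drops the BKCL and allreduce tuning knobs, enlarges XMLIR_D_XPU_L3_SIZE, and keeps only the XACC switches. A small hypothetical check (not in the repository; the variable names and values follow the new file, the launcher context is assumed) that the environment actually carries these values before training starts:

# hypothetical pre-launch check, not in the repository
import os

expected = {
    "XMLIR_D_XPU_L3_SIZE": "66060288",
    "XACC_ENABLE": "1",
    "XACC": "1",
    "XACC_ARGS": "-L O0",
}
missing = {k: v for k, v in expected.items() if os.environ.get(k) != v}
if missing:
    raise RuntimeError(f"XPU environment not set as expected: {missing}")
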
78 changes: 28 additions & 50 deletions training/kunlunxin/glm-pytorch/extern/trainer_adapter.py
@@ -2,82 +2,60 @@
 from torch import nn
 import torch.distributed as dist
 
+import torch_xmlir
+
 import config
 from optimizers import get_optimizer_param_groups
 from optimizers.loss_scaler import DynamicLossScaler
 from driver.dist_pytorch import main_proc_print
 
-import torch_xmlir
-import torch_xmlir.core.xpu_model as xm
 from torch_xmlir.optimizer import AdamW as Adam
-from torch_xmlir.nn.clip_grad import clip_grad_norm
-from torch_xmlir.distributed import DistributedDataParallel as XPUDDP
 
-from .converter import convert_model as _convert_model
-
-
-class XPUTorchDDP(XPUDDP):
-
-    def named_parameters(self, prefix: str = '', recurse: bool = True):
-        return self.module.named_parameters(prefix=prefix, recurse=recurse)
-
-    def state_dict(self, destination=None, prefix='', keep_vars=False):
-        sd = self.module.state_dict(destination, prefix, keep_vars)
-        return sd
-
-    def load_state_dict(self, state_dict, strict=True):
-        return self.module.load_state_dict(state_dict, strict=strict)
+from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP
+
+clip_grad_norm = torch.nn.utils.clip_grad_norm_
+
+from .converter import convert_model as _convert_model
 
 
 def convert_model(model: torch.nn.Module) -> torch.nn.Module:
     return _convert_model(model, config)
 
 
 def create_optimizer(model, args):
     param_groups = get_optimizer_param_groups(model)
     optimizer = Adam(param_groups,
                      lr=args.lr,
                      weight_decay=args.weight_decay,
                      betas=(args.adam_beta1, args.adam_beta2),
                      eps=args.adam_eps)
     main_proc_print(f'Optimizer = {optimizer.__class__.__name__}')
     return optimizer
 
 
 def model_to_fp16(model):
     return model
 
 
 def model_to_ddp(model: nn.Module) -> nn.Module:
     if dist.is_available() and dist.is_initialized():
-        model = XPUTorchDDP(model)
+        model = TorchDDP(model)
     return model
 
 
 def create_grad_scaler():
     return None
 
 
 def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model):
     args = config
 
-    def _clip_grad():
-        if args.clip_grad > 0:
-            clip_grad_norm(model.parameters(), args.clip_grad)
-
-    lm_loss.backward()
-    if step % args.gradient_accumulation_steps == 0:
-        allreduce_grads = reversed(
-            [p.grad.data for p in model.parameters() if p.grad is not None])
-        xm.optimizer_step(optimizer,
-                          barrier=True,
-                          post_allreduce_hook=_clip_grad,
-                          allreduce_average=True,
-                          allreduce_grads=allreduce_grads)
-        lr_scheduler.step()
-
-    if DynamicLossScaler._has_inf_or_nan(reduced_loss):
+    if not DynamicLossScaler._has_inf_or_nan(reduced_loss):
+        backward_step(optimizer, model, lm_loss, args)
+        if step % args.gradient_accumulation_steps == 0:
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad(set_to_none=True)
+    else:
         main_proc_print("Found NaN loss, skip backward")
 
+    torch_xmlir.xpu.empty_cache()
     return reduced_loss
+
+
+def backward_step(optimizer, model, lm_loss, args):
+    """Backward step."""
+
+    # Total loss.
+    loss = lm_loss
+
+    loss.backward()
+
+    # Clipping gradients helps prevent the exploding gradient.
+    if args.clip_grad > 0:
+        clip_grad_norm(model.parameters(), args.clip_grad)
+
+    return lm_loss
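
The rewritten adapter drops the custom XPUTorchDDP wrapper and the xm.optimizer_step path: the model is wrapped with the standard PyTorchDistributedDataParallel, gradients are produced and clipped in backward_step, the optimizer only steps on gradient-accumulation boundaries, and the whole backward pass is skipped when the reduced loss is NaN or Inf. A self-contained sketch of the same control flow against a plain torch model (illustrative only, not the FlagPerf code; the model, optimizer, and accumulation/clip settings here are made up):

# illustrative sketch of the new skip-on-NaN / gradient-accumulation flow
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accum_steps, clip_grad = 2, 1.0

for step, batch in enumerate(torch.randn(8, 4).split(2), start=1):
    loss = model(batch).pow(2).mean()
    if not (torch.isnan(loss) or torch.isinf(loss)):
        loss.backward()    # backward_step(): backprop first ...
        if clip_grad > 0:  # ... then clip, on every micro-step
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        if step % accum_steps == 0:  # step/zero only on accumulation boundaries
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
    else:
        print("Found NaN loss, skip backward")

The real adapter additionally steps the LR scheduler at the same boundary and calls torch_xmlir.xpu.empty_cache() after every call; both are omitted here for brevity.
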
1 change: 1 addition & 0 deletions training/run_benchmarks/config/test_conf.py
@@ -106,5 +106,6 @@
     # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2"
     # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "transformer_xl:pytorch:R300:1:8:1": "/raid/dataset/transformer_xl/",
+    # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
 }
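
The added (commented-out) line registers the GLM case for the kunlunxin R300 setup together with its dataset/checkpoint path. To actually run it, the entry would be uncommented inside the case dictionary of test_conf.py, roughly as in the following sketch (the dictionary name CASES and the key layout, model:framework:hardware:nnodes:nproc:repeats, are assumptions here, not taken from this diff):

# hypothetical excerpt of test_conf.py with the new case enabled
CASES = {
    "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
}
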
