[TorchAcc] cache the compiled results and remove some xla flags (#1160)
anw90 authored Jun 21, 2024
1 parent 2956815 · commit ab1d992
Showing 15 changed files with 55 additions and 25 deletions.
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Baichuan2-13B-Chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
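Each of these scripts follows the same pattern: per the commit title, the long XLA_FLAGS tuning line is removed, and a per-model XLA_PERSISTENT_CACHE_PATH is exported and created up front so XLA can reuse compiled programs across runs. Below is a minimal Python sketch of the same launch setup, assuming the swift CLI is installed; the --model_type value and the short argument list are illustrative, and the real scripts pass many more swift sft flags.

import os
import subprocess

# Per-model persistent compilation cache, mirroring the new shell lines.
cache_dir = "./output/compiled_cache/Baichuan2-13B-Chat"
os.makedirs(cache_dir, exist_ok=True)

env = dict(os.environ)
env.update({
    "USE_TORCHACC": "1",
    "XLA_PERSISTENT_CACHE_PATH": cache_dir,  # reuse compiled XLA programs across runs
    "XLA_IR_SHAPE_CACHE_SIZE": "100000000",
    "XLA_ALLOCATOR_FRACTION": "0.95",
    "XLA_EXPERIMENTAL": "nonzero:masked_select",
    "NPROC_PER_NODE": "2",
    "CUDA_VISIBLE_DEVICES": "0,1",
    "MASTER_PORT": "27829",
})

# Illustrative invocation only; see the full flag list in the script above.
subprocess.run(["swift", "sft", "--model_type", "baichuan2-13b-chat", "--sft_type", "lora"], env=env, check=True)
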
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Baichuan2-13B-Chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/chatglm3-6b
mkdir -p $XLA_PERSISTENT_CACHE_PATH


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/chatglm3-6b
mkdir -p $XLA_PERSISTENT_CACHE_PATH


NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
@@ -4,11 +4,13 @@

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Llama-2-13b-chat-ms
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
@@ -20,7 +22,7 @@ swift sft \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 16 \
--batch_size 14 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Llama-2-13b-chat-ms
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
@@ -20,7 +22,7 @@ swift sft \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 24 \
--batch_size 20 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
@@ -4,11 +4,12 @@

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
export XLA_COORDINATOR_PORT=12457

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-8B-Instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
@@ -3,11 +3,12 @@
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select
# export XLA_COORDINATOR_PORT=12457

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/Meta-Llama-3-8B-Instruct
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
@@ -2,12 +2,14 @@
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export TORCHACC_TRIM_GRAPH=1
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen1half-14b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=23797 \
@@ -5,10 +5,13 @@ DEBUG_PREFIX=qwen15_14b
DEBUG_PATH=torchacc_debug/qwen15/
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen1half-14b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

MASTER_PORT=23783 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
# export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen1half-32b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
@@ -3,10 +3,11 @@
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export XLA_FLAGS='--xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.97

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen-72b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH
# Note: You need to set the correct MASTER_ADDR, MASTER_PORT and NODE_RANK for each node.

MASTER_ADDR=127.0.0.1 \
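The note above says each node must export its own MASTER_ADDR, MASTER_PORT and NODE_RANK. A small illustrative Python check of that per-node environment; the 192.168.1.10 address and the concrete values are made up for the example.

import os

# Every node points at the same rank-0 host and port; only NODE_RANK differs per node.
required = {
    "MASTER_ADDR": "192.168.1.10",  # hypothetical address of the rank-0 node
    "MASTER_PORT": "27829",         # any free port, identical on all nodes
    "NODE_RANK": "0",               # 0 on the first node, 1 on the second, ...
}
for key, example in required.items():
    value = os.environ.get(key)
    if value is None:
        raise SystemExit(f"{key} is not set (e.g. export {key}={example})")
    print(f"{key}={value}")
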
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/qwen-72b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
@@ -18,7 +20,7 @@ swift sft \
--output_dir output_qwen_72b \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 4 \
--batch_size 8 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
@@ -3,11 +3,13 @@
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=1000000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

export XLA_PERSISTENT_CACHE_PATH=./output/compiled_cache/yi-34b-chat
mkdir -p $XLA_PERSISTENT_CACHE_PATH

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
@@ -18,7 +20,7 @@ swift sft \
--output_dir output \
--num_train_epochs 1 \
--max_length 2048 \
--batch_size 12 \
--batch_size 10 \
--use_flash_attn true \
--gradient_accumulation_steps 1 \
--gradient_checkpointing no \
swift/trainers/trainers.py (12 changes: 8 additions & 4 deletions)
@@ -201,20 +201,19 @@ def compute_loss(self, model, inputs, return_outputs=None):
loss = self.label_smoother(outputs, labels)
else:
loss = outputs['loss'] if isinstance(outputs, dict) else outputs[0]
if use_torchacc():
ta_trim_graph()
if labels is None:
labels = inputs['labels']

if self.sequence_parallel_size > 1:
from swift.trainers.xtuner import reduce_xtuner_sequence_parallel_loss
loss = reduce_xtuner_sequence_parallel_loss(loss, labels)

if labels is None:
labels = inputs['labels']
preds = outputs.logits.argmax(dim=2)[..., :-1]
labels = labels[..., 1:]
masks = labels != -100
acc_strategy = getattr(self.args, 'acc_strategy', 'token')
acc: Optional[Tensor] = None

if preds.shape != labels.shape:
pass
elif acc_strategy == 'sentence':
@@ -223,6 +222,11 @@ def compute_loss(self, model, inputs, return_outputs=None):
acc_list.append(torch.all(preds[i, m] == labels[i, m]).to(torch.int64).item())
acc = torch.tensor(acc_list, device=preds.device).float().mean()
else:
if use_torchacc():
ta_trim_graph()
preds = preds.to('cpu')
masks = masks.to('cpu')
labels = labels.to('cpu')
acc = (torch.masked_select(preds, masks) == torch.masked_select(labels, masks)).float().mean()
if model.training and acc is not None:
if 'acc' not in self._custom_metrics:
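The trainer change above moves the TorchAcc graph trim next to the accuracy computation and copies preds, masks and labels to CPU right before masked_select, so that dynamic-shape accuracy step runs on CPU instead of inside the traced XLA graph. Here is a standalone sketch of that accuracy path; token_accuracy and the on_torchacc flag are illustrative names rather than trainer API, and the real ta_trim_graph() call is only indicated by a comment.

from typing import Optional

import torch
from torch import Tensor


def token_accuracy(logits: Tensor, labels: Tensor, on_torchacc: bool = False) -> Optional[Tensor]:
    # Shift by one so preds[i] is compared with the next token, as compute_loss does.
    preds = logits.argmax(dim=2)[..., :-1]
    labels = labels[..., 1:]
    masks = labels != -100
    if preds.shape != labels.shape:
        return None
    if on_torchacc:
        # The real trainer calls ta_trim_graph() here to cut the lazy graph, then
        # moves everything to CPU so masked_select (a dynamic-shape op) does not
        # trigger extra XLA compilations.
        preds, masks, labels = preds.cpu(), masks.cpu(), labels.cpu()
    return (torch.masked_select(preds, masks) == torch.masked_select(labels, masks)).float().mean()
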
