From 1b3f1f547fb6e65241c8e9ccdd346409de8a73f4 Mon Sep 17 00:00:00 2001
From: jeromeku
Date: Tue, 4 Jun 2024 19:13:21 +0000
Subject: [PATCH] clean up profile.sh

---
 profile.sh | 142 ++++++++++++++++++----------------------------------
 1 file changed, 48 insertions(+), 94 deletions(-)

diff --git a/profile.sh b/profile.sh
index b2b1990..80264bf 100755
--- a/profile.sh
+++ b/profile.sh
@@ -1,97 +1,51 @@
-# Running below will result in a directory `Llama-2-7b_qlora-{local_rank}` with the following artifacts:
-# - Llama-2-7b_qlora-chrome-trace.json.gz - interactive trace that can be viewed using `chrome::tracing` or `perfetto`
-# - Llama-2-7b_qlora-key_averages.txt - sorted table of events, e.g.:
-# | Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | CPU Mem | Self CPU Mem | CUDA Mem | Self CUDA Mem | # of Calls | Source Location |
-# |---------------------------------------|------------|------------|-------------|------------|--------------|-------------|-------------|-------------|---------------|---------|--------------|----------|---------------|------------|----------------------------------------------------------------------------------|
-# | ProfilerStep* | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 4.816s | 44.60% | 4.816s | 963.233ms | 0 b | 0 b | 0 b | 0 b | 5 | |
-# | | | | | | | | | | | | | | | | train.py(962): fsdp_main |
-# | | | | | | | | | | | | | | | | torch/multiprocessing/spawn.py(75): _wrap |
-# | | | | | | | | | | | | | | | | multiprocessing/process.py(108): run |
-# | | | | | | | | | | | | | | | | multiprocessing/process.py(314): _bootstrap |
-# | FullyShardedDataParallel.forward | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 2.208s | 20.45% | 2.208s | 441.555ms | 0 b | 0 b | 0 b | 0 b | 5 | |
-# | | | | | | | | | | | | | | | | torch/nn/functional.py(2154): embedding |
-# | | | | | | | | | | | | | | | | torch/nn/modules/sparse.py(162): forward |
-# | | | | | | | | | | | | | | | | torch/nn/modules/module.py(1534): _call_impl |
-# | | | | | | | | | | | | | | | | nn.Module: Embedding_0 |
-# | aten::mm | 0.44% | 31.314ms | 0.69% | 48.739ms | 43.517us | 332.421ms | 3.08% | 337.208ms | 301.079us | 0 b | 0 b | 3.26 Gb | 3.26 Gb | 1120 | |
-# | | | | | | | | | | | | | | | | bitsandbytes/autograd/_functions.py(492): forward |
-# | | | | | | | | | | | | | | | | |
-# | | | | | | | | | | | | | | | | torch/autograd/function.py(582): apply |
-# | | | | | | | | | | | | | | | | bitsandbytes/autograd/_functions.py(559): matmul_4bit |
-# | MatMul4Bit | 2.81% | 198.511ms | 4.93% | 347.437ms | 310.212us | 284.169ms | 2.63% | 630.417ms | 562.872us | 0 b | 0 b | 3.26 Gb | -62.31 Gb | 1120 | |
-# | | | | | | | | | | | | | | | | torch/autograd/function.py(582): apply |
-# | | | | | | | | | | | | | | | | bitsandbytes/autograd/_functions.py(559): matmul_4bit |
-# | | | | | | | | | | | | | | | | bitsandbytes/nn/modules.py(442): forward |
-# | | | | | | | | | | | | | | | | torch/nn/modules/module.py(1534): _call_impl |
+#See PROFILING.md for documentation
-# - Llama-2-7b_qlora-memory-timeline.html - Stacked time series plot of memory use broken down by `Parameter`, `Gradients`, `Activations`, etc.
-# - Llama-2-7b_qlora-stacks.txt - Stack trace. See [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks).
-
-# Detailed `CLI` options:
-# - `profile` - whether to profile
-# - `profiling_outputs` - output directory for `torch.profiler` artifacts
-# - `export_trace` - enables exporting of interactive trace that can be viewed and analyzed using `chrome::tracing`
-# - `export_memory_timeline` - exports an HTML memory timeline which shows memory use by category (`parameters`, `activations`, `gradients`, etc.)
-# - `with_stack` - exports stack trace
-# - `with_shapes` - adds shapes of operators to the trace
-# - `{wait, warmup, active}_steps` - controls how many profiling steps are recorded:
-# - `wait_steps` - number of steps for the profiler to wait before starting to profile
-# - `warmup_steps` - number of steps for profiler to profile without recording
-# - `active_steps` - number of steps to record
-# See [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler.schedule) for further details.
-
-# The default schedule for the profiler is set such that only 2 steps of the each epoch are recorded (not counting `wait` and `warmup` steps which are not recorded).
-
-# Note that `with_stack` and `with_shapes` are overridden by `export_memory_timeline` since the memory profile requires these options to be `True`.
-
-#**IMPORTANT** There are issues with recording stack traces and exporting traces simultaneously (see this [issue](https://github.com/pytorch/pytorch/issues/113564)) depending on `python` version. The only combination I was able to get both to work at the same time was with `python=3.11.9` and `torch=2.3.0`.
-#Tested on `python=3.11.9 and torch=2.3.0``
-
-#"meta-llama/Llama-2-7b-hf"
+# Run profiler contiguously on a 5-step cycle: 4 warmup steps and 1 active (recording) step.
+python train.py \
+--model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \
+--gradient_accumulation_steps 2 \
+--batch_size 1 \
+--context_length 256 \
+--num_epochs 1 \
+--sharding_strategy full_shard \
+--precision bf16 \
+--train_type qlora \
+--use_gradient_checkpointing false \
+--use_cpu_offload false \
+--log_to stdout \
+--dataset dummy \
+--profile true \
+--export_trace true \
+--export_memory_timeline false \
+--with_stack true \
+--max_steps 20 \
+--repeat 0 \
+--warmup_steps 4 \
+--active_steps 1 \
+--profiling_frequency 5 \
+--profiling_output llama-test
 # Run for 1 cycle then stop profiling
 # python train.py \
-# --model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \
-# --gradient_accumulation_steps 2 \
-# --batch_size 1 \
-# --context_length 256 \
-# --num_epochs 1 \
-# --sharding_strategy full_shard \
-# --precision bf16 \
-# --train_type qlora \
-# --use_gradient_checkpointing false \
-# --use_cpu_offload false \
-# --log_to stdout \
-# --dataset dummy \
-# --profile true \
-# --export_trace true \
-# --export_memory_timeline false \
-# --with_stack true \
-# --max_steps 20 \
-# --repeat 0 \
-# --warmup_steps 4 \
-# --active_steps 1 \
-# --profiling_frequency 5 \
-# --profiling_output llama-test
-python train.py \
- --model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \
- --gradient_accumulation_steps 2 \
- --batch_size 1 \
- --context_length 256 \
- --num_epochs 1 \
- --sharding_strategy full_shard \
- --precision bf16 \
- --train_type qlora \
- --use_gradient_checkpointing false \
- --use_cpu_offload false \
- --log_to stdout \
- --dataset dummy \
- --profile true \
- --export_trace true \
- --export_memory_timeline true \
- --with_stack true \
- --num_epochs 1 \
- --max_steps 20 \
- --repeat 1 \
- --warmup_steps 1 \
- --active_steps 4 \
- --profiling_output llama-test2
\ No newline at end of file
+# --model_name "hf-internal-testing/tiny-random-LlamaForCausalLM" \
+# --gradient_accumulation_steps 2 \
+# --batch_size 1 \
+# --context_length 256 \
+# --num_epochs 1 \
+# --sharding_strategy full_shard \
+# --precision bf16 \
+# --train_type qlora \
+# --use_gradient_checkpointing false \
+# --use_cpu_offload false \
+# --log_to stdout \
+# --dataset dummy \
+# --profile true \
+# --export_trace true \
+# --export_memory_timeline true \
+# --with_stack true \
+# --num_epochs 1 \
+# --max_steps 20 \
+# --repeat 1 \
+# --warmup_steps 1 \
+# --active_steps 4 \
+# --profiling_output llama-test2
\ No newline at end of file
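
The `--warmup_steps`, `--active_steps`, `--repeat`, and `--profiling_frequency` flags used in the commands above configure the `torch.profiler` schedule described in PROFILING.md. Below is a minimal sketch of what the two command variants appear to request at the `torch.profiler` level; it assumes `train.py` forwards these flags to `torch.profiler.schedule` and derives the `wait` phase from the cycle length, and the `train_step` workload and trace file names are illustrative only, not taken from `train.py`.

```python
# Sketch of the profiler schedules the two commands appear to request.
# Assumption: train.py passes --warmup_steps/--active_steps/--repeat through to
# torch.profiler.schedule; wait = profiling_frequency - (warmup + active) here.
import torch
from torch.profiler import ProfilerActivity, profile, schedule

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

# First command: 5-step cycle, 4 warmup + 1 active, repeated for the whole run
# (repeat=0 means the cycle repeats until profiling stops).
contiguous_schedule = schedule(wait=0, warmup=4, active=1, repeat=0)

# Second (commented-out) command: 1 warmup + 4 active steps, one cycle only.
one_cycle_schedule = schedule(wait=0, warmup=1, active=4, repeat=1)

def trace_handler(prof):
    # Rough equivalent of --export_trace: dump a Chrome/Perfetto trace per cycle.
    prof.export_chrome_trace(f"trace_step{prof.step_num}.json")

def train_step():
    # Stand-in for one training step of train.py.
    x = torch.randn(256, 256)
    _ = x @ x

with profile(
    activities=activities,
    schedule=contiguous_schedule,
    on_trace_ready=trace_handler,
    with_stack=True,  # --with_stack true
) as prof:
    for _ in range(20):  # --max_steps 20
        train_step()
        prof.step()  # advance the wait/warmup/active schedule once per step
```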
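The artifacts listed in the deleted header comment (Chrome trace, `key_averages` table, memory-timeline HTML, and stacks file) are standard `torch.profiler` exports, now documented in PROFILING.md. The sketch below shows how artifacts of those kinds can be produced directly with `torch.profiler`; the output file names and the toy workload are illustrative and do not come from `train.py`.

```python
# Minimal sketch of exporting the artifact types the deleted comment described.
import torch
from torch.profiler import ProfilerActivity, profile

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(
    activities=activities,
    record_shapes=True,   # --with_shapes; also needed for the memory timeline
    profile_memory=True,  # needed for export_memory_timeline
    with_stack=True,      # --with_stack; needed for stacks and memory timeline
) as prof:
    x = torch.randn(512, 512)
    _ = x @ x  # stand-in workload

# Interactive trace viewable in chrome://tracing or Perfetto.
prof.export_chrome_trace("qlora-chrome-trace.json")

# Sorted table of events, as in the key_averages.txt artifact.
with open("qlora-key_averages.txt", "w") as f:
    f.write(prof.key_averages(group_by_stack_n=5).table(
        sort_by="self_cpu_time_total", row_limit=20))

# Stack traces (full source lines may additionally need the profiler's verbose
# experimental_config; see the export_stacks docs linked in the old comment).
prof.export_stacks("qlora-stacks.txt", metric="self_cpu_time_total")

# Stacked memory-use timeline (Parameter / Gradient / Activation, etc.).
if torch.cuda.is_available():
    prof.export_memory_timeline("qlora-memory-timeline.html", device="cuda:0")
```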