## Profiling

## Usage

**IMPORTANT**
There are known issues with recording stack traces and exporting traces simultaneously (see this [issue](https://github.com/pytorch/pytorch/issues/113564)), depending on the `python` version. The only combination with which I was able to get both working at the same time was `python=3.11.9` and `torch=2.3.0`.

Running the following:

```
python train.py \
--model_name "meta-llama/Llama-2-7b-hf" \
--train_type qlora \
--profile true \
--export_trace true \
--export_memory_timeline true \
--max_steps 10
```

will result in a directory `{model_name}_{train_type}-{local_rank}` with the following artifacts:

- `{model_name}-{train_type}-chrome-trace.json.gz` - interactive trace that can be viewed using `chrome://tracing` or `perfetto`
- `{model_name}-{train_type}-key_averages.txt` - sorted table of events, e.g.:

```
```

- `{model_name}-{train_type}-memory-timeline.html` - stacked time series plot of memory use broken down by `Parameter`, `Gradients`, `Activations`, etc.
- `{model_name}-{train_type}-stacks.txt` - stack trace. See [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks).

Detailed `CLI` options:

- `profile` - whether to profile
- `profiling_outputs` - output directory for `torch.profiler` artifacts
- `export_trace` - enables exporting of an interactive trace that can be viewed and analyzed using `chrome://tracing`
- `export_memory_timeline` - exports an HTML memory timeline which shows memory use by category (`parameters`, `activations`, `gradients`, etc.)
- `with_stack` - exports stack traces
- `with_shapes` - adds shapes of operators to the trace
- `{wait, warmup, active}_steps, repeat, profiling_frequency` - control the profiling schedule (see the sketch after this list):

- `wait_steps` - number of steps for the profiler to wait before starting to profile. Overridden if `repeat=0` (see note below).
- `warmup_steps` - number of steps for the profiler to warm up without recording
- `active_steps` - number of steps to record
- `repeat` - number of times to repeat the above cycle of `wait, warmup, active` if `repeat > 0`, else the cycle repeats indefinitely
- `profiling_frequency` - profiling frequency in steps. Only used if `repeat = 0`, in which case `wait_steps = profiling_frequency - (warmup_steps + active_steps)` so that the effective cycle length equals `profiling_frequency`. E.g., if `profiling_frequency=10`, `warmup_steps=2`, and `active_steps=1`, the profiler will wait 7 steps, warm up for 2, record for 1, then repeat.

**Note**: The simplest way to think about scheduling the profiler is in terms of two modes:

1. Set `repeat` to the total number of desired profiling cycles. For example, if `wait=1`, `warmup=1`, `active=1`, and `repeat=1`, the profiler will wait for 1 step, warm up for 1, record for 1, then stop.
2. Set `repeat` to `0` and `profiling_frequency` to the cycle length. E.g., with `repeat=0`, `profiling_frequency=10`, `warmup=2`, and `active=1`, `wait` will be automatically set to `profiling_frequency - (warmup + active) = 7`. The profiler will then continuously execute the following cycle: wait for 7 steps, warm up for 2, record for 1.

See the [docs](https://pytorch.org/docs/stable/profiler.html#torch.profiler.schedule) for further details.

- `max_steps` - maximum number of batches per epoch. E.g., with `num_epochs=1`, training stops after `max_steps` batches. Note that this is automatically adjusted to accommodate the profiler schedule: if `max_steps < wait_steps + warmup_steps + active_steps`, it will automatically be set to `wait_steps + warmup_steps + active_steps` so that the profiler can run for at least one full cycle.
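
To make the two scheduling modes above concrete, here is a minimal sketch using `torch.profiler.schedule` directly (the step counts are just the example values from above, not defaults):

```
from torch.profiler import schedule

# Mode 1: a fixed number of cycles - wait 1 step, warm up for 1, record 1, then stop.
finite_schedule = schedule(wait=1, warmup=1, active=1, repeat=1)

# Mode 2: repeat=0 cycles indefinitely with a 10-step period:
# wait = profiling_frequency - (warmup + active) = 10 - (2 + 1) = 7
periodic_schedule = schedule(wait=7, warmup=2, active=1, repeat=0)
```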

#### Additional Notes

The default profiler schedule continuously executes a 10-step cycle: wait for 7 steps, warm up for 2, record for 1.

`with_stack` and `with_shapes` are overridden by `export_memory_timeline`, since the memory profile requires these options to be `True`.
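
This override amounts to a couple of lines (a minimal sketch mirroring the logic in the profiling utilities in this commit; the `args` keys correspond to the CLI options above):

```
# export_memory_timeline needs stacks, shapes, and memory tracking to be recorded
with_stack = args["with_stack"] or args["export_memory_timeline"]
with_shapes = args["with_shapes"] or args["export_memory_timeline"]
profile_memory = args["export_memory_timeline"]
```
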
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
import logging
import os
import time
from functools import partial

import torch
import torch.distributed

WARMUP = 3

logger = logging.getLogger()

# Adapted from https://github.com/pytorch/torchtitan


def trace_handler(
    prof,
    rank,
    export_memory_timeline,
    output_dir,
    metric="self_cuda_time_total",
    with_stack=True,
    group_by_stack=0,
    group_by_input_shapes=False,
    row_limit=25,
):
    curr_trace_dir_name = str(prof.step_num)
    curr_trace_dir = os.path.join(output_dir, curr_trace_dir_name)
    if not os.path.exists(curr_trace_dir):
        os.makedirs(curr_trace_dir, exist_ok=True)

    # Export chrome / tensorboard trace
    logger.info(f"Dumping traces at step {prof.step_num}")
    begin = time.monotonic()
    prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")
    logger.info(f"Finished dumping traces in {time.monotonic() - begin:.2f} seconds")

    # Construct the memory timeline file
    if export_memory_timeline:
        prof.export_memory_timeline(f"{curr_trace_dir}/rank{rank}_memory-timeline.html")

    # Dump stack traces
    if with_stack:
        prof.export_stacks(f"{curr_trace_dir}/rank{rank}_stacks.txt", metric=metric)

    # Export event averages
    key_avgs = prof.key_averages(
        group_by_input_shape=group_by_input_shapes, group_by_stack_n=group_by_stack
    ).table(sort_by=metric, row_limit=row_limit)
    with open(f"{curr_trace_dir}/rank{rank}_key_averages.txt", "w") as f:
        print(key_avgs, file=f)
    if rank == 0:
        print(f"Saving profiling results to {curr_trace_dir}")

    # TODO: Is this necessary?
    torch.distributed.barrier()


@contextlib.contextmanager
def profiling_context(args, rank, *, global_step: int = 0):
    enable_profiling = args["profile"]

    if enable_profiling:
        model_name = args["model_name"].split("/")[-1]
        train_type = args["train_type"]
        output_dir = args["profiling_output"] if args["profiling_output"] else f"./{model_name}_{train_type}"

        logger.info(f"Profiling enabled. Traces will be saved at {output_dir}")

        if not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        warmup = args["warmup_steps"]
        active = args["active_steps"]
        repeat = args["repeat"]

        if repeat == 0:
            steps_per_cycle = args["profiling_frequency"]
            wait = steps_per_cycle - (active + warmup)
        else:
            wait = args["wait_steps"]
            steps_per_cycle = wait + warmup + active
        assert (
            wait >= 0
        ), "profiling_frequency must be greater than or equal to warmup_steps + active_steps"
        logger.info(f"Profiler schedule - steps per cycle: {steps_per_cycle} wait: {wait} warmup: {warmup} active: {active} repeat: {repeat if repeat != 0 else 'inf'}")

        profile_memory = args["export_memory_timeline"]
        export_memory_timeline = args["export_memory_timeline"]
        with_stack = args["with_stack"] or args["export_memory_timeline"]
        with_shapes = args["with_shapes"] or export_memory_timeline

        callback = partial(
            trace_handler,
            rank=rank,
            export_memory_timeline=export_memory_timeline,
            output_dir=output_dir,
            with_stack=with_stack,
            group_by_input_shapes=with_shapes,
            group_by_stack=5 if with_stack else 0,
        )

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            with_stack=with_stack,
            profile_memory=profile_memory,
            record_shapes=with_shapes,
            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat),
            on_trace_ready=callback,
            experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True) if with_stack else None,
        ) as torch_profiler:
            yield torch_profiler
    else:
        class FakeProfiler:
            """
            Fake profiler object when profiling is not enabled.
            """

            def __enter__(self):
                return self

            def __exit__(self, *args, **kwargs):
                pass

            def step(self):
                pass

        yield FakeProfiler()
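

# --- Usage sketch (illustrative) ---
# A hypothetical training loop showing how `profiling_context` is expected to be
# used. `args`, `model`, `optimizer`, and `dataloader` are placeholders; only the
# context-manager/`step()` protocol comes from the code above.
#
#   with profiling_context(args, rank=rank) as prof:
#       for step_idx, batch in enumerate(dataloader):
#           loss = model(**batch).loss
#           loss.backward()
#           optimizer.step()
#           optimizer.zero_grad()
#           prof.step()  # advance the profiler schedule once per training step
#           if step_idx >= args["max_steps"]:
#               break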