logger = logging.getLogger(__name__)


-def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1):
+def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1, cpu: bool = False):
    def trace_fn(
        p: torch.profiler.profile,
    ):
        averages = p.key_averages()
+        var_name = f"self_{'cpu' if cpu else 'cuda'}_time_total"
        if full_trace:
            # Show every GPU op.
            # Exclude CPU cuda ops to shorten the table.
            events = torch.autograd.profiler.EventList(
-                [evt for evt in p.profiler.function_events if evt.self_cuda_time_total > 0]
+                [evt for evt in p.profiler.function_events if getattr(evt, var_name) > 0]
            )
            log_rank_n(events.table(row_limit=-1, max_src_column_width=1000), logger.info, rank)

        if show_op_names:
            # Show non-cropped names, in the same order as in the table.
            averages_sorted = torch.autograd.profiler.EventList(
-                sorted(averages, key=lambda evt: evt.self_cuda_time_total, reverse=True)
+                sorted(averages, key=lambda evt: getattr(evt, var_name), reverse=True)
            )
            for entry in averages_sorted:
                log_rank_n(entry.key, logger.info, rank)

        # Try to avoid name cropping, still hard-coded to max 55 characters
-        log_rank_n(
-            averages.table(sort_by="self_cuda_time_total", row_limit=-1, max_src_column_width=1000), logger.info, rank
-        )
+        log_rank_n(averages.table(sort_by=var_name, row_limit=-1, max_src_column_width=1000), logger.info, rank)
+
+        # Store results for future use.
+        p.bc_profile_result = p.profiler.function_events

    return trace_fn

@@ -45,6 +47,7 @@ def get_profiler(
    cycles: int,
    full_trace: bool = False,
    show_op_names: bool = False,
+    cpu=False,
) -> Union[torch.profiler.profile, contextlib.nullcontext]:
    schedule = torch.profiler.schedule(
        # Warmup is a must if measuring speed as it's when all the optimizations are performed
@@ -57,6 +60,7 @@ def get_profiler(
    )
    return torch.profiler.profile(
        schedule=schedule,
-        activities=[torch.profiler.ProfilerActivity.CUDA],
-        on_trace_ready=get_trace_fn(full_trace, show_op_names),
+        activities=[torch.profiler.ProfilerActivity.CPU if cpu else torch.profiler.ProfilerActivity.CUDA],
+        on_trace_ready=get_trace_fn(full_trace, show_op_names, cpu=cpu),
+        with_modules=True,
    )
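
For reference, a minimal standalone sketch of the pattern this patch adopts: a single `cpu` switch picks both the profiler activity and the per-op self-time attribute used for filtering and sorting. The schedule values, the toy matmul workload, and the callback below are illustrative assumptions, not code from this repository.

import torch

cpu = True  # assumed switch: profile CPU ops here; False would profile CUDA kernels instead
var_name = f"self_{'cpu' if cpu else 'cuda'}_time_total"

def on_trace_ready(p: torch.profiler.profile) -> None:
    # Sort the aggregated table by the self time matching the profiled device.
    print(p.key_averages().table(sort_by=var_name, row_limit=20))

with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=2, active=3),
    activities=[torch.profiler.ProfilerActivity.CPU if cpu else torch.profiler.ProfilerActivity.CUDA],
    on_trace_ready=on_trace_ready,
    with_modules=True,
) as p:
    for _ in range(6):
        torch.randn(512, 512) @ torch.randn(512, 512)  # toy workload, profiled on CPU here
        p.step()  # advance the torch.profiler schedule; the callback fires after the active window

The same attribute name drives both the event filter and the table sort, which is what lets `get_trace_fn` serve CPU and CUDA runs without duplicating the reporting logic.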