Skip to content

Commit 6d6aa01

Browse files
authored
Add support for eager mode performance (#1539)
* Add support for eager mode performance Summary: Added "compile" field to "extra_info" that allows us to record eager mode performance as well context is eager, eager + compile, eager + compile + autoquant can all have performance improvements/changes over time, so we want to track: (1) eager perf on some previous date (configurable by user) (2) current eager perf (3) current compile perf (4) current autoquant + compile perf Test Plan: tested locally: https://gist.github.com/jerryzh168/2a15322b0c8f40f35e52956837c67fec Reviewers: Subscribers: Tasks: Tags: * move min_sqnr * format * remove redundant headers * add upload_to_s3 script * format
1 parent 24a78fe commit 6d6aa01

File tree

5 files changed

+116
-16
lines changed

5 files changed

+116
-16
lines changed

examples/sam2_amg_server/server.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ def set_autoquant(mask_generator, autoquant_type, min_sqnr):
413413
mask_generator.predictor._transforms_device = mask_generator.predictor.device
414414
torch.set_float32_matmul_precision("high")
415415
# NOTE: this fails when we run
416-
# python server.py ~/checkpoints/sam2 large --port 8000 --host localhost --fast --use_autoquant --unittest
416+
# python server.py ~/checkpoints/sam2 large --port 8000 --host localhost --fast --autoquant_type autoquant --unittest
417417
# https://gist.github.com/jerryzh168/d337cb5de0a1dec306069fe48ac8225e
418418
# mask_generator.predictor.model.sam_mask_decoder = autoquant(mask_generator.predictor.model.sam_mask_decoder, qtensor_class_list=DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)
419419

@@ -508,7 +508,7 @@ def main(
508508

509509
# since autoquant is replicating what furious mode is doing, don't use these two together
510510
if autoquant_type is not None:
511-
assert not furious, "use autoquant can't be used together with furious"
511+
assert not furious, "autoquant can't be used together with furious"
512512
set_autoquant(mask_generator, autoquant_type, min_sqnr)
513513

514514
with open("dog.jpg", "rb") as f:
@@ -568,10 +568,22 @@ def main(
568568
benchmark_fn(image_tensors_to_masks, random_images, mask_generator)
569569

570570
if output_json_path:
571-
headers = ["name", "dtype", "device", "arch", "metric", "actual", "target"]
571+
headers = [
572+
"name",
573+
"dtype",
574+
"min_sqnr",
575+
"compile",
576+
"device",
577+
"arch",
578+
"metric",
579+
"actual",
580+
"target",
581+
]
572582
name = "sam2-" + model_type
573583
arch = get_arch_name()
574584
dtype = autoquant_type or "noquant"
585+
# boolean flag to indicate whether it's eager or compile
586+
compile = fast
575587
(
576588
avg_time_per_run,
577589
max_memory_allocated_bytes,
@@ -580,24 +592,19 @@ def main(
580592
memory_result = [
581593
name,
582594
dtype,
595+
min_sqnr,
596+
compile,
583597
device,
584598
arch,
585599
"memory(MiB)",
586600
max_memory_allocated_bytes,
587601
None,
588602
]
589-
memory_percent_result = [
590-
name,
591-
dtype,
592-
device,
593-
arch,
594-
"memory(%)",
595-
max_memory_allocated_percentage,
596-
None,
597-
]
598603
performance_result = [
599604
name,
600605
dtype,
606+
min_sqnr,
607+
compile,
601608
device,
602609
arch,
603610
"time_s(avg)",
@@ -610,7 +617,6 @@ def main(
610617
else write_json_result_ossci
611618
)
612619
write_json_result(output_json_path, headers, memory_result)
613-
write_json_result(output_json_path, headers, memory_percent_result)
614620
write_json_result(output_json_path, headers, performance_result)
615621

616622
if profile is not None:

scripts/upload_to_s3.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import io
2+
import json
3+
import os
4+
from functools import lru_cache
5+
from typing import Any
6+
7+
import boto3
8+
9+
10+
@lru_cache
def get_s3_resource() -> Any:
    """Create the boto3 S3 resource once and reuse it for every upload."""
    resource = boto3.resource("s3")
    return resource
13+
14+
15+
def upload_to_s3(
    bucket_name: str,
    key: str,
    json_path: str,
) -> None:
    """Upload newline-delimited JSON benchmark entries to S3.

    Reads the file at ``json_path`` (extension normalized to ``.json``),
    parses one JSON document per line, and uploads them as a single
    newline-delimited JSON body to ``s3://<bucket_name>/<key>``.

    Args:
        bucket_name: Target S3 bucket.
        key: Object key within the bucket.
        json_path: Path to the newline-delimited JSON file to upload.

    Upload failures are printed and swallowed (best-effort), so a broken
    network does not fail the benchmark run that produced the file.
    """
    print(f"Writing {json_path} documents to S3")
    # NOTE(review): this re-derives the path with a .json extension; it
    # assumes json_path points at a .json file (or a sibling with another
    # extension) — confirm against callers.
    data = []
    with open(f"{os.path.splitext(json_path)[0]}.json", "r") as f:
        for line in f:
            data.append(json.loads(line))

    # Re-serialize as newline-delimited JSON for the upload body.
    body = io.StringIO()
    for benchmark_entry in data:
        json.dump(benchmark_entry, body)
        body.write("\n")

    try:
        get_s3_resource().Object(bucket_name, key).put(
            Body=body.getvalue(),
            ContentType="application/json",
        )
    except Exception as e:
        # Best-effort: report the failure and return instead of raising.
        print("fail to upload to s3:", e)
        return
    print("Done!")
43+
44+
45+
if __name__ == "__main__":
    import argparse
    import datetime
    import subprocess

    parser = argparse.ArgumentParser(
        description="Upload benchmark result json file to clickhouse"
    )
    parser.add_argument(
        "--json-path",
        type=str,
        help="json file path to upload to clickhouse",
        required=True,
    )
    args = parser.parse_args()

    # Timestamp for midnight (00:00) of today, so uploads are bucketed by day.
    today = datetime.datetime.combine(datetime.date.today(), datetime.time.min)
    today_timestamp = str(int(today.timestamp()))
    print("Today timestamp:", today_timestamp)

    # Short hostname (`hostname -s`) identifies the benchmark machine in the
    # object key.
    output = subprocess.check_output(["hostname", "-s"])
    hostname = output.decode("utf-8").strip()
    upload_to_s3(
        "ossci-benchmarks",
        f"v3/pytorch/ao/{hostname}/torchao-models-{today_timestamp}.json",
        args.json_path,
    )

torchao/_models/llama/generate.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1028,6 +1028,7 @@ def callback(x):
10281028
"name",
10291029
"dtype",
10301030
"min_sqnr",
1031+
"compile",
10311032
"device",
10321033
"arch",
10331034
"metric",
@@ -1037,11 +1038,22 @@ def callback(x):
10371038
name = checkpoint_path.parent.name
10381039
arch = get_arch_name()
10391040
dtype = quantization or "noquant"
1040-
memory_result = [name, dtype, min_sqnr, device, arch, "mem/s", bandwidth, None]
1041+
memory_result = [
1042+
name,
1043+
dtype,
1044+
min_sqnr,
1045+
compile,
1046+
device,
1047+
arch,
1048+
"mem/s",
1049+
bandwidth,
1050+
None,
1051+
]
10411052
performance_result = [
10421053
name,
10431054
dtype,
10441055
min_sqnr,
1056+
compile,
10451057
device,
10461058
arch,
10471059
"tok/s",

torchao/_models/sam/eval_combo.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,7 @@ def mlp_only(mod, name):
642642
"name",
643643
"dtype",
644644
"min_sqnr",
645+
"compile",
645646
"device",
646647
"arch",
647648
"metric",
@@ -651,10 +652,13 @@ def mlp_only(mod, name):
651652
name = sam_model_type
652653
arch = get_arch_name()
653654
dtype = compress or "noquant"
655+
# boolean flag to indicate whether compile is used
656+
compile = use_compile != "False"
654657
memory_result = [
655658
name,
656659
dtype,
657660
min_sqnr,
661+
compile,
658662
device,
659663
arch,
660664
"memory(MiB)",
@@ -665,6 +669,7 @@ def mlp_only(mod, name):
665669
name,
666670
dtype,
667671
min_sqnr,
672+
compile,
668673
device,
669674
arch,
670675
"img_s(avg)",

torchao/_models/utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@ def write_json_result_ossci(output_json_path, headers, row):
3030
"name": "TorchAO benchmark",
3131
"mode": "inference",
3232
"dtype": mapping_headers["dtype"],
33-
"min_sqnr": mapping_headers["min_sqnr"],
3433
"extra_info": {
3534
"device": mapping_headers["device"],
3635
"arch": mapping_headers["arch"],
36+
"min_sqnr": mapping_headers["min_sqnr"],
37+
# True means compile is enabled, False means eager mode
38+
"compile": mapping_headers["compile"],
3739
},
3840
},
3941
"model": {
@@ -80,10 +82,12 @@ def write_json_result_local(output_json_path, headers, row):
8082
"name": "TorchAO benchmark",
8183
"mode": "inference",
8284
"dtype": mapping_headers["dtype"],
83-
"min_sqnr": mapping_headers["min_sqnr"],
8485
"extra_info": {
8586
"device": mapping_headers["device"],
8687
"arch": mapping_headers["arch"],
88+
"min_sqnr": mapping_headers["min_sqnr"],
89+
# True means compile is enabled, False means eager mode
90+
"compile": mapping_headers["compile"],
8791
},
8892
},
8993
"model": {

0 commit comments

Comments
 (0)