
Commit ff9809a

Merge branch 'master' into msaroufim-patch-7

2 parents 3aec11f + 067fd91

File tree

4 files changed: +191 −6 lines

.github/workflows/benchmark_nightly.yml

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,8 @@ jobs:
           if_no_artifact_found: ignore
           path: /tmp/ts_artifacts
           name: ${{ matrix.hardware }}_benchmark_validation
+      - name: Validate Benchmark result
+        run: python benchmarks/validate_report.py --input-artifacts-dir /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
       - name: Update benchmark artifacts for auto validation
         run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
       - name: Upload the updated benchmark artifacts for auto validation
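
For reference, a minimal sketch of what the new CI step runs, reproduced locally with Python's subprocess module. The "cpu" value and the artifacts path are placeholders standing in for ${{ matrix.hardware }} and the job's download location, and the command is assumed to be issued from the repository root.

import subprocess

# Placeholder for the ${{ matrix.hardware }} value supplied by the workflow matrix.
hardware = "cpu"

# Mirrors the new "Validate Benchmark result" step; check=True makes a
# non-zero exit (a validation failure) raise CalledProcessError, just as
# the CI job would fail.
subprocess.run(
    [
        "python",
        "benchmarks/validate_report.py",
        "--input-artifacts-dir",
        f"/tmp/ts_artifacts/{hardware}_benchmark_validation",
    ],
    check=True,
)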

benchmarks/auto_benchmark.py

Lines changed: 16 additions & 6 deletions
@@ -17,9 +17,10 @@


 class BenchmarkConfig:
-    def __init__(self, yaml_dict, skip_ts_install):
+    def __init__(self, yaml_dict, skip_ts_install, skip_upload):
         self.yaml_dict = yaml_dict
         self.skip_ts_install = skip_ts_install
+        self.skip_upload = skip_upload
         self.bm_config = {}
         yesterday = datetime.date.today() - datetime.timedelta(days=1)
         self.bm_config["version"] = "torchserve-nightly=={}.{}.{}".format(
@@ -89,9 +90,9 @@ def load_config(self):
                 self.models(v)
             elif k == "hardware":
                 self.hardware(v)
-            elif k == "metrics_cmd":
+            elif k == "metrics_cmd" and not self.skip_upload:
                 self.metrics_cmd(v)
-            elif k == "report_cmd":
+            elif k == "report_cmd" and not self.skip_upload:
                 report_cmd = v

         self.bm_config["model_config_path"] = (
@@ -110,12 +111,12 @@ def load_config(self):
             print("{}={}".format(k, v))


-def load_benchmark_config(bm_config_path, skip_ts_install):
+def load_benchmark_config(bm_config_path, skip_ts_install, skip_upload):
     yaml = ruamel.yaml.YAML()
     with open(bm_config_path, "r") as f:
         yaml_dict = yaml.load(f)

-    benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install)
+    benchmark_config = BenchmarkConfig(yaml_dict, skip_ts_install, skip_upload)
     benchmark_config.load_config()

     return benchmark_config.bm_config
@@ -285,14 +286,23 @@ def main():
         action="store",
         help="true: skip torchserve installation. default: true",
     )
+    parser.add_argument(
+        "--skip_upload",
+        help="true: skip uploading commands. default: false",
+    )

     arguments = parser.parse_args()
     skip_ts_config = (
         False
         if arguments.skip is not None and arguments.skip.lower() == "false"
         else True
     )
-    bm_config = load_benchmark_config(arguments.input, skip_ts_config)
+    skip_upload = (
+        True
+        if arguments.skip_upload is not None and arguments.skip_upload.lower() == "true"
+        else False
+    )
+    bm_config = load_benchmark_config(arguments.input, skip_ts_config, skip_upload)
     benchmark_env_setup(bm_config, skip_ts_config)
     run_benchmark(bm_config)
     clean_up_benchmark_env(bm_config)
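
A standalone sketch of how the new --skip_upload flag is interpreted: the value is treated as a case-insensitive string, and only an explicit "true" disables the metrics_cmd/report_cmd upload steps. The parse_args list below is illustrative; on the real CLI the value comes from sys.argv.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--skip_upload",
    help="true: skip uploading commands. default: false",
)
# Illustrative input standing in for a real command line.
args = parser.parse_args(["--skip_upload", "TRUE"])

# Anything other than the literal string "true" (case-insensitive), or
# omitting the flag entirely, leaves uploads enabled.
skip_upload = args.skip_upload is not None and args.skip_upload.lower() == "true"
print(skip_upload)  # True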

benchmarks/utils/report.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+import csv
+
+METRICS_VALIDATED = [
+    "TS throughput",
+    "TS latency P50",
+    "TS latency P90",
+    "TS latency P99",
+    "Model_p50",
+    "Model_p90",
+    "Model_p99",
+    "memory_percentage_mean",
+    "gpu_memory_used_mean",
+    "cpu_percentage_mean",
+    "gpu_percentage_mean",
+]
+
+
+# Acceptable metric deviation needs more sophisticated logic.
+# Example: for latencies in 2 digits, 50% might be acceptable;
+# for 3 digit latencies, 20-30% might be the right value.
+# For cpu_memory < 15%, 50% deviation works, but for CPU > 40%, 10-15%
+# might be the right value.
+ACCEPTABLE_METRIC_DEVIATION = 0.3
+
+
+class Report:
+    def __init__(self, deviation=0, num_reports=0):
+        self.properties = {}
+        self.mode = None
+        self.throughput = 0
+        self.batch_size = 0
+        self.workers = 0
+        self.deviation = deviation
+        self.num_reports = num_reports
+
+    def _get_mode(self, csv_file):
+        cfg = csv_file.split("/")[-2]
+        cfg = cfg.split("_")
+        mode = cfg[0] + "_" + cfg[1]
+        self.mode = mode
+
+    def read_csv(self, csv_file):
+        with open(csv_file, newline="") as f:
+            reader = csv.DictReader(f)
+            for k, v in next(reader).items():
+                if k in METRICS_VALIDATED:
+                    self.properties[k] = float(v)
+        self._get_mode(csv_file)
+
+    def update(self, report):
+        for property in self.properties:
+            # sum the properties to find the mean later
+            self.properties[property] += report.properties[property]
+
+    def mean(self):
+        for k, v in self.properties.items():
+            self.properties[k] = v / self.num_reports
+
+
+def metric_valid(key, obs_val, exp_val, threshold):
+    # In case of throughput, higher is better.
+    # In case of memory, lower is better.
+    # We ignore lower values for memory-related metrics.
+    lower = False
+    if "throughput" not in key:
+        lower = True
+    return check_if_within_threshold(exp_val, obs_val, threshold) or (
+        (obs_val < exp_val and lower)
+    )
+
+
+def check_if_within_threshold(value1, value2, threshold):
+    if float(value1) == 0.0:
+        return True
+    return abs((value1 - value2) / float(value1)) <= threshold
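
A small sketch of how the deviation check behaves, assuming it is run from the benchmarks/ directory so that utils.report is importable; the numeric values are made up for illustration.

from utils.report import check_if_within_threshold, metric_valid

# 105 is within 30% of the expected 100, so a latency-style metric passes.
print(check_if_within_threshold(100.0, 105.0, 0.3))  # True

# Throughput must stay within the threshold band; a 50% drop fails.
print(metric_valid("TS throughput", 50.0, 100.0, 0.3))  # False

# For memory/latency metrics, an observed value lower than the baseline is
# accepted even when it falls outside the threshold.
print(metric_valid("memory_percentage_mean", 10.0, 40.0, 0.3))  # True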

benchmarks/validate_report.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+import argparse
+import os
+
+from utils.report import (
+    ACCEPTABLE_METRIC_DEVIATION,
+    METRICS_VALIDATED,
+    Report,
+    metric_valid,
+)
+from utils.update_artifacts import (
+    BENCHMARK_ARTIFACTS_PATH,
+    BENCHMARK_REPORT_FILE,
+    BENCHMARK_REPORT_PATH,
+)
+
+
+def validate_reports(artifacts_dir, report_dir, deviation):
+    # Read baseline reports
+    baseline_reports = {}
+    num_reports = len(os.listdir(artifacts_dir))
+    for _d in sorted(os.listdir(artifacts_dir)):
+        dir = os.path.join(artifacts_dir, _d)
+        for subdir in sorted(os.listdir(dir)):
+            csv_file = os.path.join(dir, subdir, BENCHMARK_REPORT_FILE)
+
+            report = Report(deviation, num_reports)
+            report.read_csv(csv_file)
+            if subdir not in baseline_reports:
+                baseline_reports[subdir] = report
+            else:
+                baseline_reports[subdir].update(report)
+
+    # Get the mean value of each of the properties for every report
+    for model, report in baseline_reports.items():
+        report.mean()
+        baseline_reports[model] = report
+
+    # Read generated reports
+    generated_reports = {}
+    for subdir in sorted(os.listdir(report_dir)):
+        if os.path.isdir(os.path.join(report_dir, subdir)):
+            csv_file = os.path.join(report_dir, subdir, BENCHMARK_REPORT_FILE)
+            report = Report()
+            report.read_csv(csv_file)
+            generated_reports[subdir] = report
+
+    # Compare generated reports with baseline reports
+    error = False
+    for model, report in generated_reports.items():
+        for key in METRICS_VALIDATED:
+            if not metric_valid(
+                key,
+                report.properties[key],
+                baseline_reports[model].properties[key],
+                baseline_reports[model].deviation,
+            ):
+                print(
+                    f"Error while validating {key} for model: {model}, "
+                    f"Expected value: {baseline_reports[model].properties[key]:.2f}, "
+                    f"Observed value: {report.properties[key]:.2f}"
+                )
+                error = True
+        if not error:
+            print(f"Model {model} successfully validated")
+
+    if error:
+        raise Exception("Failures in benchmark validation")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--input-artifacts-dir",
+        help="directory where benchmark artifacts have been saved",
+        type=str,
+        default=BENCHMARK_ARTIFACTS_PATH,
+    )
+
+    parser.add_argument(
+        "--input-report-dir",
+        help="directory where current benchmark report is saved",
+        type=str,
+        default=BENCHMARK_REPORT_PATH,
+    )
+
+    parser.add_argument(
+        "--deviation",
+        help="acceptable variation in metrics values",
+        type=float,
+        default=ACCEPTABLE_METRIC_DEVIATION,
+    )
+    args = parser.parse_args()
+    validate_reports(args.input_artifacts_dir, args.input_report_dir, args.deviation)
+
+
+if __name__ == "__main__":
+    main()
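
The validator can also be driven programmatically rather than through the CLI. A minimal sketch, assuming it is run from the benchmarks/ directory, with hypothetical placeholder paths for the baseline artifacts and the current report:

from utils.report import ACCEPTABLE_METRIC_DEVIATION
from validate_report import validate_reports

# Raises an Exception if any validated metric falls outside the
# acceptable deviation from the baseline mean.
validate_reports(
    artifacts_dir="/tmp/ts_artifacts/cpu_benchmark_validation",  # baseline runs (placeholder)
    report_dir="/tmp/ts_benchmark_report",                       # current run (placeholder)
    deviation=ACCEPTABLE_METRIC_DEVIATION,
)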
