make github-action-benchmark look for empty files and re-enable remote push job
neuralmagic · Mar 26, 2024 · c1d3e2a · c1d3e2a · github-actions · Mar 26, 2024
1 parent b335bf7
commit c1d3e2a
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 88 deletions.
diff --git a/.github/actions/nm-github-action-benchmark/action.yml b/.github/actions/nm-github-action-benchmark/action.yml
@@ -25,6 +25,13 @@ inputs:
         - 'true'
         - 'false'
     required: true
+  reporting_enabled:
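+    description: "When set to true, do 3 things. 1. Always add a commit comment comparing the current benchmark with the previous one. 2. Mark the workflow as failed when a regression alert is triggered. 3. Add a commit comment describing what triggered the alert."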
+    type: choice
+    options:
+        - 'true'
+        - 'false'
+    required: true
   github_token:
     description: "secrets.GITHUB_TOKEN from the caller"
     required: true
@@ -44,12 +51,12 @@ runs:
         # Push and deploy to Github pages automatically
         auto-push: ${{ inputs.auto_push == 'true' }}
         # Add a commit comment comparing the current benchmark with the previous.
-        comment-always: true
+        comment-always: ${{ inputs.reporting_enabled == 'true' }}
         # Create an alert when some value has regressed more than 10% 
         alert-threshold: "110%"
         # Mark the workflow as a failure when some alert is triggered
-        fail-on-alert: true
+        fail-on-alert: ${{ inputs.reporting_enabled == 'true' }}
         # Add a commit comment describing what triggered the alert
-        comment-on-alert: true
+        comment-on-alert: ${{ inputs.reporting_enabled == 'true' }}
         # TODO (varun): Is this a reasonable number ? 
         max-items-in-chart: 50
diff --git a/.github/actions/nm-produce-gha-benchmark-json/action.yml b/.github/actions/nm-produce-gha-benchmark-json/action.yml
@@ -10,6 +10,8 @@ inputs:
   smaller_is_better_output_file_path:
     description: 'Path to a file where the GHA CustomSmallerIsBetter JSON is to be stored'
     required: true
+  observation_metrics_output_file_path:
+    description: 'Path to a file where metrics that we only want to observe are stored' 
   python:
     description: 'python version, e.g. 3.10.12'
     required: true
@@ -25,7 +27,7 @@ runs:
       VENV="${{ inputs.venv }}-${COMMIT:0:7}"
       source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
       SUCCESS=0
-      python3 -m neuralmagic.benchmarks.scripts.logging.gha_benchmark_logging -i ${{inputs.vllm_benchmark_jsons_path}} --bigger-is-better-output-file-path ${{ inputs.bigger_is_better_output_file_path }} --smaller-is-better-output-file-path ${{ inputs.smaller_is_better_output_file_path }} || SUCCESS=$?
+      python3 -m neuralmagic.benchmarks.scripts.logging.gha_benchmark_logging -i ${{inputs.vllm_benchmark_jsons_path}} --bigger-is-better-metrics-output-file-path ${{ inputs.bigger_is_better_output_file_path }} --smaller-is-better-metrics-output-file-path ${{ inputs.smaller_is_better_output_file_path }} --observation-metrics-output-file-path ${{ inputs.observation_metrics_output_file_path }} || SUCCESS=$?
       echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
       exit ${SUCCESS}
     shell: bash
diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml
@@ -145,6 +145,8 @@ jobs:
           bigger_is_better_output_file_path: gh-action-benchmark-jsons/bigger_is_better.json
           # Metrics that are "better" when the value is smaller are stored here
           smaller_is_better_output_file_path: gh-action-benchmark-jsons/smaller_is_better.json
+          # Metrics that we only want to observe are stored here
+          observation_metrics_output_file_path: gh-action-benchmark-jsons/observation_metrics.json
           python: ${{ inputs.python }}
           venv: TEST
 
@@ -189,23 +191,44 @@ jobs:
         run: ls -R ./downloads
 
       - name: nm-github-action-benchmark(bigger_is_better.json)
+        # Absence of the file indicates that there were no "bigger_is_better" metrics
+        if: ${{ hashFiles('downloads/bigger_is_better.json') != '' }}
         uses: ./.github/actions/nm-github-action-benchmark
-        if: success() || failure()
         with:
           gh_action_benchmark_name: "bigger_is_better"
           gh_action_benchmark_json_file_path:  "downloads/bigger_is_better.json"
           gh_action_benchmark_tool: "customBiggerIsBetter"
           gh_pages_branch: "nm-gh-pages"
           auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
+          reporting_enabled: "true"
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: nm-github-action-benchmark(smaller_is_better.json)
+        # Absence of the file indicates that there were no "smaller_is_better" metrics
+        if: ${{ hashFiles('downloads/smaller_is_better.json') != '' }}
         uses: ./.github/actions/nm-github-action-benchmark
-        if: success() || failure()
         with:
           gh_action_benchmark_name: "smaller_is_better"
           gh_action_benchmark_json_file_path:  "downloads/smaller_is_better.json"
           gh_action_benchmark_tool: "customSmallerIsBetter"
           gh_pages_branch: "nm-gh-pages"
           auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
+          reporting_enabled: "true"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: nm-github-action-benchmark(observation_metrics.json)
+        # Absence of the file indicates that there were no "observation" metrics
+        if: ${{ hashFiles('downloads/observation_metrics.json') != '' }}
+        uses: ./.github/actions/nm-github-action-benchmark
+        with:
+          gh_action_benchmark_name: "observation_metrics"
+          gh_action_benchmark_json_file_path:  "downloads/observation_metrics.json"
+          # `github-action-benchmark` expects a tool name that is either
+          # "customBiggerIsBetter" or "customSmallerIsBetter". This is a hack to
+          # work around that. Since we mark the action to not report failures, this
+          # is fine.
+          gh_action_benchmark_tool: "customBiggerIsBetter"
+          gh_pages_branch: "nm-gh-pages"
+          auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
+          reporting_enabled: "false"
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml
@@ -30,15 +30,15 @@ jobs:
         secrets: inherit
 
     # Benchmarks
-    #AWS-AVX2-32G-A10G-24G-Benchmark:
-    #    uses: ./.github/workflows/nm-benchmark.yml
-    #    with:
-    #        label: aws-avx2-32G-a10g-24G
-    #        benchmark_config_list_file:  ./.github/data/nm_benchmark_remote_push_configs_list.txt
-    #        timeout: 60
-    #        gitref: '${{ github.ref }}'
-    #        Gi_per_thread: 12
-    #        nvcc_threads: 1
-    #        python: "3.10.12"
-    #        push_benchmark_results_to_gh_pages: "false"
-    #    secrets: inherit
+    AWS-AVX2-32G-A10G-24G-Benchmark:
+        uses: ./.github/workflows/nm-benchmark.yml
+        with:
+            label: aws-avx2-32G-a10g-24G
+            benchmark_config_list_file:  ./.github/data/nm_benchmark_remote_push_configs_list.txt
+            timeout: 60
+            gitref: '${{ github.ref }}'
+            Gi_per_thread: 12
+            nvcc_threads: 1
+            python: "3.10.12"
+            push_benchmark_results_to_gh_pages: "false"
+        secrets: inherit
diff --git a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py
@@ -22,17 +22,21 @@
 BENCHMARK_RESULTS_SCHEMA_VERSION = "0.0.0"
 
 
-class GHABenchmarkToolName(str, Enum):
-    BiggerIsBetter = "CustomBiggerIsBetter"
-    SmallerIsBetter = "CustomSmallerIsBetter"
+class BenchmarkMetricType(str, Enum):
+    # Metrics that are "better" when the value is greater e.g. throughput.
+    BiggerIsBetter = "BiggerIsBetter"
+    # Metrics that are "better" when the value is smaller e.g. latency.
+    SmallerIsBetter = "SmallerIsBetter"
+    # Metrics that are too volatile and we primarily use for observation.
+    Observation = "Observation"
 
 
 @dataclass
 class MetricTemplate:
     key: str = field(default=None)
     unit: str = field(default=None)
     value: float = field(default=None)
-    tool: GHABenchmarkToolName = field(default=None)
+    type: BenchmarkMetricType = field(default=None)
 
     def from_dict(d: dict):
         template: MetricTemplate = MetricTemplate()
@@ -51,40 +55,39 @@ def from_dict(d: dict):
 
 BenchmarkServingResultMetricTemplates = SimpleNamespace(
     request_throughput=MetricTemplate("request_throughput", "prompts/s", None,
-                                      GHABenchmarkToolName.BiggerIsBetter),
+                                      BenchmarkMetricType.BiggerIsBetter),
     input_throughput=MetricTemplate("input_throughput", "tokens/s", None,
-                                    GHABenchmarkToolName.BiggerIsBetter),
+                                    BenchmarkMetricType.BiggerIsBetter),
     output_throughput=MetricTemplate("output_throughput", "tokens/s", None,
-                                     GHABenchmarkToolName.BiggerIsBetter),
-    median_request_latency=MetricTemplate(
-        "median_request_latency", "ms", None,
-        GHABenchmarkToolName.SmallerIsBetter),
+                                     BenchmarkMetricType.BiggerIsBetter),
+    median_request_latency=MetricTemplate("median_request_latency", "ms", None,
+                                          BenchmarkMetricType.SmallerIsBetter),
     p90_request_latency=MetricTemplate("p90_request_latency", "ms", None,
-                                       GHABenchmarkToolName.SmallerIsBetter),
+                                       BenchmarkMetricType.SmallerIsBetter),
     p99_request_latency=MetricTemplate("p99_request_latency", "ms", None,
-                                       GHABenchmarkToolName.SmallerIsBetter),
+                                       BenchmarkMetricType.SmallerIsBetter),
     mean_ttft_ms=MetricTemplate("mean_ttft_ms", "ms", None,
-                                GHABenchmarkToolName.SmallerIsBetter),
+                                BenchmarkMetricType.SmallerIsBetter),
     median_ttft_ms=MetricTemplate("median_ttft_ms", "ms", None,
-                                  GHABenchmarkToolName.SmallerIsBetter),
+                                  BenchmarkMetricType.SmallerIsBetter),
     p90_ttft_ms=MetricTemplate("p90_ttft_ms", "ms", None,
-                               GHABenchmarkToolName.SmallerIsBetter),
+                               BenchmarkMetricType.SmallerIsBetter),
     p99_ttft_ms=MetricTemplate("p99_ttft_ms", "ms", None,
-                               GHABenchmarkToolName.SmallerIsBetter),
+                               BenchmarkMetricType.SmallerIsBetter),
     mean_tpot_ms=MetricTemplate("mean_tpot_ms", "ms", None,
-                                GHABenchmarkToolName.SmallerIsBetter),
+                                BenchmarkMetricType.SmallerIsBetter),
     median_tpot_ms=MetricTemplate("median_tpot_ms", "ms", None,
-                                  GHABenchmarkToolName.SmallerIsBetter),
+                                  BenchmarkMetricType.SmallerIsBetter),
     p90_tpot_ms=MetricTemplate("p90_tpot_ms", "ms", None,
-                               GHABenchmarkToolName.SmallerIsBetter),
+                               BenchmarkMetricType.SmallerIsBetter),
     p99_tpot_ms=MetricTemplate("p99_tpot_ms", "ms", None,
-                               GHABenchmarkToolName.SmallerIsBetter))
+                               BenchmarkMetricType.SmallerIsBetter))
 
 BenchmarkThroughputResultMetricTemplates = SimpleNamespace(
     request_throughput=MetricTemplate("request_throughput", "prompts/s", None,
-                                      GHABenchmarkToolName.BiggerIsBetter),
+                                      BenchmarkMetricType.BiggerIsBetter),
     token_throughput=MetricTemplate("token_throughput", "tokens/s", None,
-                                    GHABenchmarkToolName.BiggerIsBetter))
+                                    BenchmarkMetricType.BiggerIsBetter))
 
 
 class BenchmarkResult:

diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py
@@ -10,7 +10,7 @@
 from dataclasses import dataclass
 from typing import List, Iterable, NamedTuple
 
-from .benchmark_result import (GHABenchmarkToolName, BenchmarkResult,
+from .benchmark_result import (BenchmarkMetricType, BenchmarkResult,
                                MetricTemplate)
 
 
@@ -79,12 +79,12 @@ def from_metric_template(metric_template: MetricTemplate, extra: dict):
                          extra=f"{json.dumps(extra, indent=2)}")
 
 
-class Tool_Record_T(NamedTuple):
-    tool: GHABenchmarkToolName
+class Type_Record_T(NamedTuple):
+    type: BenchmarkMetricType
     record: GHARecord
 
 
-def process(json_file_path: Path) -> Iterable[Tool_Record_T]:
+def process(json_file_path: Path) -> Iterable[Type_Record_T]:
 
     assert json_file_path.exists()
 
@@ -101,80 +101,101 @@ def process(json_file_path: Path) -> Iterable[Tool_Record_T]:
         lambda md: MetricTemplate.from_dict(md), metrics.values())
 
     return map(
-        lambda metric: Tool_Record_T(
-            metric.tool,
+        lambda metric: Type_Record_T(
+            metric.type,
             GHARecord.from_metric_template(metric, extra=hover_data)), metrics)
 
 
-def main(input_directory: Path, bigger_is_better_output_json_file_name: Path,
-         smaller_is_better_output_json_file_name: Path) -> None:
+def main(args: argparse.Namespace) -> None:
+    input_directory = Path(args.input_directory)
 
-    def dump_to_json(gha_records: List[GHARecord], output_path: Path):
+    json_file_paths = input_directory.glob('*.json')
+
+    type_records: List[Type_Record_T] = list(
+        reduce(lambda whole, part: whole + part,
+               (map(lambda json_file_path: list(process(json_file_path)),
+                    json_file_paths))))
+
+    def filter_and_dump_if_non_empty(type_records: List[Type_Record_T],
+                                     type: BenchmarkMetricType,
+                                     output_path: Path):
+        """
+        Given a list of type_record tuples, filter the records with the given
+        type.
+        If there are no records after we filter, don't dump json. Otherwise,
+        dump all records as JSON.
+        """
         # Make output directory if it doesn't exist
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
+        gha_records: List[GHARecord] = list(
+            map(
+                lambda type_record: type_record.record,
+                filter(lambda type_record: type_record.type == type,
+                       type_records)))
+
+        if len(gha_records) == 0:
+            return
+
         # Make data JSON serializable
         gha_record_dicts = list(map(lambda x: x.__dict__, gha_records))
         with open(output_path, 'w+') as f:
             json.dump(gha_record_dicts, f, indent=4)
 
-    json_file_paths = input_directory.glob('*.json')
-    tool_records: List[Tool_Record_T] = list(
-        reduce(lambda whole, part: whole + part,
-               (map(lambda json_file_path: list(process(json_file_path)),
-                    json_file_paths))))
-
-    bigger_is_better: List[GHARecord] = list(
-        map(
-            lambda tool_record: tool_record.record,
-            filter(
-                lambda tool_record: tool_record.tool == GHABenchmarkToolName.
-                BiggerIsBetter, tool_records)))
-
-    smaller_is_better: List[GHARecord] = list(
-        map(
-            lambda tool_record: tool_record.record,
-            filter(
-                lambda tool_record: tool_record.tool == GHABenchmarkToolName.
-                SmallerIsBetter, tool_records)))
-
-    dump_to_json(bigger_is_better, bigger_is_better_output_json_file_name)
-    dump_to_json(smaller_is_better, smaller_is_better_output_json_file_name)
+    filter_and_dump_if_non_empty(
+        type_records, BenchmarkMetricType.BiggerIsBetter,
+        Path(args.bigger_is_better_metrics_output_file_path))
+    filter_and_dump_if_non_empty(
+        type_records, BenchmarkMetricType.SmallerIsBetter,
+        Path(args.smaller_is_better_metrics_output_file_path))
+    filter_and_dump_if_non_empty(
+        type_records, BenchmarkMetricType.Observation,
+        Path(args.observation_metrics_output_file_path))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="""
         Process the benchmark JSONs produced by BenchmarkResult and output JSONs
-        that could be consumed by `github-action-benchmark`
+        that could be consumed by `github-action-benchmark`.
+        The JSONs are not produced if there are no metrics to report for some
+        BenchmarkMetricType.
         Reference : https://github.com/benchmark-action/github-action-benchmark
         """)
 
     parser.add_argument(
         "-i",
-        "--input-json-directory",
+        "--input-directory",
         required=True,
         type=str,
         help="""Path to the directory containing BenchmarkResult 
                 jsons. This is typically the output directory passed 
                 to the benchmark runner scripts like 
                 neuralmagic/benchmarks/run_benchmarks.py.""")
 
-    parser.add_argument(
-        "--bigger-is-better-output-file-path",
-        type=str,
-        required=True,
-        help="""An output file path, where the GHABenchmarkToolName 
-                BiggerIsBetter metrics are to be stored.""")
+    parser.add_argument("--bigger-is-better-metrics-output-file-path",
+                        required=True,
+                        type=str,
+                        help="""
+        An output file path, where the BenchmarkMetricType
+        BiggerIsBetter metrics are stored.
+        """)
 
-    parser.add_argument(
-        "--smaller-is-better-output-file-path",
-        type=str,
-        required=True,
-        help="""An output file path, where the GHABenchmarkToolName 
-                SmallerIsBetter metrics are to be stored""")
+    parser.add_argument("--smaller-is-better-metrics-output-file-path",
+                        required=True,
+                        type=str,
+                        help="""
+        An output file path, where the BenchmarkMetricType
+        SmallerIsBetter metrics are stored.
+        """)
+
+    parser.add_argument("--observation-metrics-output-file-path",
+                        required=True,
+                        type=str,
+                        help="""
+        An output file path, where the BenchmarkMetricType
+        Observation metrics are stored.
+        """)
 
     args = parser.parse_args()
 
-    main(Path(args.input_json_directory),
-         Path(args.bigger_is_better_output_file_path),
-         Path(args.smaller_is_better_output_file_path))
+    main(args)
Benchmark suite	Current: `c1d3e2a`	Previous: `b48fdbb`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`3.985138920020423` prompts/s	`3.9056263783381566` prompts/s	`0.98`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`1530.2933452878424` tokens/s	`1499.760529281852` tokens/s	`0.98`