CI: add benchmark workflow and script (#250)

singh1203 · web-flow · commit 9cbccd6fb87c · 2025-05-29T09:21:21.000+09:00
Fixes: #85 ### What changes are included in this PR? Added Benchmark CI using Debian as runtime environment ### Are these changes tested? Yes, I have tested the changes locally. --------- Signed-off-by: Saurabh Kumar Singh <singh1203.ss@gmail.com>
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Benchmarks
+on:
+  push:
+    branches: [main]  # pushes to main are the runs whose results get uploaded
+  pull_request:
+    paths:  # PRs trigger this workflow only when the benchmark tooling changes
+      - ".github/workflows/benchmark.yml"
+      - "ci/scripts/bench.sh"
+      - "ci/scripts/bench_adapt.py"
+  workflow_dispatch:  # allow manual runs
+permissions:
+  contents: read  # read-only access to the repository contents
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    container: debian:12
+    strategy:
+      matrix:
+        go: ['1.22.7']
+        arch: ['amd64']
+    steps:
+      - name: Install dependencies
+        run: |
+          apt-get update
+          apt-get install -y git ca-certificates
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive  # presumably provides parquet-testing/data for bench.sh
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+      - name: Install Go ${{ matrix.go }} for Benchmarks
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go }}
+          cache: true
+          cache-dependency-path: go.sum
+          check-latest: false
+      - name: Run Benchmarks
+        if: github.event_name != 'push'  # on push, bench_adapt.py runs bench.sh itself
+        run: bash ci/scripts/bench.sh "$(pwd)" -json  # bench.sh checks for `-json`
+      - name: Upload results
+        if: github.event_name == 'push' && github.repository == 'apache/arrow-go' && github.ref_name == 'main'
+        env:
+          CONBENCH_URL: https://conbench.ursa.dev
+          CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }}
+          CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }}
+          CONBENCH_REF: ${{ github.ref_name }}  # bench_adapt.py keys on this being "main"
+          CONBENCH_MACHINE_INFO_NAME: ${{ matrix.arch }}-debian-12
+        run: |
+          python3 -m pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python
+          python3 ci/scripts/bench_adapt.py
diff --git a/ci/scripts/bench.sh b/ci/scripts/bench.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Runs all Go benchmarks under the given source directory, printing them to
+# STDOUT. If `-json` (or `--json`) is passed as the second argument, it also
+# creates "bench_stats.json", a JSON representation of the results, in the
+# directory this is called from.
+
+# pipefail: without it `tee` would hide a failing `go test` from `set -e`
+set -exo pipefail
+
+# Validate input arguments
+if [ -z "$1" ]; then
+  echo "Error: Missing source directory argument" >&2
+  exit 1
+fi
+
+source_dir="$1"
+export PARQUET_TEST_DATA="${source_dir}/parquet-testing/data"
+
+pushd "${source_dir}"
+
+# lots of benchmarks, they can take a while
+# the timeout is for *ALL* benchmarks together,
+# not per benchmark
+go test -bench=. -benchmem -timeout 40m '-run=^$' ./... | tee bench_stat.dat
+
+popd
+
+if [[ "$2" = "-json" || "$2" = "--json" ]]; then
+  go install go.bobheadxi.dev/gobenchdata@latest
+  PATH=$(go env GOPATH)/bin:$PATH
+  export PATH
+  cat "${source_dir}"/bench_*.dat | gobenchdata --json bench_stats.json
+fi
+
+rm -f "${source_dir}"/bench_*.dat
diff --git a/ci/scripts/bench_adapt.py b/ci/scripts/bench_adapt.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import os
+import uuid
+import logging
+from pathlib import Path
+from typing import List
+
+from benchadapt import BenchmarkResult
+from benchadapt.adapters import BenchmarkAdapter
+from benchadapt.log import log
+
+log.setLevel(logging.DEBUG)
+
+# Repository root: this file lives in <root>/ci/scripts/.
+ARROW_ROOT = Path(__file__).parent.parent.parent.resolve()
+SCRIPTS_PATH = ARROW_ROOT / "ci" / "scripts"
+
+# `github_commit_info` is meant to communicate GitHub-flavored commit
+# information to Conbench. See
+# https://github.com/conbench/conbench/blob/cf7931f/benchadapt/python/benchadapt/result.py#L66
+# for a specification.
+github_commit_info = {"repository": "https://github.com/apache/arrow-go"}
+
+if os.environ.get("CONBENCH_REF") == "main":
+    # Assume GitHub Actions CI. The environment variable lookups below are
+    # expected to fail when not running in GitHub Actions.
+    github_commit_info = {
+        "repository": f'{os.environ["GITHUB_SERVER_URL"]}/{os.environ["GITHUB_REPOSITORY"]}',
+        "commit": os.environ["GITHUB_SHA"],
+        "pr_number": None,  # implying default branch
+    }
+    run_reason = "commit"
+else:
+    # Assume this is not GitHub Actions CI and error out if that seems wrong.
+    # An explicit raise (unlike `assert`) stays active under `python -O`.
+    if os.getenv("GITHUB_ACTIONS") is not None:
+        raise RuntimeError("GITHUB_ACTIONS is set but CONBENCH_REF is not 'main'")
+
+    # Probably a local dev environment, for testing: not a controlled CI
+    # environment, so "commit" and "pr_number" are deliberately left out.
+
+    # Reflect 'local dev' scenario in run_reason. Allow user to (optionally)
+    # inject a custom piece of information into the run reason here, from
+    # environment.
+    run_reason = "localdev"
+    custom_reason_suffix = os.getenv("CONBENCH_CUSTOM_RUN_REASON")
+    if custom_reason_suffix is not None:
+        run_reason += f" {custom_reason_suffix.strip()}"
+
+
+class GoAdapter(BenchmarkAdapter):
+    """Run ``bench.sh`` and convert its JSON output into Conbench results."""
+
+    result_file = "bench_stats.json"  # written by bench.sh when given -json
+    command = ["bash", SCRIPTS_PATH / "bench.sh", ARROW_ROOT, "-json"]
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, command=self.command, **kwargs)
+
+    def _transform_results(self) -> List[BenchmarkResult]:
+        """Return one ``BenchmarkResult`` per benchmark in ``result_file``."""
+        with open(self.result_file, "r", encoding="utf-8") as f:
+            raw_results = json.load(f)
+        run_id = uuid.uuid4().hex  # shared by every result
+        parsed_results = []
+        for suite in raw_results[0]["Suites"]:
+            batch_id = uuid.uuid4().hex  # one batch per suite
+            pkg = suite["Pkg"]
+            for benchmark in suite["Benchmarks"]:
+                data = benchmark["Mem"]["MBPerSec"] * 1e6  # MB/s -> B/s
+                rate = 1 / benchmark["NsPerOp"] * 1e9  # ns/op -> iterations/s
+                # Go omits the "-<GOMAXPROCS>" name suffix when GOMAXPROCS is 1.
+                name = benchmark["Name"].removeprefix("Benchmark")
+                base, sep, ncpu = name.rpartition("-")
+                if not (sep and ncpu.isdigit()):
+                    base, ncpu = name, "1"
+                pieces = base.split("/")
+                parsed = BenchmarkResult(
+                    run_id=run_id,
+                    batch_id=batch_id,
+                    stats={
+                        "data": [data],
+                        "unit": "B/s",
+                        "times": [rate],
+                        "time_unit": "i/s",
+                        "iterations": benchmark["Runs"],
+                    },
+                    context={
+                        "benchmark_language": "Go",
+                        "goos": suite["Goos"],
+                        "goarch": suite["Goarch"],
+                    },
+                    tags={
+                        "pkg": pkg,
+                        "num_cpu": ncpu,
+                        "name": pieces[0],
+                        "params": "/".join(pieces[1:]),
+                    },
+                    run_reason=run_reason,
+                    github=github_commit_info,
+                )
+                commit = github_commit_info.get("commit")
+                parsed.run_name = f"{parsed.run_reason}: {commit}"
+                parsed_results.append(parsed)
+        return parsed_results
+
+
+if __name__ == "__main__":
+    # Build the adapter with an empty "info" override and invoke it directly.
+    GoAdapter(result_fields_override={"info": {}})()
+