apache · satwikmishra11 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025
diff --git a/.github/workflows/notify.yml b/.github/workflows/notify.yml
@@ -0,0 +1,68 @@
+# Posts a PR comment when the "Run Benchmarks" workflow reports slower queries.
+name: Alert on Regression
+on:
+  workflow_run:
+    workflows: ["Run Benchmarks"]
+    types:
+      - completed
+
+# Least privilege: read the triggering run's artifacts, write the PR comment.
+permissions:
+  actions: read
+  contents: read
+  pull-requests: write
+
+jobs:
+  notify:
+    runs-on: ubuntu-latest
+    # Failed or cancelled benchmark runs leave no trustworthy results to compare.
+    if: github.event.workflow_run.conclusion == 'success'
+    steps:
+      # scripts/process_results.py is read from the repository checkout.
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # The artifact belongs to the triggering run, so its run id and a
+      # token must be passed explicitly.
+      - name: Download Results
+        uses: actions/download-artifact@v4
+        with:
+          name: benchmark-results
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check for Regressions
+        run: |
+          # NOTE(review): assumes process_results.py prints the regressed query
+          # names on stdout; a child process cannot export variables to this shell.
+          REGRESSED_QUERIES="$(python scripts/process_results.py \
+            --current current-results.json \
+            --baseline main-results.json)"
+
+          if [ -n "$REGRESSED_QUERIES" ]; then
+            {
+              echo "PERF_REGRESSION=true"
+              # Delimiter form keeps a multi-line query list intact.
+              echo "REGRESSED_QUERIES<<__REGRESSED_EOF__"
+              echo "$REGRESSED_QUERIES"
+              echo "__REGRESSED_EOF__"
+            } >> "$GITHUB_ENV"
+          fi
+
+      - name: Comment on PR
+        if: env.PERF_REGRESSION == 'true'
+        uses: actions/github-script@v6
+        with:
+          script: |
+            // The triggering run may have no associated pull request (the list can be empty).
+            const pr = context.payload.workflow_run.pull_requests[0];
+            if (!pr) {
+              core.warning('No pull request is associated with this run; skipping comment.');
+              return;
+            }
+            await github.rest.issues.createComment({
+              issue_number: pr.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: `⚠️ Performance regression detected in queries: ${process.env.REGRESSED_QUERIES}`
+            });
diff --git a/benchmarks/src/handlers.rs b/benchmarks/src/handlers.rs
@@ -0,0 +1,81 @@
+//! HTTP handlers for storing and querying benchmark results.
+//!
+//! NOTE(review): `DbPool` and the `benchmark_results` schema module are used
+//! below but not imported in this file — they must be brought into scope.
+
+use actix_web::{web, HttpResponse};
+use serde::{Deserialize, Serialize};
+use diesel::prelude::*;
+
+/// One timed benchmark query, as sent over the API and stored in the
+/// `benchmark_results` table.
+///
+/// NOTE(review): the diesel derives assume the table has exactly these three
+/// columns in this order — confirm against the schema.
+#[derive(Deserialize, Serialize, Queryable, Insertable)]
+#[table_name = "benchmark_results"]
+struct BenchmarkResult {
+    /// Name of the benchmark query.
+    query: String,
+    /// Duration of the query in milliseconds.
+    duration_ms: f64,
+    /// DataFusion version the result belongs to.
+    datafusion_version: String,
+}
+
+/// Query-string parameters of `GET /results`.
+#[derive(Deserialize)]
+struct ResultsQuery {
+    /// DataFusion version whose results are requested.
+    version: String,
+}
+
+/// `POST /results` — stores one benchmark result.
+///
+/// Responds `201 Created` on success, `503 Service Unavailable` when no
+/// database connection can be obtained, and `500 Internal Server Error` when
+/// the insert fails.
+async fn post_results(
+    result: web::Json<BenchmarkResult>,
+    pool: web::Data<DbPool>,
+) -> HttpResponse {
+    // An unavailable pool is an operational condition, not a bug: answer with
+    // an error status instead of panicking inside the request handler.
+    let conn = match pool.get() {
+        Ok(conn) => conn,
+        Err(_) => return HttpResponse::ServiceUnavailable().finish(),
+    };
+
+    match diesel::insert_into(benchmark_results::table)
+        .values(&result.0)
+        .execute(&conn)
+    {
+        Ok(_) => HttpResponse::Created().finish(),
+        Err(_) => HttpResponse::InternalServerError().finish(),
+    }
+}
+
+/// `GET /results?version=23.0.0` — lists the results recorded for one version.
+///
+/// Responds `200 OK` with a JSON array, `503 Service Unavailable` when no
+/// database connection can be obtained, and `500 Internal Server Error` when
+/// the query fails.
+async fn get_results(
+    version: web::Query<ResultsQuery>,
+    pool: web::Data<DbPool>,
+) -> HttpResponse {
+    let conn = match pool.get() {
+        Ok(conn) => conn,
+        Err(_) => return HttpResponse::ServiceUnavailable().finish(),
+    };
+
+    // `web::Query` deserializes the query string as key/value pairs, so the
+    // `version` key is read through a struct field rather than a bare `String`.
+    match benchmark_results::table
+        .filter(benchmark_results::datafusion_version.eq(&version.version))
+        .load::<BenchmarkResult>(&conn)
+    {
+        Ok(results) => HttpResponse::Ok().json(results),
+        Err(_) => HttpResponse::InternalServerError().finish(),
+    }
+}
diff --git a/benchmarks/src/main.rs b/benchmarks/src/main.rs
@@ -0,0 +1,88 @@
+use std::process::Command;
+use serde_json::Value;
+use std::fs::File;
+use std::io::prelude::*;
+use structopt::StructOpt;
+
+// Command-line options of the benchmark runner. (Plain `//` on purpose:
+// structopt turns `///` comments into help text.)
+#[derive(Debug, StructOpt)]
+struct Cli {
+    /// Benchmark suite (tpch, tpcds, custom)
+    #[structopt(short = "s", long = "suite")]
+    suite: String,
+
+    /// Scale factor (e.g., 100 for 100GB dataset)
+    // NOTE(review): parsed but not yet forwarded to the benchmark command.
+    #[structopt(short = "f", long = "scale-factor")]
+    scale_factor: u32,
+
+    /// Output file for results (JSON)
+    #[structopt(short = "o", long = "output")]
+    output: String,
+}
+
+/// Runs the selected suite through `datafusion-cli`, writes the parsed results
+/// to the `--output` file as pretty-printed JSON, and uploads that file to S3.
+///
+/// # Errors
+///
+/// Fails if either external command cannot be started or exits unsuccessfully,
+/// if the benchmark output cannot be parsed, or if the results file cannot be
+/// written.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let args = Cli::from_args();
+
+    // Run DataFusion benchmark
+    let output = Command::new("datafusion-cli")
+        .arg("-f")
+        .arg(format!("benchmarks/{}/queries", args.suite))
+        .output()?;
+
+    // A failed run must not be parsed and published as if it were a result.
+    if !output.status.success() {
+        return Err(format!(
+            "datafusion-cli failed ({}): {}",
+            output.status,
+            String::from_utf8_lossy(&output.stderr).trim()
+        )
+        .into());
+    }
+
+    let results = parse_benchmark_output(&output.stdout)?;
+
+    // Write results to JSON
+    let mut file = File::create(&args.output)?;
+    file.write_all(serde_json::to_string_pretty(&results)?.as_bytes())?;
+
+    // Upload to S3; `status()` only reports spawn errors, so the exit code
+    // has to be checked separately.
+    let upload = Command::new("aws")
+        .args(&["s3", "cp", args.output.as_str(), "s3://datafusion-benchmarks/"])
+        .status()?;
+    if !upload.success() {
+        return Err(format!("aws s3 cp failed ({})", upload).into());
+    }
+
+    Ok(())
+}
+
+/// Converts the raw stdout of `datafusion-cli` into a JSON value.
+///
+/// NOTE(review): the structured parser was omitted from the original change,
+/// which left this function without a return value. Until it is written, the
+/// non-empty output lines are preserved verbatim as a JSON array of strings.
+///
+/// # Errors
+///
+/// Returns an error if `output` is not valid UTF-8.
+fn parse_benchmark_output(output: &[u8]) -> Result<Value, Box<dyn std::error::Error>> {
+    let text = std::str::from_utf8(output)?;
+    let lines = text
+        .lines()
+        .map(str::trim_end)
+        .filter(|line| !line.is_empty())
+        .map(|line| Value::String(line.to_owned()))
+        .collect();
+    Ok(Value::Array(lines))
+}
diff --git a/ci/scripts/cleanup.sh b/ci/scripts/cleanup.sh
@@ -0,0 +1,10 @@
+
+#!/bin/bash
+# Tear down cloud resources after benchmarks
+TERRAFORM_DIR="$(pwd)/infra/aws"
+
+cd "$TERRAFORM_DIR" || exit 1
+terraform destroy -auto-approve
+
+# Clean up S3 artifacts
+aws s3 rm s3://datafusion-benchmarks/ --recursive
diff --git a/datafusion/core/default.toml b/datafusion/core/default.toml
@@ -0,0 +1,12 @@
+[aws]  # Cloud settings; NOTE(review): the consumer of this file is not visible here
+region = "us-west-2"
+instance_type = "c5.4xlarge"
+spot_price = 0.15  # presumably the maximum spot bid in USD per hour — TODO confirm
+
+[benchmarks]
+tpch_scale = 100  # presumably the TPC-H scale factor (100 ~ 100 GB) — TODO confirm
+runs_per_query = 5
+timeout_minutes = 60
+
+[github]
+token = "${GITHUB_TOKEN}"  # Placeholder: TOML does not expand environment variables, so the loader must substitute GITHUB_TOKEN itself — confirm it does
diff --git a/datafusion/core/tests/test_handlers.rs b/datafusion/core/tests/test_handlers.rs
@@ -0,0 +1,23 @@
+// NOTE(review): `post_results`, `BenchmarkResult` and `setup_test_db` are not
+// defined in this file; `use super::*` resolves them only if this module is
+// compiled as a child of the module that declares them.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // `web` and `StatusCode` are referenced below, so import them explicitly.
+    use actix_web::{http::StatusCode, web};
+
+    /// Posting a well-formed result is answered with `201 Created`.
+    #[actix_rt::test]
+    async fn test_post_results() {
+        let pool = setup_test_db();
+        let result = BenchmarkResult {
+            query: "tpch_q1".to_string(),
+            duration_ms: 452.3,
+            datafusion_version: "23.0.0".to_string(),
+        };
+
+        let resp = post_results(web::Json(result), web::Data::new(pool)).await;
+        assert_eq!(resp.status(), StatusCode::CREATED);
+    }
+}