Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,38 @@ on:
"id": "statpopgen",
"subcommand": "statpopgen",
"name": "Statistical and Population Genetics",
"local_dir": "bench-vortex/data/statpopgen",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
{
"id": "fineweb",
"subcommand": "fineweb",
"name": "FineWeb",
"name": "FineWeb NVMe",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact",
"scale_factor": "--scale-factor 100"
},
{
"id": "fineweb-s3",
"subcommand": "fineweb",
"name": "FineWeb S3",
"local_dir": "bench-vortex/data/fineweb",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact",
"scale_factor": "--scale-factor 100"
},
{
"id": "gharchive-nvme",
"subcommand": "gharchive",
"name": "GitHub Archive (NVMe)",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact",
"scale_factor": "--scale-factor 100"
},
{
"id": "gharchive-s3",
"subcommand": "gharchive",
"name": "GitHub Archive (S3)",
"local_dir": "bench-vortex/data/gharchive",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/",
"targets": "duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,datafusion:parquet,datafusion:vortex,datafusion:vortex-compact",
"scale_factor": "--scale-factor 100"
},
Expand Down
58 changes: 58 additions & 0 deletions bench-vortex/src/bin/query_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use bench_vortex::benchmark_driver::{DriverConfig, run_benchmark};
use bench_vortex::clickbench::{ClickBenchBenchmark, Flavor};
use bench_vortex::display::DisplayFormat;
use bench_vortex::fineweb::Fineweb;
use bench_vortex::realnest::gharchive::GithubArchive;
use bench_vortex::statpopgen::StatPopGenBenchmark;
use bench_vortex::tpcds::TpcDsBenchmark;
use bench_vortex::tpch::tpch_benchmark::TpcHBenchmark;
Expand Down Expand Up @@ -41,6 +42,9 @@ enum Commands {

#[command(name = "fineweb")]
Fineweb(FinewebArgs),

#[command(name = "gharchive")]
GhArchive(GhArchiveArgs),
}

/// Common arguments shared across benchmarks
Expand Down Expand Up @@ -228,6 +232,28 @@ struct FinewebArgs {
scale_factor: u64,
}

// Command-line arguments for the `gharchive` subcommand.
//
// NOTE: plain `//` comments are used here on purpose; with clap's derive API,
// `///` doc comments on the struct or its fields become user-visible help text.
#[derive(Parser, Debug)]
struct GhArchiveArgs {
// Options shared by every benchmark subcommand, flattened into this command's
// own flag namespace.
#[command(flatten)]
common: CommonArgs,

// `--targets`: comma-separated list of targets, each parsed as a `Target`.
// When the flag is omitted, the six engine:format combinations listed below
// are used.
#[arg(long, value_delimiter = ',', value_parser = value_parser!(Target),
default_values = vec![
"duckdb:parquet",
"duckdb:vortex",
"duckdb:vortex-compact",
"datafusion:parquet",
"datafusion:vortex",
"datafusion:vortex-compact",
]
)]
targets: Vec<Target>,

// `--scale-factor`: placeholder only. `run_gharchive` never reads this value;
// the flag exists solely because the CI automation passes a scale factor and
// the parser must accept it.
// NOTE(review): no default is declared, so the flag appears to be mandatory
// when invoking this subcommand by hand — confirm that is intended.
#[arg(long)]
scale_factor: u64,
}

fn validate_scale_factor(val: &str) -> Result<String, String> {
match val.parse::<f32>() {
Ok(n) if [0.01, 0.1, 1., 10., 100., 1000.].contains(&n) => {
Expand Down Expand Up @@ -257,6 +283,7 @@ fn main() -> anyhow::Result<()> {
Commands::TpcDS(tpcds_args) => run_tpcds(tpcds_args),
Commands::StatPopGen(stat_pop_gen_args) => run_statpopgen(stat_pop_gen_args),
Commands::Fineweb(fineweb_args) => run_fineweb(fineweb_args),
Commands::GhArchive(gh_archive_args) => run_gharchive(gh_archive_args),
}
}

Expand Down Expand Up @@ -425,3 +452,34 @@ fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {

run_benchmark(benchmark, config)
}

fn run_gharchive(args: GhArchiveArgs) -> anyhow::Result<()> {
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;

let data_url = Url::from_directory_path("gharchive".to_data_path())
.map_err(|_| anyhow::anyhow!("bad data path"))?;

let benchmark = GithubArchive::new(data_url);

let config = DriverConfig {
targets: args.targets,
iterations: args.common.iterations,
threads: args.common.threads,
display_format: args.common.display_format,
disable_datafusion_cache: args.common.disable_datafusion_cache,
delete_duckdb_database: args.common.delete_duckdb_database,
queries: args.common.queries,
exclude_queries: args.common.exclude_queries,
output_path: args.common.output_path,
emit_plan: args.common.emit_plan,
export_spans: args.common.export_spans,
show_metrics: args.common.show_metrics,
hide_progress_bar: args.common.hide_progress_bar,
track_memory: args.common.track_memory,
skip_generate: args.common.skip_generate,
explain: args.common.explain,
explain_analyze: args.common.explain_analyze,
};

run_benchmark(benchmark, config)
}
4 changes: 3 additions & 1 deletion bench-vortex/src/datasets/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ pub async fn register_vortex_files(
match dataset {
BenchmarkDataset::TpcH { .. }
| BenchmarkDataset::TpcDS { .. }
| BenchmarkDataset::Fineweb => {
| BenchmarkDataset::Fineweb
| BenchmarkDataset::GhArchive => {
info!(
"Registering table from {}, with glob {:?}",
&file_url,
Expand Down Expand Up @@ -161,6 +162,7 @@ pub async fn register_vortex_compact_files(
BenchmarkDataset::PublicBi { .. } => todo!(),
BenchmarkDataset::StatPopGen { .. } => todo!(),
BenchmarkDataset::Fineweb => todo!(),
BenchmarkDataset::GhArchive => todo!(),
}

Ok(())
Expand Down
9 changes: 9 additions & 0 deletions bench-vortex/src/datasets/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use vortex::ArrayRef;
use crate::clickbench::Flavor;
#[cfg(feature = "lance")]
use crate::file::register_lance_files;
use crate::realnest::gharchive;
use crate::{Format, clickbench, fineweb, statpopgen};

pub mod data_downloads;
Expand Down Expand Up @@ -42,6 +43,8 @@ pub enum BenchmarkDataset {
StatPopGen { n_rows: u64 },
#[serde(rename = "fineweb")]
Fineweb,
#[serde(rename = "gharchive")]
GhArchive,
}

impl BenchmarkDataset {
Expand All @@ -53,6 +56,7 @@ impl BenchmarkDataset {
BenchmarkDataset::PublicBi { .. } => "public-bi",
BenchmarkDataset::StatPopGen { .. } => "statpopgen",
BenchmarkDataset::Fineweb => "fineweb",
BenchmarkDataset::GhArchive => "gharchive",
}
}
}
Expand All @@ -69,6 +73,7 @@ impl Display for BenchmarkDataset {
BenchmarkDataset::PublicBi { name } => write!(f, "public-bi({name})"),
BenchmarkDataset::StatPopGen { n_rows } => write!(f, "statpopgen(n_rows={n_rows})"),
BenchmarkDataset::Fineweb => write!(f, "fineweb"),
BenchmarkDataset::GhArchive => write!(f, "gharchive"),
}
}
}
Expand Down Expand Up @@ -109,6 +114,7 @@ impl BenchmarkDataset {
BenchmarkDataset::ClickBench { .. } | BenchmarkDataset::PublicBi { .. } => todo!(),
BenchmarkDataset::StatPopGen { .. } => &["statpopgen"],
BenchmarkDataset::Fineweb => &["fineweb"],
BenchmarkDataset::GhArchive => &["events"],
}
}

Expand Down Expand Up @@ -167,6 +173,9 @@ impl BenchmarkDataset {
(BenchmarkDataset::Fineweb, format) => {
fineweb::register_table(session, base_url, format).await?
}
(BenchmarkDataset::GhArchive, format) => {
gharchive::register_table(session, base_url, format).await?
}
}

Ok(())
Expand Down
8 changes: 8 additions & 0 deletions bench-vortex/src/engines/ddb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ impl DuckDBCtx {
format!("statpopgen/{n_rows}/{}", format.name()).to_data_path()
}
BenchmarkDataset::Fineweb => format!("fineweb/{}", format.name()).to_data_path(),
BenchmarkDataset::GhArchive => format!("gharchive/{}", format.name()).to_data_path(),
};
std::fs::create_dir_all(&dir)?;
let db_path = dir.join("duckdb.db");
Expand Down Expand Up @@ -259,6 +260,13 @@ impl DuckDBCtx {
duckdb_object.to_str(),
)
}
BenchmarkDataset::GhArchive => {
let path = format!("{base_dir}*.{extension}");
format!(
"CREATE {} IF NOT EXISTS events AS SELECT * FROM read_{extension}('{path}');",
duckdb_object.to_str(),
)
}
}
}
}
1 change: 1 addition & 0 deletions bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub mod metrics;
pub mod public_bi;
pub mod query_bench;
pub mod random_access;
pub mod realnest;
pub mod statpopgen;
pub mod tpcds;
pub mod tpch;
Expand Down
Loading
Loading