Skip to content

Commit aaf5245

Browse files
fix[ci]: use run_id in the s3 benchmark runs (#5421)
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent e38ae64 commit aaf5245

File tree

6 files changed

+99
-19
lines changed

6 files changed

+99
-19
lines changed

.github/workflows/bench.yml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ jobs:
124124
"subcommand": "tpch",
125125
"name": "TPC-H SF=1 on S3",
126126
"local_dir": "bench-vortex/data/tpch/1.0",
127-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/",
127+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
128128
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
129129
"scale_factor": "--scale-factor 1.0",
130130
"build_args": "--features lance"
@@ -142,7 +142,7 @@ jobs:
142142
"subcommand": "tpch",
143143
"name": "TPC-H SF=10 on S3",
144144
"local_dir": "bench-vortex/data/tpch/10.0",
145-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
145+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
146146
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
147147
"scale_factor": "--scale-factor 10.0",
148148
"build_args": "--features lance"
@@ -174,7 +174,7 @@ jobs:
174174
"subcommand": "fineweb",
175175
"name": "FineWeb S3",
176176
"local_dir": "bench-vortex/data/fineweb",
177-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/",
177+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
178178
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
179179
"scale_factor": "--scale-factor 100"
180180
},
@@ -190,7 +190,7 @@ jobs:
190190
"subcommand": "gharchive",
191191
"name": "GitHub Archive (S3)",
192192
"local_dir": "bench-vortex/data/gharchive",
193-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/",
193+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/",
194194
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
195195
"scale_factor": "--scale-factor 100"
196196
},

.github/workflows/nightly-bench.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ jobs:
4141
"subcommand": "tpch",
4242
"name": "TPC-H on S3",
4343
"local_dir": "bench-vortex/data/tpch/10.0",
44-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
44+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
4545
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex",
4646
"scale_factor": "--scale-factor 10.0",
4747
"build_args": "--features lance"
@@ -58,7 +58,7 @@ jobs:
5858
"subcommand": "tpch",
5959
"name": "TPC-H on S3",
6060
"local_dir": "bench-vortex/data/tpch/100.0",
61-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/100.0/",
61+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/100.0/",
6262
"targets": "datafusion:parquet,duckdb:parquet,duckdb:vortex",
6363
"scale_factor": "--scale-factor 100.0"
6464
},

.github/workflows/sql-benchmarks.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ on:
3535
"subcommand": "tpch",
3636
"name": "TPC-H SF=1 on S3",
3737
"local_dir": "bench-vortex/data/tpch/1.0",
38-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/",
38+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
3939
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
4040
"scale_factor": "--scale-factor 1.0"
4141
},
@@ -51,7 +51,7 @@ on:
5151
"subcommand": "tpch",
5252
"name": "TPC-H SF=10 on S3",
5353
"local_dir": "bench-vortex/data/tpch/10.0",
54-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
54+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
5555
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
5656
"scale_factor": "--scale-factor 10.0"
5757
},
@@ -81,7 +81,7 @@ on:
8181
"subcommand": "fineweb",
8282
"name": "FineWeb S3",
8383
"local_dir": "bench-vortex/data/fineweb",
84-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/",
84+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
8585
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
8686
"scale_factor": "--scale-factor 100"
8787
},
@@ -97,7 +97,7 @@ on:
9797
"subcommand": "gharchive",
9898
"name": "GitHub Archive (S3)",
9999
"local_dir": "bench-vortex/data/gharchive",
100-
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/",
100+
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/",
101101
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
102102
"scale_factor": "--scale-factor 100"
103103
},

bench-vortex/src/bin/query_bench.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -425,10 +425,7 @@ fn run_statpopgen(args: StatPopGenArgs) -> anyhow::Result<()> {
425425
fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {
426426
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;
427427

428-
let data_url = Url::from_directory_path("fineweb".to_data_path())
429-
.map_err(|_| anyhow::anyhow!("bad data path"))?;
430-
431-
let benchmark = Fineweb::new(data_url);
428+
let benchmark = Fineweb::with_remote_data_dir(args.common.use_remote_data_dir)?;
432429

433430
let config = DriverConfig {
434431
targets: args.targets,
@@ -456,10 +453,7 @@ fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {
456453
fn run_gharchive(args: GhArchiveArgs) -> anyhow::Result<()> {
457454
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;
458455

459-
let data_url = Url::from_directory_path("gharchive".to_data_path())
460-
.map_err(|_| anyhow::anyhow!("bad data path"))?;
461-
462-
let benchmark = GithubArchive::new(data_url);
456+
let benchmark = GithubArchive::with_remote_data_dir(args.common.use_remote_data_dir)?;
463457

464458
let config = DriverConfig {
465459
targets: args.targets,

bench-vortex/src/fineweb/mod.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,38 @@ impl Fineweb {
5858
pub fn new(data_url: Url) -> Self {
5959
Self { data_url }
6060
}
61+
62+
pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> {
63+
let data_url = Self::create_data_url(&use_remote_data_dir)?;
64+
Ok(Self { data_url })
65+
}
66+
67+
fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
68+
match remote_data_dir {
69+
None => {
70+
let data_dir = crate::IdempotentPath::to_data_path("fineweb");
71+
Url::from_directory_path(&data_dir).map_err(|_| {
72+
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
73+
})
74+
}
75+
Some(remote_data_dir) => {
76+
if !remote_data_dir.ends_with("/") {
77+
log::warn!(
78+
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/fineweb/"
79+
);
80+
}
81+
log::info!(
82+
concat!(
83+
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
84+
"If it does not, you should kill this command, locally generate the files (by running without\n",
85+
"--use-remote-data-dir) and upload data/fineweb/ to some remote location.",
86+
),
87+
remote_data_dir,
88+
);
89+
Ok(Url::parse(remote_data_dir)?)
90+
}
91+
}
92+
}
6193
}
6294

6395
impl Fineweb {
@@ -92,6 +124,17 @@ impl Benchmark for Fineweb {
92124
}
93125

94126
fn generate_data(&self, target: &Target) -> anyhow::Result<()> {
127+
// Skip generation if using remote storage
128+
match self.data_url.scheme() {
129+
"file" => {
130+
// Continue with local generation
131+
}
132+
_ => {
133+
// Remote storage - data should already be uploaded
134+
return Ok(());
135+
}
136+
}
137+
95138
// Before downloading anything, make sure we are using a supported target.
96139
anyhow::ensure!(
97140
matches!(

bench-vortex/src/realnest/gharchive.rs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,38 @@ impl GithubArchive {
4949
pub fn new(data_url: Url) -> Self {
5050
Self { data_url }
5151
}
52+
53+
pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> {
54+
let data_url = Self::create_data_url(&use_remote_data_dir)?;
55+
Ok(Self { data_url })
56+
}
57+
58+
fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
59+
match remote_data_dir {
60+
None => {
61+
let data_dir = crate::IdempotentPath::to_data_path("gharchive");
62+
Url::from_directory_path(&data_dir).map_err(|_| {
63+
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
64+
})
65+
}
66+
Some(remote_data_dir) => {
67+
if !remote_data_dir.ends_with("/") {
68+
log::warn!(
69+
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/gharchive/"
70+
);
71+
}
72+
log::info!(
73+
concat!(
74+
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
75+
"If it does not, you should kill this command, locally generate the files (by running without\n",
76+
"--use-remote-data-dir) and upload data/gharchive/ to some remote location.",
77+
),
78+
remote_data_dir,
79+
);
80+
Ok(Url::parse(remote_data_dir)?)
81+
}
82+
}
83+
}
5284
}
5385

5486
impl GithubArchive {
@@ -95,13 +127,24 @@ impl Benchmark for GithubArchive {
95127
}
96128

97129
fn generate_data(&self, target: &Target) -> anyhow::Result<()> {
130+
// Skip generation if using remote storage
131+
match self.data_url.scheme() {
132+
"file" => {
133+
// Continue with local generation
134+
}
135+
_ => {
136+
// Remote storage - data should already be uploaded
137+
return Ok(());
138+
}
139+
}
140+
98141
// Before downloading anything, make sure we are using a supported target.
99142
anyhow::ensure!(
100143
matches!(
101144
target.format,
102145
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact
103146
),
104-
"unsupported format for `fineweb` bench: {}",
147+
"unsupported format for `gharchive` bench: {}",
105148
target.format
106149
);
107150

0 commit comments

Comments
 (0)