Skip to content

Commit ea59d05

Browse files
authored
Benchmark subcommand to distinguish between DataFusion and Ballista (#402)
* #401: Add subcommand to TPC-H benchmark args to distinguish between DataFusion and Ballista * fix benchmark subcommand name * Fix lint * fix benchmark tests using DatafusionBenchmarkOpts * Fix DataFusionBenchmarkOpts name and update doc
1 parent 68ad990 commit ea59d05

File tree

4 files changed

+66
-23
lines changed

4 files changed

+66
-23
lines changed

benchmarks/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,13 @@ to the `.gitignore` file.
4444
The benchmark can then be run (assuming the data created from `dbgen` is in `./data`) with a command such as:
4545

4646
```bash
47-
cargo run --release --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
47+
cargo run --release --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
4848
```
4949

5050
You can enable the `simd` feature (to use SIMD instructions) and/or either the `mimalloc` or `snmalloc` feature (to use the mimalloc or snmalloc allocator) by passing them to `--features`:
5151

5252
```
53-
cargo run --release --features "simd mimalloc" --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
53+
cargo run --release --features "simd mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096
5454
```
5555

5656
The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl`
@@ -123,7 +123,7 @@ To run the benchmarks:
123123

124124
```bash
125125
cd $ARROW_HOME/ballista/rust/benchmarks/tpch
126-
cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl
126+
cargo run --release benchmark ballista --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl
127127
```
128128

129129
## Running the Ballista Benchmarks on docker-compose
@@ -140,7 +140,7 @@ docker-compose up
140140
Then you can run the benchmark with:
141141

142142
```bash
143-
docker-compose run ballista-client cargo run benchmark --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl
143+
docker-compose run ballista-client cargo run benchmark ballista --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl
144144
```
145145

146146
## Expected output

benchmarks/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ set -e
2222
cd /
2323
for query in 1 3 5 6 10 12
2424
do
25-
/tpch benchmark --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug
25+
/tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug
2626
done

benchmarks/src/bin/tpch.rs

Lines changed: 60 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
5454
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
5555

5656
#[derive(Debug, StructOpt, Clone)]
57-
struct BenchmarkOpt {
57+
struct BallistaBenchmarkOpt {
5858
/// Query number
5959
#[structopt(short, long)]
6060
query: usize,
@@ -67,10 +67,6 @@ struct BenchmarkOpt {
6767
#[structopt(short = "i", long = "iterations", default_value = "3")]
6868
iterations: usize,
6969

70-
/// Number of threads to use for parallel execution
71-
#[structopt(short = "c", long = "concurrency", default_value = "2")]
72-
concurrency: usize,
73-
7470
/// Batch size when reading CSV or Parquet files
7571
#[structopt(short = "s", long = "batch-size", default_value = "8192")]
7672
batch_size: usize,
@@ -100,6 +96,45 @@ struct BenchmarkOpt {
10096
port: Option<u16>,
10197
}
10298

99+
#[derive(Debug, StructOpt, Clone)]
100+
struct DataFusionBenchmarkOpt {
101+
/// Query number
102+
#[structopt(short, long)]
103+
query: usize,
104+
105+
/// Activate debug mode to see query results
106+
#[structopt(short, long)]
107+
debug: bool,
108+
109+
/// Number of iterations of each test run
110+
#[structopt(short = "i", long = "iterations", default_value = "3")]
111+
iterations: usize,
112+
113+
/// Number of threads to use for parallel execution
114+
#[structopt(short = "c", long = "concurrency", default_value = "2")]
115+
concurrency: usize,
116+
117+
/// Batch size when reading CSV or Parquet files
118+
#[structopt(short = "s", long = "batch-size", default_value = "8192")]
119+
batch_size: usize,
120+
121+
/// Path to data files
122+
#[structopt(parse(from_os_str), required = true, short = "p", long = "path")]
123+
path: PathBuf,
124+
125+
/// File format: `tbl`, `csv` or `parquet`
126+
#[structopt(short = "f", long = "format", default_value = "csv")]
127+
file_format: String,
128+
129+
/// Load the data into a MemTable before executing the query
130+
#[structopt(short = "m", long = "mem-table")]
131+
mem_table: bool,
132+
133+
/// Number of partitions to create when using MemTable as input
134+
#[structopt(short = "n", long = "partitions", default_value = "8")]
135+
partitions: usize,
136+
}
137+
103138
#[derive(Debug, StructOpt)]
104139
struct ConvertOpt {
105140
/// Path to csv files
@@ -127,10 +162,19 @@ struct ConvertOpt {
127162
batch_size: usize,
128163
}
129164

165+
#[derive(Debug, StructOpt)]
166+
#[structopt(about = "benchmark command")]
167+
enum BenchmarkSubCommandOpt {
168+
#[structopt(name = "ballista")]
169+
BallistaBenchmark(BallistaBenchmarkOpt),
170+
#[structopt(name = "datafusion")]
171+
DataFusionBenchmark(DataFusionBenchmarkOpt),
172+
}
173+
130174
#[derive(Debug, StructOpt)]
131175
#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")]
132176
enum TpchOpt {
133-
Benchmark(BenchmarkOpt),
177+
Benchmark(BenchmarkSubCommandOpt),
134178
Convert(ConvertOpt),
135179
}
136180

@@ -140,20 +184,21 @@ const TABLES: &[&str] = &[
140184

141185
#[tokio::main]
142186
async fn main() -> Result<()> {
187+
use BenchmarkSubCommandOpt::*;
188+
143189
env_logger::init();
144190
match TpchOpt::from_args() {
145-
TpchOpt::Benchmark(opt) => {
146-
if opt.host.is_some() && opt.port.is_some() {
147-
benchmark_ballista(opt).await.map(|_| ())
148-
} else {
149-
benchmark_datafusion(opt).await.map(|_| ())
150-
}
191+
TpchOpt::Benchmark(BallistaBenchmark(opt)) => {
192+
benchmark_ballista(opt).await.map(|_| ())
193+
}
194+
TpchOpt::Benchmark(DataFusionBenchmark(opt)) => {
195+
benchmark_datafusion(opt).await.map(|_| ())
151196
}
152197
TpchOpt::Convert(opt) => convert_tbl(opt).await,
153198
}
154199
}
155200

156-
async fn benchmark_datafusion(opt: BenchmarkOpt) -> Result<Vec<RecordBatch>> {
201+
async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result<Vec<RecordBatch>> {
157202
println!("Running benchmarks with the following options: {:?}", opt);
158203
let config = ExecutionConfig::new()
159204
.with_concurrency(opt.concurrency)
@@ -204,7 +249,7 @@ async fn benchmark_datafusion(opt: BenchmarkOpt) -> Result<Vec<RecordBatch>> {
204249
Ok(result)
205250
}
206251

207-
async fn benchmark_ballista(opt: BenchmarkOpt) -> Result<()> {
252+
async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
208253
println!("Running benchmarks with the following options: {:?}", opt);
209254

210255
let mut settings = HashMap::new();
@@ -956,7 +1001,7 @@ mod tests {
9561001
let expected = df.collect().await?;
9571002

9581003
// run the query to compute actual results of the query
959-
let opt = BenchmarkOpt {
1004+
let opt = DataFusionBenchmarkOpt {
9601005
query: n,
9611006
debug: false,
9621007
iterations: 1,
@@ -966,8 +1011,6 @@ mod tests {
9661011
file_format: "tbl".to_string(),
9671012
mem_table: false,
9681013
partitions: 16,
969-
host: None,
970-
port: None,
9711014
};
9721015
let actual = benchmark_datafusion(opt).await?;
9731016

docs/user-guide/src/distributed/raspberrypi.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ Run the benchmarks:
115115

116116
```bash
117117
docker run -it myrepo/ballista-arm64 \
118-
/tpch benchmark --query=1 --path=/path/to/data --format=parquet \
118+
/tpch benchmark ballista --query=1 --path=/path/to/data --format=parquet \
118119
--iterations=1 --debug --host=ballista-scheduler --port=50050
120120
```
121121

0 commit comments

Comments
 (0)