Skip to content

Commit 4ad4f90

Browse files
compheadandygrove
andauthored
Adding TPCH benchmarks for Sort Merge Join (#10092)
* Adding TPCH bencmarks for Sort Merge Join * Update benchmarks/bench.sh Co-authored-by: Andy Grove <andygrove73@gmail.com> * fix benches * fmt * comments --------- Co-authored-by: Andy Grove <andygrove73@gmail.com>
1 parent b54adb3 commit 4ad4f90

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

benchmarks/bench.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,12 @@ main() {
213213
tpch_mem10)
214214
run_tpch_mem "10"
215215
;;
216+
tpch_smj)
217+
run_tpch_smj "1"
218+
;;
219+
tpch_smj10)
220+
run_tpch_smj "10"
221+
;;
216222
parquet)
217223
run_parquet
218224
;;
@@ -320,6 +326,21 @@ run_tpch() {
320326
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
321327
}
322328

329+
# Runs the tpch benchmark with sort merge join
330+
run_tpch_smj() {
331+
SCALE_FACTOR=$1
332+
if [ -z "$SCALE_FACTOR" ] ; then
333+
echo "Internal error: Scale factor not specified"
334+
exit 1
335+
fi
336+
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
337+
338+
RESULTS_FILE="${RESULTS_DIR}/tpch_smj_sf${SCALE_FACTOR}.json"
339+
echo "RESULTS_FILE: ${RESULTS_FILE}"
340+
echo "Running tpch SMJ benchmark..."
341+
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join false --format parquet -o ${RESULTS_FILE}
342+
}
343+
323344
# Runs the tpch in memory
324345
run_tpch_mem() {
325346
SCALE_FACTOR=$1

benchmarks/src/tpch/run.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
4242
use log::info;
4343
use structopt::StructOpt;
4444

45+
// hack to avoid `default_value is meaningless for bool` errors
46+
type BoolDefaultTrue = bool;
47+
4548
/// Run the tpch benchmark.
4649
///
4750
/// This benchmarks is derived from the [TPC-H][1] version
@@ -81,6 +84,11 @@ pub struct RunOpt {
8184
/// Whether to disable collection of statistics (and cost based optimizations) or not.
8285
#[structopt(short = "S", long = "disable-statistics")]
8386
disable_statistics: bool,
87+
88+
/// If true then hash join used, if false then sort merge join
89+
/// True by default.
90+
#[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
91+
prefer_hash_join: BoolDefaultTrue,
8492
}
8593

8694
const TPCH_QUERY_START_ID: usize = 1;
@@ -107,10 +115,11 @@ impl RunOpt {
107115
}
108116

109117
async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
110-
let config = self
118+
let mut config = self
111119
.common
112120
.config()
113121
.with_collect_statistics(!self.disable_statistics);
122+
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
114123
let ctx = SessionContext::new_with_config(config);
115124

116125
// register tables
@@ -304,7 +313,7 @@ mod tests {
304313
use super::*;
305314

306315
use datafusion::common::exec_err;
307-
use datafusion::error::{DataFusionError, Result};
316+
use datafusion::error::Result;
308317
use datafusion_proto::bytes::{
309318
logical_plan_from_bytes, logical_plan_to_bytes, physical_plan_from_bytes,
310319
physical_plan_to_bytes,
@@ -339,6 +348,7 @@ mod tests {
339348
mem_table: false,
340349
output_path: None,
341350
disable_statistics: false,
351+
prefer_hash_join: true,
342352
};
343353
opt.register_tables(&ctx).await?;
344354
let queries = get_query_sql(query)?;
@@ -371,6 +381,7 @@ mod tests {
371381
mem_table: false,
372382
output_path: None,
373383
disable_statistics: false,
384+
prefer_hash_join: true,
374385
};
375386
opt.register_tables(&ctx).await?;
376387
let queries = get_query_sql(query)?;

0 commit comments

Comments
 (0)