Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ external_aggr: External aggregation benchmark on TPC-H dataset (SF=1)

# ClickBench Benchmarks
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_partitioned: ClickBench queries against partitioned (100 files) parquet
clickbench_pushdown: ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)

# H2O.ai Benchmarks (Group By, Join, Window)
Expand Down Expand Up @@ -207,6 +208,9 @@ main() {
clickbench_partitioned)
data_clickbench_partitioned
;;
clickbench_pushdown)
data_clickbench_partitioned # same data as clickbench_partitioned
;;
clickbench_extended)
data_clickbench_1
;;
Expand Down Expand Up @@ -303,6 +307,7 @@ main() {
run_cancellation
run_clickbench_1
run_clickbench_partitioned
run_clickbench_pushdown
run_clickbench_extended
run_h2o "SMALL" "PARQUET" "groupby"
run_h2o "MEDIUM" "PARQUET" "groupby"
Expand Down Expand Up @@ -340,6 +345,9 @@ main() {
clickbench_partitioned)
run_clickbench_partitioned
;;
clickbench_pushdown)
run_clickbench_pushdown
;;
clickbench_extended)
run_clickbench_extended
;;
Expand Down Expand Up @@ -572,14 +580,24 @@ run_clickbench_1() {
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the clickbench benchmark with the partitioned parquet files
# Runs the clickbench benchmark with the partitioned parquet dataset (100 files)
run_clickbench_partitioned() {
    # Results land next to the other benchmark outputs, one JSON file per run.
    RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (partitioned, 100 files) benchmark..."
    # Same runner as clickbench_1, pointed at the 100-file partitioned dataset.
    debug_run $CARGO_COMMAND --bin dfbench -- clickbench \
        --iterations 5 \
        --path "${DATA_DIR}/hits_partitioned" \
        --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" \
        -o "${RESULTS_FILE}" \
        ${QUERY_ARG}
}


# Runs the clickbench benchmark with the partitioned parquet files and filter_pushdown enabled
run_clickbench_pushdown() {
    # Same partitioned dataset as run_clickbench_partitioned; only the
    # --pushdown flag differs (enables pushdown_filters + reorder_filters).
    RESULTS_FILE="${RESULTS_DIR}/clickbench_pushdown.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (partitioned, 100 files) benchmark with pushdown_filters=true, reorder_filters=true..."
    debug_run $CARGO_COMMAND --bin dfbench -- clickbench \
        --pushdown \
        --iterations 5 \
        --path "${DATA_DIR}/hits_partitioned" \
        --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" \
        -o "${RESULTS_FILE}" \
        ${QUERY_ARG}
}


# Runs the clickbench "extended" benchmark with a single large parquet file
run_clickbench_extended() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
Expand Down
16 changes: 15 additions & 1 deletion benchmarks/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use datafusion_common::exec_datafusion_err;
use datafusion_common::instant::Instant;
use structopt::StructOpt;

/// Run the clickbench benchmark
/// Driver program to run the ClickBench benchmark
///
/// The ClickBench[1] benchmarks are widely cited in the industry and
/// focus on grouping / aggregation / filtering. This runner uses the
Expand All @@ -44,6 +44,14 @@ pub struct RunOpt {
#[structopt(short, long)]
query: Option<usize>,

/// If specified, enables Parquet Filter Pushdown.
///
/// Specifically, it enables:
/// * `pushdown_filters = true`
/// * `reorder_filters = true`
#[structopt(long = "pushdown")]
pushdown: bool,

/// Common options
#[structopt(flatten)]
common: CommonOpt,
Expand Down Expand Up @@ -122,6 +130,12 @@ impl RunOpt {
// The hits_partitioned dataset specifies string columns
// as binary due to how it was written. Force it to strings
parquet_options.binary_as_string = true;

// Turn on Parquet filter pushdown if requested
if self.pushdown {
parquet_options.pushdown_filters = true;
parquet_options.reorder_filters = true;
}
}

let rt_builder = self.common.runtime_env_builder()?;
Expand Down