Skip to content

Commit eb9a702

Browse files
authored
Add support for ClickBench in bench.sh (#7005)
* Add support for ClickBench in bench.sh * Update benchmarks/bench.sh
1 parent 5907c21 commit eb9a702

File tree

1 file changed

+84
-8
lines changed

1 file changed

+84
-8
lines changed

benchmarks/bench.sh

Lines changed: 84 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818

1919
# This script is meant for developers of DataFusion -- it is runnable
2020
# from the standard DataFusion development environment and uses cargo,
21-
# etc.
21+
# etc., and orchestrates gathering data and running the benchmark binary in
22+
# different configurations.
23+
2224

2325
# Exit on error
2426
set -e
@@ -64,12 +66,14 @@ compare: Comares results from benchmark runs
6466
* Benchmarks
6567
**********
6668
all(default): Data/Run/Compare for all benchmarks
67-
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
68-
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
69-
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
70-
tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
71-
parquet: Benchmark of parquet reader's filtering speed
72-
sort: Benchmark of sorting speed
69+
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
70+
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
71+
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
72+
tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
73+
parquet: Benchmark of parquet reader's filtering speed
74+
sort: Benchmark of sorting speed
75+
clickbench_1: ClickBench queries against a single parquet file
76+
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
7377
7478
**********
7579
* Supported Configuration (Environment Variables)
@@ -118,7 +122,7 @@ main() {
118122
data)
119123
BENCHMARK=${ARG2:-"${BENCHMARK}"}
120124
echo "***************************"
121-
echo "DataFusion Benchmark Data Generation"
125+
echo "DataFusion Benchmark Runner and Data Generator"
122126
echo "COMMAND: ${COMMAND}"
123127
echo "BENCHMARK: ${BENCHMARK}"
124128
echo "DATA_DIR: ${DATA_DIR}"
@@ -128,6 +132,8 @@ main() {
128132
all)
129133
data_tpch "1"
130134
data_tpch "10"
135+
data_clickbench_1
136+
data_clickbench_partitioned
131137
;;
132138
tpch)
133139
data_tpch "1"
@@ -143,6 +149,12 @@ main() {
143149
# same data as for tpch10
144150
data_tpch "10"
145151
;;
152+
clickbench_1)
153+
data_clickbench_1
154+
;;
155+
clickbench_partitioned)
156+
data_clickbench_partitioned
157+
;;
146158
*)
147159
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
148160
usage
@@ -178,6 +190,8 @@ main() {
178190
run_tpch_mem "10"
179191
run_parquet
180192
run_sort
193+
run_clickbench_1
194+
run_clickbench_partitioned
181195
;;
182196
tpch)
183197
run_tpch "1"
@@ -197,6 +211,12 @@ main() {
197211
sort)
198212
run_sort
199213
;;
214+
clickbench_1)
215+
run_clickbench_1
216+
;;
217+
clickbench_partitioned)
218+
run_clickbench_partitioned
219+
;;
200220
*)
201221
echo "Error: unknown benchmark '$BENCHMARK' for run"
202222
usage
@@ -318,6 +338,62 @@ run_sort() {
318338
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
319339
}
320340

341+
342+
# Downloads the single-file ClickBench dataset (hits.parquet) as described at
# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
#
# Globals:
#   DATA_DIR (read) - directory that receives the download; created if missing
# Outputs:
#   progress messages on stdout
#
# Creates data in $DATA_DIR/hits.parquet
data_clickbench_1() {
    local expected_size="14779976446"
    local url="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
    local output_size

    # Make sure the target directory exists before entering it; otherwise
    # pushd fails (and aborts the script under `set -e`) on a fresh checkout.
    mkdir -p "${DATA_DIR}"
    pushd "${DATA_DIR}" > /dev/null

    # Avoid downloading if it already exists and is the right size.
    # A missing file simply leaves output_size empty, which forces a download.
    output_size=$(wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true)
    echo -n "Checking hits.parquet..."
    if test "${output_size}" = "${expected_size}"; then
        echo -n "... found ${output_size} bytes ..."
    else
        echo -n "... downloading ${url} (14GB) ... "
        # --continue resumes a partially downloaded file instead of restarting
        wget --continue "${url}"
    fi
    echo " Done"
    popd > /dev/null
}
362+
363+
# Downloads the 100-file partitioned ClickBench dataset as described at
# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
#
# Globals:
#   DATA_DIR (read) - parent directory for the download
#   MAX_CONCURRENT_DOWNLOADS (read, optional) - number of parallel wget
#       workers; defaults to 10 when unset or empty
# Outputs:
#   progress messages on stdout
#
# Creates data in $DATA_DIR/hits_partitioned
data_clickbench_partitioned() {
    local max_downloads="${MAX_CONCURRENT_DOWNLOADS:-10}"
    local expected_size="14737666736"
    local output_size

    mkdir -p "${DATA_DIR}/hits_partitioned"
    pushd "${DATA_DIR}/hits_partitioned" > /dev/null

    echo -n "Checking hits_partitioned..."
    # With several files present, the last line printed by `wc -c *` is the
    # grand total; an empty directory leaves output_size empty.
    output_size=$(wc -c * 2>/dev/null | tail -n 1 | awk '{print $1}' || true)
    if test "${output_size}" = "${expected_size}"; then
        echo -n "... found ${output_size} bytes ..."
    else
        echo -n " downloading with ${max_downloads} parallel workers"
        # One wget per partition (hits_0 .. hits_99); each worker prints a dot
        # on success, and --continue resumes partially downloaded files.
        seq 0 99 | xargs -P"${max_downloads}" -I{} bash -c 'wget -q --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet && echo -n "."'
    fi

    echo " Done"
    popd > /dev/null
}
385+
386+
387+
# Placeholder for the ClickBench run against the single large parquet file:
# for now it only prints a notice that this benchmark is not yet supported.
run_clickbench_1() {
    printf '%s\n' "NOTICE: ClickBench (1 parquet file) is not yet supported"
}
391+
392+
# Runs the clickbench benchmark with the partitioned (100 files) parquet
# dataset. Currently a placeholder that only prints a notice.
run_clickbench_partitioned() {
    echo "NOTICE: ClickBench (partitioned, 100 parquet files) is not yet supported"
}
396+
321397
compare_benchmarks() {
322398
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
323399
BRANCH1="${ARG2}"

0 commit comments

Comments
 (0)