
# This script is meant for developers of DataFusion -- it is runnable
# from the standard DataFusion development environment and uses cargo,
-# etc.
+# etc., and orchestrates gathering data and running the benchmark binary in
+# different configurations.
+

# Exit on error
set -e
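
For orientation, a sketch of the developer workflow this script orchestrates; the script name (./bench.sh) and the branch names are illustrative, while the data/run/compare commands, DATA_DIR, and the results directory come from the usage text and code below:

    # generate benchmark data (location can be overridden with DATA_DIR)
    ./bench.sh data tpch
    # run the benchmark, recording results under the script's results/ directory
    ./bench.sh run tpch
    # compare results recorded for two branches
    ./bench.sh compare main my-feature-branch
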
@@ -64,12 +66,14 @@ compare: Compares results from benchmark runs
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
-tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
-tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
-tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
-tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
-parquet: Benchmark of parquet reader's filtering speed
-sort: Benchmark of sorting speed
+tpch:                   TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
+tpch_mem:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
+tpch10_mem:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
+parquet:                Benchmark of parquet reader's filtering speed
+sort:                   Benchmark of sorting speed
+clickbench_1:           ClickBench queries against a single parquet file
+clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet dataset

**********
* Supported Configuration (Environment Variables)
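
Assuming the same invocation pattern, the new ClickBench entries would be driven like the existing benchmarks (script name and DATA_DIR value are again illustrative):

    # download hits.parquet (~14 GB) into $DATA_DIR
    DATA_DIR=./data ./bench.sh data clickbench_1
    # download the 100 partitioned files into $DATA_DIR/hits_partitioned
    DATA_DIR=./data ./bench.sh data clickbench_partitioned
    # run is wired up below, but for now only prints a "not yet supported" notice
    DATA_DIR=./data ./bench.sh run clickbench_partitioned
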
@@ -118,7 +122,7 @@ main() {
        data)
            BENCHMARK=${ARG2:-"${BENCHMARK}"}
            echo "***************************"
-            echo "DataFusion Benchmark Data Generation"
+            echo "DataFusion Benchmark Runner and Data Generator"
            echo "COMMAND: ${COMMAND}"
            echo "BENCHMARK: ${BENCHMARK}"
            echo "DATA_DIR: ${DATA_DIR}"
@@ -128,6 +132,8 @@ main() {
                all)
                    data_tpch "1"
                    data_tpch "10"
+                    data_clickbench_1
+                    data_clickbench_partitioned
                    ;;
                tpch)
                    data_tpch "1"
@@ -143,6 +149,12 @@ main() {
                    # same data as for tpch10
                    data_tpch "10"
                    ;;
+                clickbench_1)
+                    data_clickbench_1
+                    ;;
+                clickbench_partitioned)
+                    data_clickbench_partitioned
+                    ;;
                *)
                    echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                    usage
@@ -178,6 +190,8 @@ main() {
                    run_tpch_mem "10"
                    run_parquet
                    run_sort
+                    run_clickbench_1
+                    run_clickbench_partitioned
                    ;;
                tpch)
                    run_tpch "1"
@@ -197,6 +211,12 @@ main() {
                sort)
                    run_sort
                    ;;
+                clickbench_1)
+                    run_clickbench_1
+                    ;;
+                clickbench_partitioned)
+                    run_clickbench_partitioned
+                    ;;
                *)
                    echo "Error: unknown benchmark '$BENCHMARK' for run"
                    usage
@@ -318,6 +338,62 @@ run_sort() {
    $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
}

+
+# Downloads the single file hits.parquet ClickBench dataset from
+# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
+#
+# Creates data in $DATA_DIR/hits.parquet
+data_clickbench_1 () {
+    pushd "${DATA_DIR}" > /dev/null
+
+    # Avoid downloading if the file already exists and is the right size
+    OUTPUT_SIZE=`wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true`
+    echo -n "Checking hits.parquet..."
+    if test "${OUTPUT_SIZE}" = "14779976446" ; then
+        echo -n "... found ${OUTPUT_SIZE} bytes ..."
+    else
+        URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet"
+        echo -n "... downloading ${URL} (14GB) ... "
+        wget --continue ${URL}
+    fi
+    echo "Done"
+    popd > /dev/null
+}
+
+# Downloads the 100 file partitioned ClickBench dataset from
+# https://github.com/ClickHouse/ClickBench/tree/main#data-loading
+#
+# Creates data in $DATA_DIR/hits_partitioned
+data_clickbench_partitioned () {
+    MAX_CONCURRENT_DOWNLOADS=10
+
+    mkdir -p "${DATA_DIR}/hits_partitioned"
+    pushd "${DATA_DIR}/hits_partitioned" > /dev/null
+
+    echo -n "Checking hits_partitioned..."
+    OUTPUT_SIZE=`wc -c * 2>/dev/null | tail -n 1 | awk '{print $1}' || true`
+    if test "${OUTPUT_SIZE}" = "14737666736" ; then
+        echo -n "... found ${OUTPUT_SIZE} bytes ..."
+    else
+        echo -n "downloading with ${MAX_CONCURRENT_DOWNLOADS} parallel workers"
+        seq 0 99 | xargs -P${MAX_CONCURRENT_DOWNLOADS} -I{} bash -c 'wget -q --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet && echo -n "."'
+    fi
+
+    echo "Done"
+    popd > /dev/null
+}
+
+
+# Runs the clickbench benchmark with a single large parquet file
+run_clickbench_1 () {
+    echo "NOTICE: ClickBench (1 parquet file) is not yet supported"
+}
+
+# Runs the clickbench benchmark with the partitioned (100 files) parquet dataset
+run_clickbench_partitioned () {
+    echo "NOTICE: ClickBench (partitioned, 100 files) is not yet supported"
+}
+
compare_benchmarks () {
    BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
    BRANCH1="${ARG2}"
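
After the data steps complete, one hedged way to sanity-check the downloads is to query them directly; this assumes datafusion-cli is installed and that its -c flag accepts a SQL command string, and uses the default paths from above:

    # count rows in the single-file dataset
    datafusion-cli -c "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION 'data/hits.parquet'; SELECT COUNT(*) FROM hits;"
    # the partitioned copy can be registered the same way by pointing LOCATION at 'data/hits_partitioned/'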