feat: Support tpch and tpch10 benchmark for csv format #16373

Merged: 1 commit, Jun 12, 2025
33 changes: 28 additions & 5 deletions benchmarks/bench.sh
@@ -78,8 +78,10 @@ venv: Creates new venv (unless already exists) and installs compare's
**********
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
+tpch_csv: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single csv file per table, hash join
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
+tpch_csv10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
cancellation: How long cancelling a query takes
parquet: Benchmark of parquet reader's filtering speed
@@ -266,9 +268,11 @@ main() {
mkdir -p "${DATA_DIR}"
case "$BENCHMARK" in
all)
run_tpch "1"
run_tpch "1" "parquet"
run_tpch "1" "csv"
run_tpch_mem "1"
run_tpch "10"
run_tpch "10" "parquet"
run_tpch "10" "csv"
run_tpch_mem "10"
run_cancellation
run_parquet
@@ -286,13 +290,19 @@ main() {
run_external_aggr
;;
tpch)
run_tpch "1"
run_tpch "1" "parquet"
;;
+tpch_csv)
+run_tpch "1" "csv"
+;;
tpch_mem)
run_tpch_mem "1"
;;
tpch10)
run_tpch "10"
run_tpch "10" "parquet"
;;
+tpch_csv10)
+run_tpch "10" "csv"
+;;
tpch_mem10)
run_tpch_mem "10"
@@ -430,6 +440,17 @@ data_tpch() {
$CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
popd > /dev/null
fi

+# Create 'csv' files from tbl
+FILE="${TPCH_DIR}/csv/supplier"
+if test -d "${FILE}"; then
+echo " csv files exist ($FILE exists)."
+else
+echo " creating csv files using benchmark binary ..."
+pushd "${SCRIPT_DIR}" > /dev/null
+$CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv
+popd > /dev/null
+fi
}
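
For reference, the csv conversion that data_tpch() now performs can also be invoked by hand with the benchmark binary. This is only a sketch: the paths assume the default layout where bench.sh puts SF1 data under benchmarks/data/tpch_sf1, and it is not part of the patch.

# Manual equivalent of the csv step above, run from the benchmarks/ directory.
# Assumes the SF1 .tbl files have already been generated.
# (bench.sh uses $CARGO_COMMAND; plain cargo run --release is shown here for illustration.)
cargo run --release --bin tpch -- convert \
    --input  ./data/tpch_sf1 \
    --output ./data/tpch_sf1/csv \
    --format csv

Each table is expected to end up as its own directory under csv/ (e.g. csv/supplier/), which is what the test -d "${TPCH_DIR}/csv/supplier" existence check above relies on.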

# Runs the tpch benchmark
@@ -446,7 +467,9 @@ run_tpch() {
echo "Running tpch benchmark..."
# Optional query filter to run specific query
QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
-debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
+
+FORMAT=$2
+debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" $QUERY
}

# Runs the tpch in memory
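With tpch_csv and tpch_csv10 wired into main() above, an end-to-end run of the new targets would look roughly like the following. This is a sketch assuming the usual data/run subcommands of bench.sh and is not part of the diff itself.

# Generate SF1 data (tbl, parquet, and now csv), then benchmark both formats.
./bench.sh data tpch
./bench.sh run tpch         # parquet tables, as before
./bench.sh run tpch_csv     # new: same queries against the csv tables

# Scale Factor 10 equivalents
./bench.sh data tpch10
./bench.sh run tpch_csv10
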
2 changes: 1 addition & 1 deletion benchmarks/src/tpch/run.rs
@@ -274,7 +274,7 @@ impl RunOpt {
(Arc::new(format), path, ".tbl")
}
"csv" => {
let path = format!("{path}/{table}");
let path = format!("{path}/csv/{table}");
let format = CsvFormat::default()
.with_delimiter(b',')
.with_has_header(true);
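
On the read side, the only change is that the csv format branch now resolves each table under a csv/ subdirectory, matching the layout data_tpch() produces. A quick sanity check of that layout, assuming the default ./data/tpch_sf1 location relative to the benchmarks/ directory:

# The "csv" arm above builds the listing path as <path>/csv/<table>,
# so each table should be a directory of csv files next to the tbl and parquet data.
ls ./data/tpch_sf1/csv/supplier/
ls ./data/tpch_sf1/csv/lineitem/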