From e16af72a3eff4a13f3419e09faf49847debd7ddd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 6 May 2023 10:42:02 -0600 Subject: [PATCH] Fix db-benchmark (#369) --- .dockerignore | 12 ++ benchmarks/db-benchmark/README.md | 11 +- .../db-benchmark/db-benchmark.dockerfile | 177 ++++++++++-------- benchmarks/db-benchmark/groupby-datafusion.py | 2 +- benchmarks/db-benchmark/join-datafusion.py | 2 +- benchmarks/db-benchmark/run-bench.sh | 4 +- 6 files changed, 117 insertions(+), 91 deletions(-) create mode 100644 .dockerignore mode change 100644 => 100755 benchmarks/db-benchmark/run-bench.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..411e6029 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.cargo +.github +.pytest_cache +ci +conda +dev +docs +examples +parquet +target +testing +venv \ No newline at end of file diff --git a/benchmarks/db-benchmark/README.md b/benchmarks/db-benchmark/README.md index fe268199..93293b0d 100644 --- a/benchmarks/db-benchmark/README.md +++ b/benchmarks/db-benchmark/README.md @@ -17,15 +17,16 @@ under the License. --> -# Run db-benchmark +# DataFusion Implementation of db-benchmark -This directory contains scripts for running DataFusion with the https://github.com/h2oai/db-benchmark +This directory contains scripts for running [db-benchmark](https://github.com/duckdblabs/db-benchmark) with +DataFusion's Python bindings. ## Directions -Run the following from root `arrow-datafusion` directory +Run the following from the root of this project. ```bash -$ docker buildx build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile . -$ docker run --privileged db-benchmark +$ docker build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile . 
+$ docker run --privileged -it db-benchmark ``` diff --git a/benchmarks/db-benchmark/db-benchmark.dockerfile b/benchmarks/db-benchmark/db-benchmark.dockerfile index b21d3a0d..2876b5b6 100644 --- a/benchmarks/db-benchmark/db-benchmark.dockerfile +++ b/benchmarks/db-benchmark/db-benchmark.dockerfile @@ -15,92 +15,105 @@ # specific language governing permissions and limitations # under the License. -FROM ubuntu +FROM ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM -RUN apt-get update && \ - apt-get install -y git build-essential - -# Install R, curl, and python deps -RUN apt-get -y install --no-install-recommends --no-install-suggests \ - ca-certificates software-properties-common gnupg2 gnupg1 \ - && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \ - && add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' \ - && apt-get -y install r-base \ - && apt-get -y install curl \ - && apt-get -y install python3.8 \ - && apt-get -y install python3-pip - -# Install R libraries -RUN R -e "install.packages('data.table',dependencies=TRUE, repos='http://cran.rstudio.com/')" \ - && R -e "install.packages('dplyr',dependencies=TRUE, repos='http://cran.rstudio.com/')" - -# Install Rust -RUN curl https://sh.rustup.rs -sSf | bash -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - -# Clone db-benchmark and download data -RUN git clone https://github.com/h2oai/db-benchmark \ - && cd db-benchmark \ - && Rscript _data/groupby-datagen.R 1e7 1e2 0 0 \ - && Rscript _data/join-datagen.R 1e7 0 0 0 \ - && mkdir data \ - && mv G1_1e7_1e2_0_0.csv data \ - && mv J1_1e7_1e1_0_0.csv data \ - && mv J1_1e7_1e4_0_0.csv data \ - && mv J1_1e7_1e7_0_0.csv data \ - && mv J1_1e7_NA_0_0.csv data \ - && cd .. 
- -# Clone datafusion-python and build python library -# Not sure if the wheel will be the same on all computers -RUN git clone https://github.com/datafusion-contrib/datafusion-python \ - && cd datafusion-python && git reset --hard 368b50ed9662d5e93c70b539f94cceace685265e \ - && python3 -m pip install pip \ - && python3 -m pip install pandas \ - && python3 -m pip install -r requirements.txt \ - && cd .. - -# Copy local arrow-datafusion -COPY . arrow-datafusion - -# 1. datafusion-python that builds from datafusion version referenced datafusion-python -RUN cd datafusion-python \ - && maturin build --release \ - && case "${TARGETPLATFORM}" in \ - */amd64) CPUARCH=x86_64 ;; \ - */arm64) CPUARCH=aarch64 ;; \ - *) exit 1 ;; \ - esac \ - # Version will need to be updated in conjunction with datafusion-python version - && python3 -m pip install target/wheels/datafusion-0.4.0-cp36-abi3-linux_${CPUARCH}.whl \ - && cd .. - -# 2. datafusion-python that builds from local datafusion. use this when making local changes to datafusion. -# Currently, as of March 5th 2022, this done not build (i think) because datafusion is being split into multiple crates -# and datafusion-python has not yet been updated to reflect this. -# RUN cd datafusion-python \ -# && sed -i '/datafusion =/c\datafusion = { path = "../arrow-datafusion/datafusion", features = ["pyarrow"] }' Cargo.toml \ -# && sed -i '/fuzz-utils/d' ../arrow-datafusion/datafusion/Cargo.toml \ -# && maturin build --release \ -# && case "${TARGETPLATFORM}" in \ -# */amd64) CPUARCH=x86_64 ;; \ -# */amd64) CPUARCH=aarch64 ;; \ -# *) exit 1 ;; \ -# esac \ -# && python3 -m pip install target/wheels/datafusion-0.4.0-cp36-abi3-linux_${CPUARCH}.whl \ -# && cd .. 
- -# Make datafusion directory in db-benchmark -RUN mkdir db-benchmark/datafusion \ - && cp ../arrow-datafusion/benchmarks/db-benchmark/groupby-datafusion.py db-benchmark/datafusion \ - && cp ../arrow-datafusion/benchmarks/db-benchmark/join-datafusion.py db-benchmark/datafusion \ - && cp ../arrow-datafusion/benchmarks/db-benchmark/run-bench.sh db-benchmark/ \ - && chmod +x db-benchmark/run-bench.sh +# This section is based on https://github.com/duckdblabs/db-benchmark/blob/master/_utils/repro.sh + +RUN apt-get -qq update +RUN apt-get -qq -y upgrade +RUN apt-get -qq install -y apt-utils + +RUN apt-get -qq install -y lsb-release software-properties-common wget curl vim htop git byobu libcurl4-openssl-dev libssl-dev +RUN apt-get -qq install -y libfreetype6-dev +RUN apt-get -qq install -y libfribidi-dev +RUN apt-get -qq install -y libharfbuzz-dev +RUN apt-get -qq install -y git +RUN apt-get -qq install -y libxml2-dev +RUN apt-get -qq install -y make +RUN apt-get -qq install -y libfontconfig1-dev +RUN apt-get -qq install -y libicu-dev pandoc zlib1g-dev libgit2-dev libcurl4-openssl-dev libssl-dev libjpeg-dev libpng-dev libtiff-dev +# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" + +RUN apt-get -qq install -y r-base-dev virtualenv + +RUN cd /usr/local/lib/R && \ + chmod o+w site-library + +RUN cd / && \ + git clone https://github.com/duckdblabs/db-benchmark.git WORKDIR /db-benchmark -RUN ls && ls -al data/ +RUN mkdir -p .R && \ + echo 'CFLAGS=-O3 -mtune=native' >> .R/Makevars && \ + echo 'CXXFLAGS=-O3 -mtune=native' >> .R/Makevars + +RUN cd pydatatable && \ + virtualenv py-pydatatable --python=/usr/bin/python3.10 +RUN cd pandas && \ + virtualenv py-pandas --python=/usr/bin/python3.10 +RUN cd modin && \ + virtualenv py-modin --python=/usr/bin/python3.10 + +RUN Rscript -e 
'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependecies=TRUE, repos="https://cloud.r-project.org")' + +SHELL ["/bin/bash", "-c"] + +RUN source ./pandas/py-pandas/bin/activate && \ + python3 -m pip install --upgrade psutil && \ + python3 -m pip install --upgrade pandas && \ + deactivate + +RUN source ./modin/py-modin/bin/activate && \ + python3 -m pip install --upgrade modin && \ + deactivate + +RUN source ./pydatatable/py-pydatatable/bin/activate && \ + python3 -m pip install --upgrade git+https://github.com/h2oai/datatable && \ + deactivate + +## install dplyr +#RUN Rscript -e 'devtools::install_github(c("tidyverse/readr","tidyverse/dplyr"))' + +# install data.table +RUN Rscript -e 'install.packages("data.table", repos="https://rdatatable.gitlab.io/data.table/")' + +## generate data for groupby 0.5GB +RUN Rscript _data/groupby-datagen.R 1e7 1e2 0 0 +RUN #Rscript _data/groupby-datagen.R 1e8 1e2 0 0 +RUN #Rscript _data/groupby-datagen.R 1e9 1e2 0 0 + +RUN mkdir data && \ + mv G1_1e7_1e2_0_0.csv data/ + +# set only groupby task +RUN echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB" && \ + cp run.conf run.conf.original && \ + sed -i 's/groupby join groupby2014/groupby/g' run.conf && \ + sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf && \ + sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf + +## set sizes +RUN mv _control/data.csv _control/data.csv.original && \ + echo "task,data,nrow,k,na,sort,active" > _control/data.csv && \ + echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv + +RUN #./dplyr/setup-dplyr.sh +RUN #./datatable/setup-datatable.sh +RUN #./duckdb/setup-duckdb.sh + +# END OF SETUP + +RUN python3 -m pip install --upgrade pandas +RUN python3 -m pip install --upgrade datafusion + +# Now add our solution +RUN rm -rf datafusion-python 2>/dev/null && \ + mkdir datafusion-python +ADD benchmarks/db-benchmark/*.py 
datafusion-python/ +ADD benchmarks/db-benchmark/run-bench.sh . -ENTRYPOINT ./run-bench.sh \ No newline at end of file +ENTRYPOINT [ "/db-benchmark/run-bench.sh" ] \ No newline at end of file diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py index 7268cc87..76dd38fe 100644 --- a/benchmarks/db-benchmark/groupby-datafusion.py +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -58,7 +58,7 @@ def ans_shape(batches): ) print("dataset loaded") -ctx = df.ExecutionContext() +ctx = df.SessionContext() ctx.register_record_batches("x", [data.to_batches()]) print("registered record batches") # cols = ctx.sql("SHOW columns from x") diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index 1993a5c8..8843b55c 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -90,7 +90,7 @@ def ans_shape(batches): flush=True, ) -ctx = df.ExecutionContext() +ctx = df.SessionContext() x_data = pacsv.read_csv( src_jn_x, convert_options=pacsv.ConvertOptions(auto_dict_encode=True) diff --git a/benchmarks/db-benchmark/run-bench.sh b/benchmarks/db-benchmark/run-bench.sh old mode 100644 new mode 100755 index 9ccc2680..2c308092 --- a/benchmarks/db-benchmark/run-bench.sh +++ b/benchmarks/db-benchmark/run-bench.sh @@ -17,5 +17,5 @@ # under the License. set -e -SRC_DATANAME=G1_1e7_1e2_0_0 python3 datafusion/groupby-datafusion.py -SRC_DATANAME=J1_1e7_NA_0_0 python3 datafusion/join-datafusion.py +SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/groupby-datafusion.py +#SRC_DATANAME=J1_1e7_NA_0_0 python3 /db-benchmark/datafusion-python/join-datafusion.py