Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ All notable changes to this project will be documented in this file.

- ubi-rust-builder: Bump Rust toolchain to 1.85.0, cargo-cyclonedx to 0.5.7, and cargo-auditable to 0.6.6 ([#1050]).
- spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. ([#1034]).
- spark-connect-client: Image is now completely based on spark-k8s and includes JupyterLab and other demo dependencies ([#1071]).

### Fixed

Expand Down
36 changes: 14 additions & 22 deletions spark-connect-client/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
# spark-builder: provides client libs for spark-connect
# (a COPY --from=spark-builder further down pulls /stackable/spark/connect out of this stage).
FROM stackable/image/spark-k8s AS spark-builder

# Second, unnamed stage based on java-base; the instructions that follow a FROM apply to that stage.
FROM stackable/image/java-base

# Build arguments. No defaults are declared here, so they expand to empty unless supplied at build time.
# - PRODUCT: version pin for the pyspark[connect] package installed further down.
# - PYTHON:  version suffix used in the microdnf package names and the /usr/bin symlinks.
# - RELEASE: not referenced in the visible part of this file — presumably used by the LABEL block; TODO confirm.
ARG PRODUCT
ARG PYTHON
ARG RELEASE
Expand All @@ -18,42 +16,36 @@ LABEL name="Stackable Spark Connect Examples" \
summary="Spark Connect Examples" \
description="Spark Connect client libraries for Python and the JVM, including some examples."

# Need root to install setuptools
USER root

# HOME points at /stackable; tools that resolve paths relative to the home directory
# (e.g. pip's "user mode" install location) will therefore write below /stackable.
ENV HOME=/stackable

# Copy the example sources, the spark-connect jars from the spark-builder stage and the Jupyter configuration.
# A COPY without --chown creates files owned by UID/GID 0; the --chown variants hand ownership to
# ${STACKABLE_USER_UID} with the root group (0).
# NOTE(review): STACKABLE_USER_UID is not declared in the visible part of this file — presumably
# inherited from the base image; confirm.
# NOTE(review): the spark-connect-examples directory is copied twice to the same destination
# (once without and once with --chown) — confirm which of the two is intended.
COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect
COPY --chown=${STACKABLE_USER_UID}:0 spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 spark-connect-client/stackable/.jupyter /stackable/.jupyter

# Single layer that prepares the Python runtime:
#   1. updates the OS packages and installs python${PYTHON}, its pip and its setuptools via microdnf,
#      then clears the package manager caches in the same layer;
#   2. creates unversioned /usr/bin/python and /usr/bin/pip symlinks;
#   3. installs pyspark[connect] pinned to ${PRODUCT} (no --user flag, i.e. not a per-user install);
#   4. hands /stackable to ${STACKABLE_USER_UID}:0 and gives the group the same permissions as the owner.
# NOTE(review): the heredoc contains no explicit `set -e`; heredoc lines are not implicitly chained
# with `&&`, so whether a failing command aborts the build depends on the SHELL options configured
# for this image — confirm.
RUN <<EOF
microdnf update
# python{version}-setuptools: needed to build the pyspark[connect] package
microdnf install --nodocs \
"python${PYTHON}" \
"python${PYTHON}-pip" \
"python${PYTHON}-setuptools"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Install python libraries for the spark connect client
# shellcheck disable=SC2102
pip install --no-cache-dir pyspark[connect]==${PRODUCT}

# All files and folders owned by root group to support running as arbitrary users.
# This is best practice as all container users will belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above.
# If you do any file-based actions (copying / creating etc.) below this comment, you
# absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

# Switch to the unprivileged Stackable user; the instructions below and the container's
# default process run under this UID.
USER ${STACKABLE_USER_UID}

# Install python packages.
# Packages are intentionally installed in "user mode" to reduce the container attack surface.
# - pyspark[connect] = spark connect client libs
# - jupyterlab = notebook client used in demos
# - scikit-learn, matplotlib = further pinned packages (presumably the "other demo dependencies"
#   mentioned in the changelog entry — confirm)
# Every package is pinned to an exact version; only pyspark follows the PRODUCT build argument.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --user \
"pyspark[connect]==${PRODUCT}" \
"jupyterlab==4.4.1" \
"scikit-learn==1.3.1" \
"matplotlib==3.10.1"

# Default working directory: the python/ subdirectory of the examples tree copied to
# /stackable/spark-connect-examples.
WORKDIR /stackable/spark-connect-examples/python
Loading