Skip to content

feat: build spark 4 #1195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions spark-connect-client/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,10 @@
"java-base": "17",
"python": "3.11",
},
{
"product": "4.0.0",
"spark-k8s": "4.0.0",
"java-base": "17",
"python": "3.11",
},
]
79 changes: 58 additions & 21 deletions spark-k8s/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patche
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR} /stackable/src/spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR}

RUN <<EOF

# IMPORTANT: HBase connectors don't support Spark 4 yet, so we skip the build.
# Watch this PR for updates: https://github.com/apache/hbase-connectors/pull/130
if [[ "${PRODUCT}" == 4* ]]; then
# Create this empty directory so that subsequent COPY layers succeed.
mkdir -p /stackable/spark/jars
# Create a dummy tarball so that later build steps, which expect the Spark 3 artifact, still succeed.
touch hbase-connector-${HBASE_CONNECTOR}-stackable${RELEASE}-src.tar.gz
exit 0
fi

cd "$(/stackable/patchable --images-repo-root=src checkout spark-k8s/hbase-connectors ${HBASE_CONNECTOR})/spark"

NEW_VERSION="${HBASE_CONNECTOR}-stackable${RELEASE}"
Expand Down Expand Up @@ -108,6 +119,7 @@ mvn \
--define hadoop-three.version="${HADOOP}" \
--define hbase.version="${HBASE}" \
--define skipTests \
--define maven.test.skip=true \
clean package

mkdir -p /stackable/spark/jars
Expand Down Expand Up @@ -158,9 +170,6 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
COPY --from=hadoop-builder --chown=${STACKABLE_USER_UID}:0 /stackable/patched-libs /stackable/patched-libs

# >>> Build spark
# Compiling the tests takes a lot of time, so we skip them
# -Dmaven.test.skip=true skips both the compilation and execution of tests
# -DskipTests skips only the execution
RUN <<EOF
# Make Maven aware of custom Stackable libraries
mv /stackable/patched-libs/maven /root/.m2/repository
Expand All @@ -175,15 +184,35 @@ RUN <<EOF
ORIGINAL_VERSION="${PRODUCT}"
NEW_VERSION="${PRODUCT}-stackable${RELEASE}"

STACKABLE_HADOOP_VERSION="${HADOOP}-stackable${RELEASE}"

MAVEN_BIN="/tmp/apache-maven-${MAVEN_VERSION}/bin/mvn"
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"

./dev/make-distribution.sh \
--mvn /tmp/apache-maven-${MAVEN_VERSION}/bin/mvn \
-Dhadoop.version="${HADOOP}-stackable${RELEASE}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
--no-transfer-progress \
--batch-mode
case "${PRODUCT}" in
4*)
# The Spark 4 script has a --connect option which is not available in Spark 3.
# This option is required to build Spark Connect.
# Also, this option breaks the Spark 3 build, so we ensure it is only provided here.
./dev/make-distribution.sh \
--mvn "${MAVEN_BIN}" \
--connect \
-Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
--no-transfer-progress \
--batch-mode
;;
*)
./dev/make-distribution.sh \
--mvn "${MAVEN_BIN}" \
-Dhadoop.version="${STACKABLE_HADOOP_VERSION}" \
-DskipTests \
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver \
--no-transfer-progress \
--batch-mode
;;
esac

sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" assembly/target/bom.json
EOF
Expand All @@ -194,22 +223,30 @@ EOF
# we create a new dist/connect folder, and copy them here.
RUN <<EOF

# Get the Scala binary version
SCALA_BINARY_VERSION=$( \
mvn --quiet --non-recursive --no-transfer-progress --batch-mode --file pom.xml \
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
-DforceStdout \
-Dexpression='project.properties(scala.binary.version)')
SCALA_BINARY_VERSION=$(grep "scala.binary.version" pom.xml | head -n1 | awk -F '[<>]' '{print $3}')

mkdir -p dist/connect
cd dist/connect

cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .

# The Spark operator expects a file named spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar without the -stackable${RELEASE} suffix.
case "${PRODUCT}" in
4*)
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
;;
*)
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
;;
esac

# This link is needed by the operator and is kept for backwards compatibility.
# TODO: remove it at some time in the future.
ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar"
# Link to the spark-connect jar without the stackable suffix and scala version.
# This link supersedes the previous link.
ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect-${PRODUCT}.jar"
EOF

# <<< Build spark
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
From 2da5608928018dd017c91b904eb8f84a4f6df78a Mon Sep 17 00:00:00 2001
From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com>
Date: Fri, 4 Jul 2025 15:54:55 +0200
Subject: Update CycloneDX plugin

---
dev/make-distribution.sh | 1 -
pom.xml | 5 +++++
2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh
index 16607e45ae..44e345a245 100755
--- a/dev/make-distribution.sh
+++ b/dev/make-distribution.sh
@@ -176,7 +176,6 @@ BUILD_COMMAND=("$MVN" clean package \
-Dmaven.javadoc.skip=true \
-Dmaven.scaladoc.skip=true \
-Dmaven.source.skip \
- -Dcyclonedx.skip=true \
$@)

# Actually build the jar
diff --git a/pom.xml b/pom.xml
index 443d46a430..632920f100 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3327,6 +3327,11 @@
<groupId>org.cyclonedx</groupId>
<artifactId>cyclonedx-maven-plugin</artifactId>
<version>2.8.0</version>
+ <configuration>
+ <projectType>application</projectType>
+ <schemaVersion>1.5</schemaVersion>
+ <skipNotDeployed>false</skipNotDeployed>
+ </configuration>
<executions>
<execution>
<phase>package</phase>
2 changes: 2 additions & 0 deletions spark-k8s/stackable/patches/4.0.0/patchable.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
base = "fa33ea000a0bda9e5a3fa1af98e8e85b8cc5e4d4"
mirror = "https://github.com/stackabletech/spark.git"
18 changes: 18 additions & 0 deletions spark-k8s/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,22 @@
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
{
"product": "4.0.0",
"java-base": "17",
"java-devel": "17",
"python": "3.11",
"hadoop": "3.4.1",
"hbase": "2.6.2",
"aws_java_sdk_bundle": "2.24.6",
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
"jackson_dataformat_xml": "2.15.2", # https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.13/3.5.1
"stax2_api": "4.2.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
"woodstox_core": "6.5.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
"vector": "0.47.0",
"jmx_exporter": "1.3.0",
"tini": "0.19.0",
"hbase_connector": "1.0.1",
},
]