
Commit 9ee8591

Authored Sep 23, 2020
fix(docker): fix spark 3 dockers (#383)
1 parent e0685fe · commit 9ee8591

7 files changed: +187 −29 lines changed

‎.travis.yml

+4 −2
@@ -21,10 +21,12 @@ env:
 - SCALA_MAJOR_VERSION=2.12
 - SPARK2_SCALA_VERSION=2.11.12
 - SPARK2_SCALA_MAJOR_VERSION=2.11
+- SPARK2_HADOOP_MAJOR_VERSION=2.7
+- HADOOP_MAJOR_VERSION=3.2
 - SPARK2_HADOOP_VERSION=2.9.2
 - SPARK2_HIVE_VERSION=2.3.3
-- SPARK2_VERSION=2.4.5
-- SPARK_VERSION=3.0.0
+- SPARK2_VERSION=2.4.6
+- SPARK_VERSION=3.0.1
 - HIVE_VERSION=2.3.7
 - HUDI_VERSION=0.5.3
 - TARGET_CACHE=$HOME/target-cache/${TRAVIS_COMMIT}
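
Note: the two new *_HADOOP_MAJOR_VERSION variables are consumed by scripts/docker.sh (further down in this commit) to select which pre-built Spark tarball (spark-<version>-bin-hadoop<major>.tgz) gets downloaded. A minimal sketch of reproducing the CI image build locally, assuming the same values as the matrix above:

  export SPARK_VERSION=3.0.1 HADOOP_MAJOR_VERSION=3.2 SCALA_MAJOR_VERSION=2.12
  export SPARK2_VERSION=2.4.6 SPARK2_HADOOP_MAJOR_VERSION=2.7 SPARK2_SCALA_MAJOR_VERSION=2.11
  export SPARK2_HADOOP_VERSION=2.9.2 SPARK2_HIVE_VERSION=2.3.3
  ./scripts/docker.sh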

‎build.sbt

+1 −1
@@ -17,7 +17,7 @@ scalaVersion := Option(System.getenv("SCALA_VERSION")).getOrElse("2.12.11")
 val sparkVersion: Def.Initialize[String] = Def.setting {
   CrossVersion.partialVersion(scalaVersion.value) match {
     case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getenv("SPARK_VERSION")).getOrElse("3.0.0")
-    case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.5")
+    case _ => Option(System.getenv("SPARK2_VERSION")).getOrElse("2.4.6")
   }
 }
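
The Spark version is resolved at sbt load time from the environment: Scala 2.12 builds read SPARK_VERSION (defaulting to 3.0.0), older Scala reads SPARK2_VERSION, whose fallback this commit bumps from 2.4.5 to 2.4.6 to match the CI matrix. A sketch of pinning each side explicitly (compile stands in for whatever task you actually run):

  # Spark 3 / Scala 2.12 (the defaults)
  sbt compile

  # Spark 2 / Scala 2.11, matching the CI matrix
  SCALA_VERSION=2.11.12 SPARK2_VERSION=2.4.6 sbt compile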

‎docker/spark/k8s/Dockerfile

+29 −21
@@ -1,40 +1,48 @@
-ARG SPARK_VERSION=3.0.0
-ARG SPARK_IMAGE=gcr.io/spark-operator/spark:v${SPARK_VERSION}-gcs-prometheus
-FROM ${SPARK_IMAGE}
-
-# Install Tools
-RUN apt-get update \
-    && apt-get install -y curl wget \
-    && rm -rf /var/lib/apt/lists/*
+ARG SPARK_VERSION=3.0.1
+FROM metorikku/spark:base-${SPARK_VERSION}
 
 ARG AWS_SDK_VERSION=1.11.853
-ARG HADOOP_VERSION=3.2.1
+ARG HADOOP_VERSION=3.2.0
 ARG HTTPCLIENT_VERSION=4.5.11
 ARG SCALA_MAJOR_VERSION=2.12
 ARG SPARK_VERSION=3.0.0
 
-RUN rm -f ${SPARK_HOME}/jars/spark-bigquery-latest.jar
+USER root
 
-RUN wget -q https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar -P $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/net/logstash/log4j/jsonevent-layout/1.7/jsonevent-layout-1.7.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/net/minidev/json-smart/1.1.1/json-smart-1.1.1.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/${AWS_SDK_VERSION}/aws-java-sdk-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/${AWS_SDK_VERSION}/aws-java-sdk-core-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/${AWS_SDK_VERSION}/aws-java-sdk-s3-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-dynamodb/${AWS_SDK_VERSION}/aws-java-sdk-dynamodb-${AWS_SDK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar $SPARK_HOME/jars/
+ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar $SPARK_HOME/jars/
+RUN rm -f $SPARK_HOME/jars/httpclient-*.jar
+ADD https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/${HTTPCLIENT_VERSION}/httpclient-${HTTPCLIENT_VERSION}.jar $SPARK_HOME/jars/
 
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-sql-kafka-0-10_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-streaming-kafka-0-10-assembly_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/spark/spark-avro_${SCALA_MAJOR_VERSION}/${SPARK_VERSION}/spark-avro_${SCALA_MAJOR_VERSION}-${SPARK_VERSION}.jar -P $SPARK_HOME/jars/
-RUN wget -q https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.6.2/commons-pool2-2.6.2.jar -P $SPARK_HOME/jars/
+RUN chmod 644 $SPARK_HOME/jars/*
+
+ADD https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.11.0/jmx_prometheus_javaagent-0.11.0.jar /prometheus/
+RUN chmod 644 /prometheus/*.jar
 
 #Python
 RUN apt-get update \
-    && apt-get install -y coreutils jq less inotify-tools python3 python3-setuptools \
+    && apt-get install -y wget curl coreutils jq less inotify-tools python3 python3-setuptools \
     && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
     && python3 get-pip.py 'pip==20.1.1' \
     && rm get-pip.py \
     && rm -rf /var/lib/apt/lists/*
 
+#USER ${spark_uid}
+
 ADD conf/* ${SPARK_HOME}/custom/conf/
 
+RUN mkdir -p /etc/metrics/conf
+ADD metrics/* /etc/metrics/conf/
+
+RUN touch hadoop-metrics2.properties
+
 ENV PYTHONHASHSEED 1
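
Two details worth calling out: the base image now comes from Spark's own docker-image-tool.sh (see scripts/docker.sh below) rather than gcr.io/spark-operator, and every RUN wget became an ADD. ADD gives each jar its own cacheable layer and removes the need for wget inside the build, but files fetched from a URL with ADD are written root-owned with mode 600, which a non-root Spark process cannot read; that is what the RUN chmod 644 lines on $SPARK_HOME/jars/* and /prometheus/*.jar fix. A quick way to verify, assuming $SPARK_HOME resolves to /opt/spark in the base image:

  docker run --rm --entrypoint ls metorikku/spark:k8s -l /opt/spark/jars | head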

docker/spark/k8s/metrics/metrics.properties (new file; the name is inferred from the Dockerfile's ADD metrics/* line)

+19
@@ -0,0 +1,19 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
+driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
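
This is a stock Spark metrics.properties: it attaches a JmxSink to every metrics instance and a JvmSource to driver and executors, so Spark's metrics are published as JMX MBeans for the jmx_prometheus_javaagent (added to /prometheus/ in the Dockerfile above) to export. A sketch of wiring it in at submit time, assuming the container layout from this commit (the application class and jar are placeholders):

  spark-submit \
    --conf spark.metrics.conf=/etc/metrics/conf/metrics.properties \
    --class com.example.Main app.jar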

docker/spark/k8s/metrics/prometheus.yaml (new file; the name is inferred from the Dockerfile's ADD metrics/* line)

+123
@@ -0,0 +1,123 @@
+#
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+---
+lowercaseOutputName: true
+attrNameSnakeCase: true
+rules:
+  # These come from the application driver if it's a streaming application
+  # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
+    name: spark_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver if it's a structured streaming application
+  # Example: default/sstreaming.driver.spark.streaming.QueryName.inputRate-total
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
+    name: spark_structured_streaming_driver_$4
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      query_name: "$3"
+  # These come from the application executors
+  # Example: default/spark-pi.0.executor.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
+    name: spark_executor_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
+    name: spark_driver_$3_$4
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timers for DAGScheduler like messagePRocessingTime
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
+    name: spark_driver_DAGScheduler_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # HiveExternalCatalog is of type counter
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_driver_HiveExternalCatalog_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
+    name: spark_driver_CodeGenerator_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # These come from the application driver
+  # Emulate timer (keep only count attribute) plus counters for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
+    name: spark_driver_LiveListenerBus_$3_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Get Gauge type metrics for LiveListenerBus
+  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value
+    name: spark_driver_LiveListenerBus_$3
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+  # Executors counters
+  - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
+    name: spark_executor_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application executors
+  # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
+    name: spark_executor_$4_$5
+    type: GAUGE
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
+    name: spark_executor_HiveExternalCatalog_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
+  # These come from the application driver
+  # Emulate histograms for CodeGenerator
+  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
+    name: spark_executor_CodeGenerator_$4_count
+    type: COUNTER
+    labels:
+      app_namespace: "$1"
+      app_id: "$2"
+      executor_id: "$3"
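
These jmx-exporter rules rewrite Spark's MBean names (metrics<name=namespace.app_id....>) into stable Prometheus metric names carrying app_namespace, app_id, and where relevant executor_id or query_name labels. The config only takes effect once the agent is loaded into the JVM; a sketch, assuming port 8090 for the scrape endpoint (the port is not fixed by this commit) and placeholder application arguments:

  AGENT=/prometheus/jmx_prometheus_javaagent-0.11.0.jar
  spark-submit \
    --conf "spark.driver.extraJavaOptions=-javaagent:$AGENT=8090:/etc/metrics/conf/prometheus.yaml" \
    --conf "spark.executor.extraJavaOptions=-javaagent:$AGENT=8090:/etc/metrics/conf/prometheus.yaml" \
    --class com.example.Main app.jar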

‎examples/udf/build.sbt

+1 −1
@@ -8,7 +8,7 @@ scalaVersion := Option(System.getProperty("scalaVersion")).getOrElse("2.12.11")
 val sparkVersion: Def.Initialize[String] = Def.setting {
   CrossVersion.partialVersion(scalaVersion.value) match {
     case Some((2, scalaMajor)) if scalaMajor >= 12 => Option(System.getProperty("sparkVersion")).getOrElse("3.0.0")
-    case _ => "2.4.5"
+    case _ => "2.4.6"
   }
 }
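
Unlike the root build.sbt, which reads environment variables, the example build reads JVM system properties, so overrides are passed to sbt as -D flags. A sketch (package stands in for any task that resolves dependencies):

  # Spark 2 / Scala 2.11 flavor; Spark falls back to the hard-coded 2.4.6
  sbt -DscalaVersion=2.11.12 package

  # Spark 3 with an explicitly pinned version
  sbt -DsparkVersion=3.0.1 package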

‎scripts/docker.sh

+10 −4
@@ -1,25 +1,31 @@
 #!/bin/bash
 
-# Hack that helps with the cache
 docker pull metorikku/metorikku:k8s
 docker pull metorikku/metorikku:standalone
 docker pull metorikku/metorikku:spark2_k8s
 docker pull metorikku/metorikku:spark2_standalone
 docker pull metorikku/hive
 
-docker pull gcr.io/spark-operator/spark:v$SPARK_VERSION-gcs-prometheus
-docker pull gcr.io/spark-operator/spark:v$SPARK2_VERSION-gcs-prometheus
-
 set -e
 
 # Latest spark
+wget -q https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
+tar -xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION.tgz
+(cd spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK_VERSION build)
+rm -rf spark-$SPARK_VERSION-bin-hadoop$HADOOP_MAJOR_VERSION*
+
 docker build -t metorikku/spark:k8s --cache-from metorikku/metorikku:k8s --build-arg SCALA_MAJOR_VERSION=$SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK_VERSION -f docker/spark/k8s/Dockerfile docker/spark/k8s
 docker build -t metorikku/spark:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:k8s -f docker/spark/standalone/Dockerfile docker/spark/standalone
 # Adding metorikku jar
 docker build -t metorikku/metorikku:k8s --cache-from metorikku/metorikku:k8s --build-arg IMAGE_NAME=metorikku/spark:k8s -f docker/metorikku/Dockerfile .
 docker build -t metorikku/metorikku:standalone --cache-from metorikku/metorikku:standalone --build-arg IMAGE_NAME=metorikku/spark:standalone -f docker/metorikku/Dockerfile .
 
 # Spark 2
+wget -q https://archive.apache.org/dist/spark/spark-$SPARK2_VERSION/spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
+tar -xzf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION.tgz
+(cd spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION && bin/docker-image-tool.sh -r metorikku -t base-$SPARK2_VERSION build)
+rm -rf spark-$SPARK2_VERSION-bin-hadoop$SPARK2_HADOOP_MAJOR_VERSION
+
 docker build -t metorikku/spark:spark2_k8s --cache-from metorikku/metorikku:spark2_k8s --build-arg SCALA_MAJOR_VERSION=$SPARK2_SCALA_MAJOR_VERSION --build-arg SPARK_VERSION=$SPARK2_VERSION --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/k8s/Dockerfile docker/spark/k8s
 docker build -t metorikku/spark:spark2_hadoop --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_k8s --build-arg HIVE_VERSION=${SPARK2_HIVE_VERSION} --build-arg HADOOP_VERSION=${SPARK2_HADOOP_VERSION} -f docker/spark/custom-hadoop/Dockerfile docker/spark/custom-hadoop
 docker build -t metorikku/spark:spark2_standalone --cache-from metorikku/metorikku:spark2_standalone --build-arg IMAGE_NAME=metorikku/spark:spark2_hadoop -f docker/spark/standalone/Dockerfile docker/spark/standalone
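
With the gcr.io/spark-operator base images gone, the script now builds its own: every official Spark tarball ships bin/docker-image-tool.sh, and running it with -r metorikku -t base-$SPARK_VERSION produces metorikku/spark:base-3.0.1, which is exactly what the k8s Dockerfile's FROM line expects. The same step in isolation, with the CI versions inlined:

  wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
  tar -xzf spark-3.0.1-bin-hadoop3.2.tgz
  (cd spark-3.0.1-bin-hadoop3.2 && bin/docker-image-tool.sh -r metorikku -t base-3.0.1 build)
  docker images metorikku/spark   # the base-3.0.1 tag should now be listed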
