3 changes: 3 additions & 0 deletions .travis.yml
@@ -35,3 +35,6 @@ after_success:

after_script:
- docker images

notifications:
webhooks: https://hooks.microbadger.com/images/actionml/spark/${MICROBADGER_TOKEN}
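
For reference, the same hook can be exercised by hand; a minimal sketch, assuming the MicroBadger endpoint accepts a plain POST and that `MICROBADGER_TOKEN` is exported in the current shell (the real value stays in the Travis settings):

```bash
# Sketch only: manually ask MicroBadger to re-read the image metadata.
# MICROBADGER_TOKEN is a placeholder for the secret configured in Travis.
curl -X POST "https://hooks.microbadger.com/images/actionml/spark/${MICROBADGER_TOKEN}"
```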
30 changes: 21 additions & 9 deletions Dockerfile
@@ -1,17 +1,29 @@
FROM java:8-jre-alpine
FROM openjdk:8-jre-alpine
MAINTAINER Denis Baryshev <dennybaa@gmail.com>

ENV GOSU_VERSION 1.9
ENV SPARK_VERSION 1.6.2
ENV SPARK_VERSION 1.6.3
ENV SPARK_HOME /usr/local/spark
ENV SPARK_USER aml
ENV GLIBC_APKVER 2.24-r0
ENV LANG=en_US.UTF-8

LABEL vendor=ActionML \
version_tags="[\"1.6\",\"1.6.2\"]"
version_tags="[\"1.6\",\"1.6.3\"]"

# Update alpine and install required tools
RUN echo "@community http://nl.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
apk add --update --no-cache bash curl gnupg snappy shadow@community
apk add --update --no-cache bash curl gnupg shadow@community

# Glibc compatibility
RUN curl -sSL https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/stackfeed.rsa.pub \
-o /etc/apk/keys/stackfeed.rsa.pub && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-i18n-$GLIBC_APKVER.apk && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-$GLIBC_APKVER.apk && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-bin-$GLIBC_APKVER.apk && \
apk add --no-cache glibc-$GLIBC_APKVER.apk glibc-bin-$GLIBC_APKVER.apk glibc-i18n-$GLIBC_APKVER.apk && \
echo "export LANG=$LANG" > /etc/profile.d/locale.sh && \
rm /etc/apk/keys/stackfeed.rsa.pub glibc-*.apk

# Get gosu
RUN curl -sSL https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64 \
@@ -29,14 +41,14 @@ RUN curl -L http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${S

# Create users (to go "non-root") and set directory permissions
RUN useradd -mU -d /home/hadoop hadoop && passwd -d hadoop && \
useradd -mU -d /home/aml -G hadoop aml && passwd -d aml && \
chown -R aml:hadoop ${SPARK_HOME}
useradd -mU -d /home/$SPARK_USER -G hadoop $SPARK_USER && passwd -d $SPARK_USER && \
chown -R $SPARK_USER:hadoop $SPARK_HOME

ADD entrypoint.sh spark-defaults.conf /

# Some env vars can be passed to alter the behaviour, for additional
# details please visit https://spark.apache.org/docs/latest/spark-standalone.html

## Scratch directories can be passed as volumes
# SPARK_HOME/work directory used on worker for scratch space and job output logs.
# /tmp - Directory to use for "scratch" space in Spark, including map output files and RDDs that get stored on disk.
VOLUME [ "/usr/local/spark/work", "/tmp" ]

EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006
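After this change the image can be rebuilt and smoke-tested locally; a rough sketch, assuming the `actionml/spark:1.6.3` tag and that `gosu` lands on `PATH` (its install step is collapsed in the diff above):

```bash
# Sketch: build the image and verify the two new pieces, gosu and the glibc locale.
docker build -t actionml/spark:1.6.3 .
docker run --rm actionml/spark:1.6.3 gosu nobody id        # user switching works
docker run --rm actionml/spark:1.6.3 sh -c 'echo "$LANG"'  # en_US.UTF-8 from the Dockerfile ENV
```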
21 changes: 14 additions & 7 deletions README.md
@@ -1,15 +1,9 @@
[![DockerHub](https://img.shields.io/badge/docker-available-blue.svg)](https://hub.docker.com/r/actionml/spark) [![Build Status](https://travis-ci.org/actionml/docker-spark.svg?branch=master)](https://travis-ci.org/actionml/docker-spark)
[![Build Status](https://travis-ci.org/actionml/docker-spark.svg?branch=master)](https://travis-ci.org/actionml/docker-spark) [![Go to Docker Hub](https://img.shields.io/badge/Docker%20Hub-%E2%86%92-blue.svg)](https://hub.docker.com/r/actionml/spark/) [![](https://images.microbadger.com/badges/version/actionml/spark.svg)](https://microbadger.com/images/actionml/spark) [![](https://images.microbadger.com/badges/image/actionml/spark.svg)](https://microbadger.com/images/actionml/spark)

# Docker container for spark (standalone cluster)

## Starting up

First connect to the docker daemon

```
export DOCKER_HOST=:3375
```

This repository contains a set of scripts and configuration files to run an [Apache Spark](https://spark.apache.org/) standalone cluster from a [Docker](https://www.docker.io/) container.
Contributor Author
@pferrel I've reverted the above ^^.


To start master, workers or shell (on the same docker host), you can invoke the following commands:
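
The command examples themselves are collapsed in this diff; a minimal sketch of what they look like, based on the entrypoint's `master|worker|shell` cases (the image name, the legacy `--link` wiring and the worker's master URL argument are assumptions here):

```bash
# Sketch: one master, one worker, and an interactive shell on the same docker host.
docker run -d --name spark-master actionml/spark master
docker run -d --name spark-worker --link spark-master actionml/spark worker spark://spark-master:7077
docker run -it --rm --link spark-master actionml/spark shell --master spark://spark-master:7077
```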
@@ -87,6 +81,19 @@ Mind that *SPARK_WORKER_INSTANCES* is not applicable to container, if you need t

If you are planning to use the Spark shell, consider [Zeppelin](https://zeppelin.incubator.apache.org/), which can be used instead of the Spark shell for working with data. It has a pleasant GUI and IPython-like functionality.

# Important notes

## Spark scratch volumes

Note that for production use the following directories *must* be passed to Spark containers as volumes (see the sketch after this list):

* `SPARK_HOME/work` - directory used on the worker for scratch space and job output logs.
* `/tmp` - directory to use for "scratch" space in Spark, including map output files and RDDs that get stored on disk.
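
A minimal sketch of passing both scratch locations as volumes (host paths are placeholders; image name and master URL as in the examples above):

```bash
# Sketch: mount host scratch disks over the two volumes declared in the Dockerfile.
docker run -d --name spark-worker \
  -v /data/spark/work:/usr/local/spark/work \
  -v /data/spark/tmp:/tmp \
  actionml/spark worker spark://spark-master:7077
```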

## JDBC metastore (for Hive)

The default configuration uses Derby for the JDBC metastore_db, which is created relative to the startup path. To make the container (namely spark-shell) start correctly, the working directory is changed to /tmp.
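
This is roughly what the entrypoint now does before exec'ing the shell:

```bash
# Roughly the entrypoint's shell case: start from /tmp so Derby creates
# metastore_db on the writable scratch volume instead of under /.
cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-shell --properties-file /spark-defaults.conf "$@"
```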

# Authors

- Denis Baryshev (<dennybaa@gmail.com>)
40 changes: 22 additions & 18 deletions entrypoint.sh
@@ -1,33 +1,33 @@
#!/bin/bash
set -e

## Defaults
#
: ${SPARK_HOME:?must be set!}
default_opts="--properties-file /spark-defaults.conf"


# Check if CLI args list contains a bind address key.
cli_bind_address() {
echo "$*" | grep -qE -- "--host\b|-h\b|--ip\b|-i\b"
}

# Setup volumes
chown_volumes() {
paths="/usr/local/spark/work"
mkdir -p ${paths}
# Set permissions on the scratch volumes
scratch_volumes_permissions() {
mkdir -p $SPARK_HOME/work && chown $SPARK_USER:hadoop $SPARK_HOME/work
chmod 1777 /tmp
chown aml:hadoop ${paths}
}


# Set instance type master/worker/shell
default_opts="--properties-file /spark-defaults.conf"
## Configuration sourcing
. $SPARK_HOME/sbin/spark-config.sh
. $SPARK_HOME/bin/load-spark-env.sh

# Basic configs sourcing
: ${SPARK_HOME:?must be set!}
. "${SPARK_HOME}/sbin/spark-config.sh"
. "${SPARK_HOME}/bin/load-spark-env.sh"

## Entrypoint

# Set proper volume permissions
chown_volumes
scratch_volumes_permissions

# Execute spark service or given arguments (for ex. can enter bash)
case $1 in
master|worker)
instance=$1
@@ -40,13 +40,17 @@ master|worker)
default_opts="${default_opts} --host ${bind_address}"
fi

echo "spark-class invocation arguments: $default_opts $@"
exec gosu aml:hadoop ${SPARK_HOME}/bin/spark-class $CLASS $default_opts $@
echo "==> spark-class invocation arguments: $CLASS $default_opts $@"

cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-class $CLASS $default_opts $@
;;
shell)
shift
echo "spark-shell invocation arguments: $default_opts $@"
exec gosu aml:hadoop ${SPARK_HOME}/bin/spark-shell $default_opts $@
echo "==> spark-shell invocation arguments: $default_opts $@"

cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-shell $default_opts $@
;;
*)
cmdline="$@"
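The collapsed part of the `master|worker` case uses `cli_bind_address` to decide whether a default `--host` needs to be appended; a sketch of the resulting behaviour (image name assumed, and the exact default bind address is derived in the collapsed code):

```bash
# No bind-address flag given: the entrypoint appends its own --host <bind_address>.
docker run -d actionml/spark master

# cli_bind_address detects the explicit flag, so no default --host is added.
docker run -d actionml/spark master --host 0.0.0.0
```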
14 changes: 7 additions & 7 deletions spark-defaults.conf
@@ -1,9 +1,9 @@
spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006
spark.ui.port 4040
spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006
spark.ui.port 4040
spark.broadcast.factory org.apache.spark.broadcast.HttpBroadcastFactory
spark.local.dir /tmp
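
Because these ports are pinned, a container that cannot use host networking can publish a fixed set of ports; a sketch matching the `EXPOSE` list in the Dockerfile (in practice `--net=host` is often the simpler option for Spark):

```bash
# Sketch: publish the pinned Spark ports instead of using host networking.
docker run -d --name spark-master \
  -p 8080:8080 -p 7077:7077 -p 6066:6066 -p 4040:4040 \
  -p 7001-7006:7001-7006 \
  actionml/spark master
```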