3 changes: 3 additions & 0 deletions .travis.yml
@@ -35,3 +35,6 @@ after_success:

after_script:
- docker images

notifications:
webhooks: https://hooks.microbadger.com/images/actionml/spark/${MICROBADGER_TOKEN}
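
For reference, the same hook can be exercised by hand; a minimal sketch, assuming the MicroBadger endpoint accepts a plain POST and that `MICROBADGER_TOKEN` is exported in the current shell (the real value stays in the Travis settings):

```bash
# Sketch only: manually ask MicroBadger to re-read the image metadata.
# MICROBADGER_TOKEN is a placeholder for the secret configured in Travis.
curl -X POST "https://hooks.microbadger.com/images/actionml/spark/${MICROBADGER_TOKEN}"
```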
30 changes: 21 additions & 9 deletions Dockerfile
@@ -1,17 +1,29 @@
FROM java:8-jre-alpine
FROM openjdk:8-jre-alpine
MAINTAINER Denis Baryshev <dennybaa@gmail.com>

ENV GOSU_VERSION 1.9
ENV SPARK_VERSION 1.6.2
ENV SPARK_VERSION 1.6.3
ENV SPARK_HOME /usr/local/spark
ENV SPARK_USER aml
ENV GLIBC_APKVER 2.24-r0
ENV LANG=en_US.UTF-8

LABEL vendor=ActionML \
version_tags="[\"1.6\",\"1.6.2\"]"
version_tags="[\"1.6\",\"1.6.3\"]"

# Update alpine and install required tools
RUN echo "@community http://nl.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
apk add --update --no-cache bash curl gnupg snappy shadow@community
apk add --update --no-cache bash curl gnupg shadow@community

# Glibc compatibility
RUN curl -sSL https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/stackfeed.rsa.pub \
-o /etc/apk/keys/stackfeed.rsa.pub && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-i18n-$GLIBC_APKVER.apk && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-$GLIBC_APKVER.apk && \
curl -sSLO https://github.com/stackfeed/alpine-pkg-glibc/releases/download/$GLIBC_APKVER/glibc-bin-$GLIBC_APKVER.apk && \
apk add --no-cache glibc-$GLIBC_APKVER.apk glibc-bin-$GLIBC_APKVER.apk glibc-i18n-$GLIBC_APKVER.apk && \
echo "export LANG=$LANG" > /etc/profile.d/locale.sh && \
rm /etc/apk/keys/stackfeed.rsa.pub glibc-*.apk

# Get gosu
RUN curl -sSL https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64 \
@@ -29,14 +41,14 @@ RUN curl -L http://www.us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${S

# Create users (to go "non-root") and set directory permissions
RUN useradd -mU -d /home/hadoop hadoop && passwd -d hadoop && \
useradd -mU -d /home/aml -G hadoop aml && passwd -d aml && \
chown -R aml:hadoop ${SPARK_HOME}
useradd -mU -d /home/$SPARK_USER -G hadoop $SPARK_USER && passwd -d $SPARK_USER && \
chown -R $SPARK_USER:hadoop $SPARK_HOME

ADD entrypoint.sh spark-defaults.conf /

# Some env vars can be passed to alter the behaviour, for additional
# details please visit https://spark.apache.org/docs/latest/spark-standalone.html

## Scratch directories can be passed as volumes
# SPARK_HOME/work directory used on worker for scratch space and job output logs.
# /tmp - Directory to use for "scratch" space in Spark, including map output files and RDDs that get stored on disk.
VOLUME [ "/usr/local/spark/work", "/tmp" ]

EXPOSE 8080 8081 6066 7077 4040 7001 7002 7003 7004 7005 7006
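After this change the image can be rebuilt and smoke-tested locally; a rough sketch, assuming the `actionml/spark:1.6.3` tag and that `gosu` lands on `PATH` (its install step is collapsed in the diff above):

```bash
# Sketch: build the image and verify the two new pieces, gosu and the glibc locale.
docker build -t actionml/spark:1.6.3 .
docker run --rm actionml/spark:1.6.3 gosu nobody id        # user switching works
docker run --rm actionml/spark:1.6.3 sh -c 'echo "$LANG"'  # en_US.UTF-8 from the Dockerfile ENV
```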
21 changes: 14 additions & 7 deletions README.md
@@ -1,15 +1,9 @@
[![DockerHub](https://img.shields.io/badge/docker-available-blue.svg)](https://hub.docker.com/r/actionml/spark) [![Build Status](https://travis-ci.org/actionml/docker-spark.svg?branch=master)](https://travis-ci.org/actionml/docker-spark)
[![Build Status](https://travis-ci.org/actionml/docker-spark.svg?branch=master)](https://travis-ci.org/actionml/docker-spark) [![Go to Docker Hub](https://img.shields.io/badge/Docker%20Hub-%E2%86%92-blue.svg)](https://hub.docker.com/r/actionml/spark/) [![](https://images.microbadger.com/badges/version/actionml/spark.svg)](https://microbadger.com/images/actionml/spark) [![](https://images.microbadger.com/badges/image/actionml/spark.svg)](https://microbadger.com/images/actionml/spark)

# Docker container for spark (standalone cluster)

## Starting up

First connect to the docker daemon

```
export DOCKER_HOST=:3375
```

This repository contains a set of scripts and configuration files to run an [Apache Spark](https://spark.apache.org/) standalone cluster from a [Docker](https://www.docker.io/) container.
Contributor Author
@pferrel I've reverted the above ^^.


To start master, workers or shell (on the same docker host), you can invoke the following commands:
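
The command examples themselves are collapsed in this diff; a minimal sketch of what they look like, based on the entrypoint's `master|worker|shell` cases (the image name, the legacy `--link` wiring and the worker's master URL argument are assumptions here):

```bash
# Sketch: one master, one worker, and an interactive shell on the same docker host.
docker run -d --name spark-master actionml/spark master
docker run -d --name spark-worker --link spark-master actionml/spark worker spark://spark-master:7077
docker run -it --rm --link spark-master actionml/spark shell --master spark://spark-master:7077
```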
@@ -87,6 +81,19 @@ Mind that *SPARK_WORKER_INSTANCES* is not applicable to container, if you need t

If you are planning to use the Spark shell, consider [Zeppelin](https://zeppelin.incubator.apache.org/), which can be used instead of the Spark shell for working with data. It has a pleasant GUI and IPython-like functionality.

# Important notes

## Spark scratch volumes

Note that for production use the following directories *must* be passed to Spark containers as volumes (see the sketch after this list):

* `SPARK_HOME/work` - directory used on the worker for scratch space and job output logs.
* `/tmp` - directory to use for "scratch" space in Spark, including map output files and RDDs that get stored on disk.
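
A minimal sketch of passing both scratch locations as volumes (host paths are placeholders; image name and master URL as in the examples above):

```bash
# Sketch: mount host scratch disks over the two volumes declared in the Dockerfile.
docker run -d --name spark-worker \
  -v /data/spark/work:/usr/local/spark/work \
  -v /data/spark/tmp:/tmp \
  actionml/spark worker spark://spark-master:7077
```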

## JDBC metastore (for Hive)

The default configuration uses Derby for the JDBC metastore_db, which is created relative to the startup path. To make the container (namely spark-shell) start correctly, the working directory is changed to /tmp.
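
This is roughly what the entrypoint now does before exec'ing the shell:

```bash
# Roughly the entrypoint's shell case: start from /tmp so Derby creates
# metastore_db on the writable scratch volume instead of under /.
cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-shell --properties-file /spark-defaults.conf "$@"
```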

# Authors

- Denis Baryshev (<dennybaa@gmail.com>)
40 changes: 22 additions & 18 deletions entrypoint.sh
@@ -1,33 +1,33 @@
#!/bin/bash
set -e

## Defaults
#
: ${SPARK_HOME:?must be set!}
default_opts="--properties-file /spark-defaults.conf"


# Check if CLI args list contains a bind address key.
cli_bind_address() {
echo "$*" | grep -qE -- "--host\b|-h\b|--ip\b|-i\b"
}

# Setup volumes
chown_volumes() {
paths="/usr/local/spark/work"
mkdir -p ${paths}
# Set permissions on the scratch volumes
scratch_volumes_permissions() {
mkdir -p $SPARK_HOME/work && chown $SPARK_USER:hadoop $SPARK_HOME/work
chmod 1777 /tmp
chown aml:hadoop ${paths}
}


# Set instance type master/worker/shell
default_opts="--properties-file /spark-defaults.conf"
## Configuration sourcing
. $SPARK_HOME/sbin/spark-config.sh
. $SPARK_HOME/bin/load-spark-env.sh

# Basic configs sourcing
: ${SPARK_HOME:?must be set!}
. "${SPARK_HOME}/sbin/spark-config.sh"
. "${SPARK_HOME}/bin/load-spark-env.sh"

## Entrypoint

# Set proper volume permissions
chown_volumes
scratch_volumes_permissions

# Execute spark service or given arguments (for ex. can enter bash)
case $1 in
master|worker)
instance=$1
@@ -40,13 +40,17 @@ master|worker)
default_opts="${default_opts} --host ${bind_address}"
fi

echo "spark-class invocation arguments: $default_opts $@"
exec gosu aml:hadoop ${SPARK_HOME}/bin/spark-class $CLASS $default_opts $@
echo "==> spark-class invocation arguments: $CLASS $default_opts $@"

cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-class $CLASS $default_opts $@
;;
shell)
shift
echo "spark-shell invocation arguments: $default_opts $@"
exec gosu aml:hadoop ${SPARK_HOME}/bin/spark-shell $default_opts $@
echo "==> spark-shell invocation arguments: $default_opts $@"

cd /tmp
exec gosu $SPARK_USER:hadoop $SPARK_HOME/bin/spark-shell $default_opts $@
;;
*)
cmdline="$@"
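The collapsed part of the `master|worker` case uses `cli_bind_address` to decide whether a default `--host` needs to be appended; a sketch of the resulting behaviour (image name assumed, and the exact default bind address is derived in the collapsed code):

```bash
# No bind-address flag given: the entrypoint appends its own --host <bind_address>.
docker run -d actionml/spark master

# cli_bind_address detects the explicit flag, so no default --host is added.
docker run -d actionml/spark master --host 0.0.0.0
```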
14 changes: 7 additions & 7 deletions spark-defaults.conf
@@ -1,9 +1,9 @@
spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006
spark.ui.port 4040
spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006
spark.ui.port 4040
spark.broadcast.factory org.apache.spark.broadcast.HttpBroadcastFactory
spark.local.dir /tmp
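
Because these ports are pinned, a container that cannot use host networking can publish a fixed set of ports; a sketch matching the `EXPOSE` list in the Dockerfile (in practice `--net=host` is often the simpler option for Spark):

```bash
# Sketch: publish the pinned Spark ports instead of using host networking.
docker run -d --name spark-master \
  -p 8080:8080 -p 7077:7077 -p 6066:6066 -p 4040:4040 \
  -p 7001-7006:7001-7006 \
  actionml/spark master
```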