Skip to content

Commit e39a0d0

Browse files
authored
Merge pull request #4 from jamesmcclain/feature/jupyterhub
Move To JupyterHub
2 parents ed51cd4 + bed79d7 commit e39a0d0

File tree

14 files changed

+815
-62
lines changed

14 files changed

+815
-62
lines changed

.dockerignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
archives/*.zip
2+
incubator-toree-*/
3+
geotrellis-uberjar/

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
lib/
2+
target/
3+
archives/*.zip
4+
incubator-toree-*/
5+
toree-0.2.0.dev1.tar.gz
6+
spark-*.tgz
7+
geotrellis-uberjar-assembly-*.jar

Dockerfile

Lines changed: 10 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,16 @@
1-
FROM quay.io/geodocker/spark:latest
1+
FROM centos:7
22

3-
MAINTAINER Justin Polchlopek, jpolchlopek@azavea.com
3+
MAINTAINER Justin Polchlopek <jpolchlopek@azavea.com>
44

5-
RUN yum -y update && \
6-
yum -y install scl-utils && \
7-
rpm -Uvh https://www.softwarecollections.org/en/scls/rhscl/python33/epel-7-x86_64/download/rhscl-python33-epel-7-x86_64.noarch.rpm && \
8-
yum -y install python33 && \
9-
yum -y install gcc gcc-fortran gcc-c++ && \
10-
yum -y install blas-devel lapack-devel
5+
ENV SPARK_HOME /usr/local/spark-2.0.0-bin-hadoop2.7
116

12-
RUN scl enable python33 'easy_install pip'
13-
RUN scl enable python33 'pip install numpy'
14-
RUN scl enable python33 'pip install scipy'
15-
RUN scl enable python33 'pip install pandas'
16-
RUN scl enable python33 'pip install jupyter'
17-
RUN scl enable python33 'pip install --upgrade ipython[all]'
7+
ADD spark-2.0.0-bin-hadoop2.7.tgz /usr/local
8+
ADD scripts/*.sh /scripts/
9+
COPY toree-0.2.0.dev1.tar.gz /tmp
10+
COPY geotrellis-uberjar-assembly-1.0.0-RC1.jar /tmp
1811

19-
RUN alternatives --set java /usr/java/jdk1.8.0_45/jre/bin/java
20-
RUN yum -y install git
21-
RUN curl https://bintray.com/sbt/rpm/rpm | tee /etc/yum.repos.d/bintray-sbt-rpm.repo
22-
RUN yum -y install sbt
23-
RUN git clone https://github.com/apache/incubator-toree.git
24-
RUN pushd incubator-toree && \
25-
APACHE_SPARK_VERSION=2.0.0 make build && \
26-
APACHE_SPARK_VERSION=2.0.0 make dist && \
27-
mkdir -p dist/toree-pip && \
28-
cp -r dist/toree dist/toree-pip && \
29-
cp dist/toree/LICENSE dist/toree-pip/LICENSE && \
30-
cp dist/toree/NOTICE dist/toree-pip/NOTICE && \
31-
cp dist/toree/DISCLAIMER dist/toree-pip/DISCLAIMER && \
32-
cp dist/toree/VERSION dist/toree-pip/VERSION && \
33-
cp dist/toree/RELEASE_NOTES.md dist/toree-pip/RELEASE_NOTES.md && \
34-
cp -R dist/toree/licenses dist/toree-pip/licenses && \
35-
cp -rf etc/pip_install/* dist/toree-pip/. && \
36-
printf "__version__ = '$(grep BASE_VERSION= Makefile | sed -e 's/BASE_VERSION=\(.*\)/\1/')'\n" >> dist/toree-pip/toree/_version.py && \
37-
printf "__commit__ = '$(git rev-parse --short=12 --verify HEAD)'\n" >> dist/toree-pip/toree/_version.py && \
38-
pushd dist/toree-pip/ && \
39-
scl enable python33 'python setup.py sdist --dist-dir=.' && \
40-
scl enable python33 'pip install $(ls toree-*.tar.gz) && jupyter toree install' && \
41-
popd && \
42-
rm -rf dist/toree-pip/ && \
43-
popd
12+
RUN /scripts/build.sh
4413

45-
EXPOSE 7001
46-
EXPOSE 7002
47-
EXPOSE 7003
48-
EXPOSE 7004
49-
EXPOSE 7005
50-
EXPOSE 7006
51-
EXPOSE 7077
52-
EXPOSE 7777
53-
EXPOSE 6066
14+
EXPOSE 8000
5415

55-
CMD scl enable python33 'PYSPARK_PYTHON=python3 PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=7777" pyspark --packages com.databricks:spark-csv_2.10:1.1.0 --executor-memory 6400M --driver-memory 6400M'
16+
CMD scl enable python33 'jupyterhub --no-ssl'

Makefile

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,45 @@
1+
TOREE_VERSION := 9b577f19df83dab95419b781de710ea2a393202a
2+
SHA := $(shell echo ${TOREE_VERSION} | sed 's,\(.......\).*,\1,')
13
BASE := $(subst -, ,$(notdir ${CURDIR}))
24
ORG := $(word 1, ${BASE})
35
REPO := $(word 2, ${BASE})-$(word 3, ${BASE})
46
IMG := quay.io/${ORG}/${REPO}
57

6-
build:
7-
docker build -t ${IMG}:latest .
8+
9+
.PHONY all: build
10+
11+
archives/${TOREE_VERSION}.zip:
12+
(cd archives ; curl -L -O "https://github.com/apache/incubator-toree/archive/${TOREE_VERSION}.zip")
13+
14+
spark-2.0.0-bin-hadoop2.7.tgz:
15+
curl -L -O "http://d3kbcqa49mib13.cloudfront.net/spark-2.0.0-bin-hadoop2.7.tgz"
16+
17+
incubator-toree-${TOREE_VERSION}: archives/${TOREE_VERSION}.zip
18+
rm -rf $@
19+
unzip $<
20+
21+
incubator-toree-${TOREE_VERSION}/dist/toree-pip/toree-0.2.0.dev1.tar.gz: incubator-toree-${TOREE_VERSION}
22+
make -C $< release
23+
24+
toree-0.2.0.dev1.tar.gz: incubator-toree-${TOREE_VERSION}/dist/toree-pip/toree-0.2.0.dev1.tar.gz
25+
cp $< $@
26+
27+
geotrellis-uberjar-assembly-1.0.0-RC1.jar: geotrellis-uberjar/build.sbt
28+
(cd geotrellis-uberjar ; ./sbt "assembly")
29+
cp geotrellis-uberjar/target/scala-2.11/geotrellis-uberjar-assembly-1.0.0-RC1.jar $@
30+
31+
build: toree-0.2.0.dev1.tar.gz spark-2.0.0-bin-hadoop2.7.tgz geotrellis-uberjar-assembly-1.0.0-RC1.jar
32+
docker build -t ${IMG}:${SHA} .
833

934
publish: build
10-
docker push ${IMG}:latest
11-
if [ "${TAG}" != "" -a "${TAG}" != "latest" ]; then docker tag ${IMG}:latest ${IMG}:${TAG} && docker push ${IMG}:${TAG}; fi
35+
docker push ${IMG}:${SHA}
36+
if [ "${TAG}" != "" -a "${TAG}" != "${SHA}" ]; then docker tag ${IMG}:${SHA} ${IMG}:${TAG} && docker push ${IMG}:${TAG}; fi
1237

1338
# test: build
1439
# docker-compose up -d
1540
# docker-compose run --rm hdfs-name bash -c "set -e \
16-
# && source /sbin/hdfs-lib.sh \
17-
# && wait_until_hdfs_is_available \
18-
# && hdfs dfs -touchz /live-check \
19-
# && hdfs dfs -ls /live-check"
41+
# # && source /sbin/hdfs-lib.sh \
42+
# # && wait_until_hdfs_is_available \
43+
# # && hdfs dfs -touchz /live-check \
44+
# # && hdfs dfs -ls /live-check"
2045
# docker-compose down

README.md

Lines changed: 77 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,81 @@
1-
GeoDocker GeoTrellis Jupyter Notebook
2-
=====================================
1+
# GeoDocker GeoTrellis Jupyter Notebook #
32

43
A Docker container to provide a Jupyter notebook instance with GeoTrellis functionality to [GeoDocker](https://github.com/geodocker/geodocker).
54

6-
Configuration and Usage
7-
-----------------------
5+
## Configuring and Starting ##
86

9-
Start this container using the `--net=host` option and point a browser on the host machine to http://localhost:7777 to enter the interface. Select `Apache Toree - Scala` from the New dropdown menu and start typing Scala commands. Note that `sc` is pre-initialized to a live `SparkContext` instance.
7+
### Toy Settings ###
8+
9+
Starting this image in toy settings (e.g. with a "cluster" confined entirely to one physical computer) is very easy.
10+
11+
#### Self-Contained ####
12+
13+
To use the image with a self-contained local master, type
14+
15+
```bash
16+
docker run -it --rm -p 8000:8000 quay.io/geodocker/geotrellis-jupyter:9b577f1
17+
```
18+
19+
After a few moments, the server should be available at [`localhost:8000`](http://localhost:8000).
20+
21+
#### With GeoDocker ####
22+
23+
First ensure that the `docker-compose` command is installed and working.
24+
With that command present, simply navigate into a directory containing the appropriate [`docker-compose.yml`](docker-compose.yml) file and bring the "cluster" up
25+
26+
```bash
27+
cd ~/local/src/geodocker-geotrellis-jupyter
28+
docker-compose up
29+
```
30+
31+
As before, the server should be available at [`localhost:8000`](http://localhost:8000).
32+
33+
### Serious Settings ###
34+
35+
The two most immediate issues with using this image in a more serious setting (with a real cluster) are
36+
- properly configuring the Spark master, and
37+
- enabling SSL.
38+
39+
To use the image with a YARN master, the appropriate configuration files must be copied to the image
40+
(the precise details of how to do that are left as an exercise to the reader).
41+
Once everything is set up so that it is possible to run jobs with a YARN master from within the container, Toree must be reinstalled with the appropriate settings.
42+
The command to do that might look something like this
43+
44+
```bash
45+
scl enable python33 'jupyter toree install --spark_opts="--master yarn --jars file:///tmp/geotrellis-uberjar-assembly-1.0.0-RC1.jar"'
46+
```
47+
48+
To use the image with Spark in stand-alone mode, Toree must be reinstalled with the appropriate settings.
49+
The command to do that might look something like this
50+
51+
```bash
52+
scl enable python33 'jupyter toree install --spark_opts="--master spark://10.0.1.3:7077 --jars file:///tmp/geotrellis-uberjar-assembly-1.0.0-RC1.jar"'
53+
```
54+
55+
In stand-alone mode, the version of Spark in the image (currently 2.0.0) must match the version installed on the cluster.
56+
If that is not true, then it will be necessary to create a new image
57+
(either derived from this one [in the docker-sense] or built from a fork of this source distribution)
58+
with the appropriate version of Spark installed.
59+
60+
To run `jupyterhub` with SSL enabled, the [JupyterHub documentation](https://github.com/jupyterhub/jupyterhub) suggests something like this
61+
62+
```bash
63+
jupyterhub --ip 10.0.1.2 --port 443 --ssl-key my_ssl.key --ssl-cert my_ssl.cert
64+
```
65+
66+
Please see the JupyterHub documentation for more detailed discussion.
67+
The steps/suggestions given here are probably necessary but almost certainly not sufficient to produce a working setup.
68+
69+
The [`geodocker.sh`](scripts/geodocker.sh) script is an example of a script which reinstalls Toree and then launches JupyterHub.
70+
For serious usage, it will probably be necessary to create another Docker image derived from this one.
71+
That image should contain site-specific configuration files and a script similar to `scripts/geodocker.sh` with the appropriate configuration and launch commands encapsulated within.
72+
73+
## Usage ##
74+
75+
The default username and password are both `jack`.
76+
The default account is suitable for local use,
77+
but if the image is going to be used in a more serious setting, be sure to disable that account and enable some other login mechanism.
78+
79+
To make use of GeoTrellis, create a new "Apache Toree - Scala" notebook (or use an existing one).
80+
81+
![screenshot from 2016-11-19 15 45 07](https://cloud.githubusercontent.com/assets/11281373/20458321/b14c04e8-ae6f-11e6-8edf-467121f72d91.png)

archives/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*
2+
*/
3+
!.gitignore

docker-compose.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
version: '2'
2+
services:
3+
spark-master:
4+
image: quay.io/geodocker/spark:8fd1a50
5+
command: master
6+
ports:
7+
- 4040:4040
8+
- 8080:8080
9+
spark-worker:
10+
image: quay.io/geodocker/spark:8fd1a50
11+
command: worker
12+
environment:
13+
SPARK_MASTER: spark-master
14+
depends_on:
15+
- spark-master
16+
jupyter:
17+
image: quay.io/geodocker/geotrellis-jupyter:9b577f1
18+
command: /scripts/geodocker.sh
19+
ports:
20+
- 8000:8000
21+
depends_on:
22+
- spark-master

geotrellis-uberjar/build.sbt

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import sbtassembly.PathList
2+
3+
// val generalDeps = Seq("org.apache.hadoop" % "hadoop-client" % "2.6.2")
4+
5+
lazy val commonSettings = Seq(
6+
organization := "org.locationtech.geotrellis",
7+
version := Version.geotrellis,
8+
scalaVersion := "2.11.8",
9+
test in assembly := {},
10+
assemblyMergeStrategy in assembly := {
11+
case "log4j.properties" => MergeStrategy.first
12+
case "reference.conf" => MergeStrategy.concat
13+
case "application.conf" => MergeStrategy.concat
14+
case PathList("META-INF", xs @ _*) =>
15+
xs match {
16+
case ("MANIFEST.MF" :: Nil) => MergeStrategy.discard
17+
// Concatenate everything in the services directory to keep GeoTools happy.
18+
case ("services" :: _ :: Nil) =>
19+
MergeStrategy.concat
20+
// Concatenate these to keep JAI happy.
21+
case ("javax.media.jai.registryFile.jai" :: Nil) | ("registryFile.jai" :: Nil) | ("registryFile.jaiext" :: Nil) =>
22+
MergeStrategy.concat
23+
case (name :: Nil) => {
24+
// Must exclude META-INF/*.([RD]SA|SF) to avoid "Invalid signature file digest for Manifest main attributes" exception.
25+
if (name.endsWith(".RSA") || name.endsWith(".DSA") || name.endsWith(".SF"))
26+
MergeStrategy.discard
27+
else
28+
MergeStrategy.first
29+
}
30+
case _ => MergeStrategy.first
31+
}
32+
case _ => MergeStrategy.first
33+
},
34+
shellPrompt := { s => Project.extract(s).currentProject.id + " > " },
35+
resolvers += "LocationTech GeoTrellis Releases" at "https://repo.locationtech.org/content/repositories/geotrellis-releases",
36+
libraryDependencies ++= Seq(
37+
"org.locationtech.geotrellis" %% "geotrellis-accumulo" % Version.geotrellis,
38+
"org.locationtech.geotrellis" %% "geotrellis-cassandra" % Version.geotrellis,
39+
"org.locationtech.geotrellis" %% "geotrellis-geotools" % Version.geotrellis,
40+
"org.locationtech.geotrellis" %% "geotrellis-hbase" % Version.geotrellis,
41+
"org.locationtech.geotrellis" %% "geotrellis-proj4" % Version.geotrellis,
42+
"org.locationtech.geotrellis" %% "geotrellis-raster" % Version.geotrellis,
43+
"org.locationtech.geotrellis" %% "geotrellis-s3" % Version.geotrellis,
44+
"org.locationtech.geotrellis" %% "geotrellis-shapefile" % Version.geotrellis,
45+
"org.locationtech.geotrellis" %% "geotrellis-slick" % Version.geotrellis,
46+
"org.locationtech.geotrellis" %% "geotrellis-spark-etl" % Version.geotrellis,
47+
"org.locationtech.geotrellis" %% "geotrellis-spark" % Version.geotrellis,
48+
"org.locationtech.geotrellis" %% "geotrellis-util" % Version.geotrellis,
49+
"org.locationtech.geotrellis" %% "geotrellis-vectortile" % Version.geotrellis,
50+
"org.locationtech.geotrellis" %% "geotrellis-vector" % Version.geotrellis,
51+
"org.apache.hadoop" % "hadoop-client" % Version.hadoop % "provided",
52+
"org.apache.spark" %% "spark-core" % Version.spark % "provided"
53+
)
54+
)
55+
56+
lazy val uberjar = Project("geotrellis-uberjar", file("."))
57+
.settings(commonSettings: _*)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import sbt._
2+
import scala.util.Properties
3+
4+
object Version {
5+
def either(environmentVariable: String, default: String): String = Properties.envOrElse(environmentVariable, default)
6+
7+
lazy val hadoop = either("SPARK_HADOOP_VERSION", "2.6.2")
8+
lazy val spark = either("SPARK_VERSION", "2.0.0")
9+
lazy val geotrellis = "1.0.0-RC1"
10+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1")

0 commit comments

Comments
 (0)