
Feature/bigdata #56


Merged
merged 9 commits on Dec 12, 2023
12 changes: 6 additions & 6 deletions bigdata/scripts/sparkhdfs/Dockerfile
@@ -15,14 +15,14 @@ RUN apt-get install -y curl unzip zip
RUN apt-get install -y wget

# Download Apache Hadoop
RUN wget https://downloads.apache.org/hadoop/core/hadoop-3.3.5/hadoop-3.3.5.tar.gz
RUN tar xvf hadoop-3.3.5.tar.gz
RUN mv hadoop-3.3.5 hadoop
RUN wget https://dlcdn.apache.org/hadoop/core/hadoop-3.3.6/hadoop-3.3.6.tar.gz
RUN tar xvf hadoop-3.3.6.tar.gz
RUN mv hadoop-3.3.6 hadoop

# Download Apache Spark
RUN wget https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz
RUN tar xvf spark-3.4.0-bin-hadoop3.tgz
RUN mv spark-3.4.0-bin-hadoop3 spark
RUN wget https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
RUN tar xvf spark-3.5.0-bin-hadoop3.tgz
RUN mv spark-3.5.0-bin-hadoop3 spark

# Final stage
FROM ubuntu:18.04
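Note: the launcher scripts below expect a Singularity image named sparkhdfs.sif built from this Dockerfile. A minimal build sketch, assuming Docker is available on the build host and that the local tag sparkhdfs:latest is free to use (both the tag and the build path are illustrative, not part of this PR):

# Build the Docker image from this Dockerfile (tag name is an assumption)
docker build -t sparkhdfs:latest bigdata/scripts/sparkhdfs/
# Convert the local Docker image into the sparkhdfs.sif expected by runSparkHDFS.sh
singularity build sparkhdfs.sif docker-daemon://sparkhdfs:latest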
7 changes: 7 additions & 0 deletions bigdata/scripts/sparkhdfs/create_wordcount_input_file.sh
@@ -0,0 +1,7 @@
#!/usr/bin/bash -l


for repl in {1..10}
do
echo " a b c d e f g h i j k l m n o p q r s t t u v w x y z" >> input.xml1
done
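The generated input.xml1 is intended as input for a word-count run against the HDFS instance started by runSparkHDFS.sh. A hedged sketch of uploading it, assuming it is run on the coordinator node while the shinst instance from the launcher script is up, and that /opt/hadoop matches HADOOP_HOME inside the image (the /user/$USER target directory is an assumption):

# Create a user directory in HDFS and upload the generated input file
singularity exec instance://shinst /opt/hadoop/bin/hdfs dfs -mkdir -p /user/$USER
singularity exec instance://shinst /opt/hadoop/bin/hdfs dfs -put input.xml1 /user/$USER/input.xml1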
63 changes: 37 additions & 26 deletions bigdata/scripts/sparkhdfs/runSparkHDFS.sh
@@ -1,31 +1,42 @@
#!/bin/bash -l
#SBATCH -J SparkHDFS
#SBATCH -N 3 # Nodes
#SBATCH -n 3 # Tasks
#!/usr/bin/bash -l
#SBATCH --job-name=SparkHDFS
#SBATCH --nodes=3
#SBATCH --ntasks=3
#SBATCH --ntasks-per-node=1
#SBATCH --mem=16GB
#SBATCH -c 16 # Cores assigned to each task
#SBATCH --mem-per-cpu=2GB
#SBATCH --cpus-per-task=4
#SBATCH --time=0-00:59:00
#SBATCH -p batch
#SBATCH --partition=batch
#SBATCH --qos=normal
#SBATCH --mail-user=first.lastname@uni.lu
#SBATCH --mail-type=BEGIN,END

##SBATCH --exclusive

module load tools/Singularity

hostName="`hostname`"
echo "hostname=$hostName"

#save it for future job refs
myhostname="`hostname`"
rm coordinatorNode
rm -f coordinatorNode
touch coordinatorNode
cat > coordinatorNode << EOF
$myhostname
EOF

###

# Ensure that logging and work directories exist
mkdir -p ${HOME}/sparkhdfs/hadoop/logs
mkdir -p ${HOME}/sparkhdfs/hadoop/etc/hadoop
mkdir -p ${HOME}/sparkhdfs/spark/logs
mkdir -p ${HOME}/sparkhdfs/spark/conf
mkdir -p ${HOME}/sparkhdfs/spark/work

#create Spark configs
SPARK_CONF=${HOME}/spark/conf/spark-defaults.conf
SPARK_CONF=${HOME}/sparkhdfs/spark/conf/spark-defaults.conf
cat > ${SPARK_CONF} << EOF

# Master settings
@@ -47,7 +58,7 @@ spark.logConf true

EOF

SPARK_ENVSH=${HOME}/spark/conf/spark-env.sh
SPARK_ENVSH=${HOME}/sparkhdfs/spark/conf/spark-env.sh
cat > ${SPARK_ENVSH} << EOF
#!/usr/bin/env bash

@@ -58,7 +69,7 @@ HADOOP_HOME="/opt/hadoop"

EOF

SPARK_L4J=${HOME}/spark/conf/log4j.properties
SPARK_L4J=${HOME}/sparkhdfs/spark/conf/log4j.properties
cat > ${SPARK_L4J} << EOF
# Set everything to be logged to the console
log4j.rootCategory=DEBUG, console
@@ -75,7 +86,7 @@ log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
EOF

### create HDFS config
HDFS_SITE=${HOME}/hadoop/etc/hadoop/hdfs-site.xml
HDFS_SITE=${HOME}/sparkhdfs/hadoop/etc/hadoop/hdfs-site.xml
cat > ${HDFS_SITE} << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
@@ -100,7 +111,7 @@ cat > ${HDFS_SITE} << EOF

EOF

HDFS_CORESITE=${HOME}/hadoop/etc/hadoop/core-site.xml
HDFS_CORESITE=${HOME}/sparkhdfs/hadoop/etc/hadoop/core-site.xml
cat > ${HDFS_CORESITE} << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
@@ -120,7 +131,7 @@ EOF
# Create a launcher script for SparkMaster and hdfsNamenode
#Once started, the Spark master will print out a spark://HOST:PORT to be used for submitting jobs

SPARKM_LAUNCHER=${HOME}/spark-start-master-${SLURM_JOBID}.sh
SPARKM_LAUNCHER=${HOME}/sparkhdfs/spark-start-master-${SLURM_JOBID}.sh
echo " - create SparkMaster and hdfsNamenode launcher script '${SPARKM_LAUNCHER}'"
cat << 'EOF' > ${SPARKM_LAUNCHER}
#!/bin/bash
@@ -129,30 +140,30 @@ echo "I am ${SLURM_PROCID} running on:"
hostname

#we are going to share an instance for Spark master and HDFS namenode
singularity instance start --bind $HOME/hadoop/logs:/opt/hadoop/logs,$HOME/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work \
singularity instance start --bind $HOME/sparkhdfs/hadoop/logs:/opt/hadoop/logs,$HOME/sparkhdfs/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME \
sparkhdfs.sif shinst

singularity run --bind $HOME/hadoop/logs:/opt/hadoop/logs,$HOME/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop instance://shinst \
singularity run --bind $HOME/sparkhdfs/hadoop/logs:/opt/hadoop/logs,$HOME/sparkhdfs/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/sparkhdfs:$HOME instance://shinst \
sparkHDFSNamenode 2>&1 &

singularity run --bind $HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work instance://shinst \
singularity run --bind $HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME instance://shinst \
sparkMaster

#the following example works for running without instance only the Spark Master
#singularity run --bind $HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work sparkhdfs.sif \
#singularity run --bind $HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME sparkhdfs.sif \
# sparkMaster

EOF
chmod +x ${SPARKM_LAUNCHER}

srun --exclusive -N 1 -n 1 -c 16 --ntasks-per-node=1 -l -o $HOME/SparkMaster-`hostname`.out \
srun --exclusive --nodes=1 --ntasks=1 --ntasks-per-node=1 --cpus-per-task=4 --label --output=$HOME/sparkhdfs/SparkMaster-`hostname`.out \
${SPARKM_LAUNCHER} &

export SPARKMASTER="spark://$hostName:7078"

echo "Starting Spark workers and HDFS datanodes"

SPARK_LAUNCHER=${HOME}/spark-start-workers-${SLURM_JOBID}.sh
SPARK_LAUNCHER=${HOME}/sparkhdfs/spark-start-workers-${SLURM_JOBID}.sh
echo " - create Spark workers and HDFS datanodes launcher script '${SPARK_LAUNCHER}'"
cat << 'EOF' > ${SPARK_LAUNCHER}
#!/bin/bash
@@ -161,31 +172,31 @@ echo "I am ${SLURM_PROCID} running on:"
hostname

#we are going to share an instance for Spark workers and HDFS datanodes
singularity instance start --bind $HOME/hadoop/logs:/opt/hadoop/logs,$HOME/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work \
singularity instance start --bind $HOME/sparkhdfs/hadoop/logs:/opt/hadoop/logs,$HOME/sparkhdfs/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME \
sparkhdfs.sif shinst

singularity run --bind $HOME/hadoop/logs:/opt/hadoop/logs,$HOME/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop instance://shinst \
singularity run --bind $HOME/sparkhdfs/hadoop/logs:/opt/hadoop/logs,$HOME/sparkhdfs/hadoop/etc/hadoop:/opt/hadoop/etc/hadoop,$HOME/sparkhdfs:$HOME instance://shinst \
sparkHDFSDatanode 2>&1 &

singularity run --bind $HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work instance://shinst \
singularity run --bind $HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME instance://shinst \
sparkWorker $SPARKMASTER -c 8 -m 12G


#the following without instance only Spark worker
#singularity run --bind $HOME/spark/conf:/opt/spark/conf,$HOME/spark/logs:/opt/spark/logs,$HOME/spark/work:/opt/spark/work sparkhdfs.sif \
#singularity run --bind $HOME/sparkhdfs/spark/conf:/opt/spark/conf,$HOME/sparkhdfs/spark/logs:/opt/spark/logs,$HOME/sparkhdfs/spark/work:/opt/spark/work,$HOME/sparkhdfs:$HOME sparkhdfs.sif \
# sparkWorker $SPARKMASTER -c 8 -m 8G

EOF
chmod +x ${SPARK_LAUNCHER}

srun --exclusive -N 2 -n 2 -c 16 --ntasks-per-node=1 -l -o $HOME/SparkWorkers-`hostname`.out \
srun --exclusive --nodes=2 --ntasks=2 --ntasks-per-node=1 --cpus-per-task=4 --label --output=$HOME/sparkhdfs/SparkWorkers-`hostname`.out \
${SPARK_LAUNCHER} &

pid=$!
sleep 3600s
wait $pid

echo $HOME
echo $HOME/sparkhdfs

echo "Ready Stopping SparkHDFS instances"