forked from aehrc/VariantSpark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: jvariant-spark
executable file
·80 lines (57 loc) · 2.21 KB
/
jvariant-spark
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
# Launcher for VariantSpark's Jupyter/pyspark integration (jvariant-spark).
set -e

# Resolve the script's parent directory (one level up from bin/).
# NOTE(review): this deliberately overwrites the shell-special PWD variable;
# the rest of the script reads ${PWD}, so the name is kept for compatibility —
# consider renaming to SCRIPT_HOME in a follow-up.
PWD=$(cd "$(dirname "$0")"/..; pwd)

# Print an error message to stderr and abort the script.
# Arguments: $1 - error message
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never (exits with status 1)
fatal_error () {
  echo "ERROR: $1" 1>&2
  exit 1
}
WITH_HAIL=NO

# Parse the command line: the only recognized flag enables Hail integration;
# anything else is silently ignored (matches historical behavior).
while [[ $# -gt 0 ]]; do
  case $1 in
    -wh|--with-hail)
      WITH_HAIL=YES
      shift # past argument
      ;;
    *) # unknown option — intentionally ignored
      shift # past argument
      ;;
  esac
done

# Default VARSPARK_HOME to the resolved script home when not already set.
if [[ -z "${VARSPARK_HOME}" ]]; then
  VARSPARK_HOME="${PWD}"
fi

# Locate the VariantSpark assembly jar via the bundled helper script.
VS_ASSEMBLY_JAR="$("${VARSPARK_HOME}/bin/find-varspark-jar")"

# pyspark must be on PATH before we can launch anything.
[[ $(type -P "pyspark") ]] || fatal_error "\`pyspark\` cannot be found. Please make sure it's on your PATH."

SPARK_VERSION=$(spark-submit --version 2>&1 | grep -m 1 version | awk -F 'version ' '{print $2}')

# Put the varspark Python sources on PYTHONPATH: prefer the packaged zips,
# fall back to the source tree plus zips when running from a checkout.
if [[ -f "${PWD}/python/lib/varspark-src.zip" ]]; then
  export PYTHONPATH=$(printf ":%s" "${PWD}"/python/lib/*.zip)
fi
if [[ -f "${PWD}/python/varspark/__init__.py" ]]; then
  export PYTHONPATH=${PWD}/python$(printf ":%s" "${PWD}"/python/lib/*.zip)
fi

# Run Jupyter Notebook as the pyspark driver process.
export PYSPARK_DRIVER_PYTHON='jupyter'
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'

if [[ "$WITH_HAIL" == "YES" ]]; then
  # BUG FIX: the original reused the "pyspark cannot be found" message here,
  # hiding the real problem (HAIL_HOME unset).
  [[ -n "${HAIL_HOME}" ]] || fatal_error "HAIL_HOME is not set. Please point it at your Hail installation."
  # The Hail jar lives in jars/ for a release install, build/libs/ for a
  # source build.
  if [[ -f "${HAIL_HOME}/jars/hail-all-spark.jar" ]]; then
    VS_HAIL_JAR="${HAIL_HOME}/jars/hail-all-spark.jar"
  elif [[ -f "${HAIL_HOME}/build/libs/hail-all-spark.jar" ]]; then
    VS_HAIL_JAR="${HAIL_HOME}/build/libs/hail-all-spark.jar"
  else
    # BUG FIX: the original called undefined `fatal`, which would fail with
    # "command not found" instead of printing this diagnostic.
    fatal_error "Cannot locate hail-all-spark.jar under ${HAIL_HOME}."
  fi
  export PYTHONPATH=$PYTHONPATH:"${HAIL_HOME}/python"
  echo "Hail jar: $VS_HAIL_JAR"
  pyspark \
    --driver-class-path "${VS_HAIL_JAR}" \
    --conf "spark.hadoop.io.compression.codecs=org.apache.hadoop.io.compress.DefaultCodec,is.hail.io.compress.BGzipCodec,org.apache.hadoop.io.compress.GzipCodec" \
    --conf "spark.sql.files.openCostInBytes=53687091200" \
    --conf "spark.sql.files.maxPartitionBytes=53687091200" \
    --conf "spark.executor.extraClassPath=${VS_HAIL_JAR}" \
    --conf "spark.sql.catalogImplementation=in-memory" \
    --jars "${VS_ASSEMBLY_JAR}"
else
  pyspark --jars "${VS_ASSEMBLY_JAR}" \
    --conf "spark.sql.catalogImplementation=in-memory"
fi