#!/bin/bash
echo "You can use the -n argument to skip the s3 download if you've already done it once"
echo "Files are unzipped to ../../h2o-downloaded"
# This is critical:
# Ensure that all your children are truly dead when you yourself are killed.
# trap "kill -- -$BASHPID" INT TERM EXIT
# leave out EXIT for now
trap "kill -- -$BASHPID" INT TERM
echo "BASHPID: $BASHPID"
echo "current PID: $$"
source ./runner_setup.sh "$@"
echo "Do we have to clean out old ice_root dirs somewhere?"
echo "Setting up sandbox, since no cloud build here will clear it out! (unlike other runners)"
rm -fr sandbox
mkdir -p sandbox
# Should we do this cloud build with sh2junit.py, to get logging, xml etc.?
# I suppose we could just have a test verify the requested cloud size after building
# (a minimal size check is sketched below, after find_cloud.py runs).
MAPR_JOBTRACKER=192.168.1.173:9001
MAPR_NODES=3
MAPR_HEAP=20g
MAPR_JAR=h2odriver_mapr2.1.3.jar
H2O_DOWNLOADED=../../h2o-downloaded
H2O_HADOOP=$H2O_DOWNLOADED/hadoop
H2O_JAR=h2o.jar
HDFS_OUTPUT=hdfsOutputDirName
# created in HDFS by the h2o-on-hadoop h2odriver*.jar (removed below before each run)
REMOTE_HOME=/home/0xcustomer
REMOTE_IP=192.168.1.173
REMOTE_USER=0xcustomer@$REMOTE_IP
REMOTE_SCP="scp -i $HOME/.0xcustomer/0xcustomer_id_rsa"
REMOTE_SSH_USER="ssh -i $HOME/.0xcustomer/0xcustomer_id_rsa $REMOTE_USER"
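# REMOTE_SCP and REMOTE_SSH_USER wrap scp/ssh with the 0xcustomer key; the copies to and
# commands on $REMOTE_IP below all go through them.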
source ./kill_hadoop_jobs.sh
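# (kill_hadoop_jobs.sh presumably clears any hadoop jobs left over from a previous run,
# so this run starts against a clean jobtracker.)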
#*****HERE'S WHERE WE START H2O ON HADOOP******************************************
rm -f /tmp/h2o_on_hadoop_$REMOTE_IP.sh
echo "cd /home/0xcustomer" > /tmp/h2o_on_hadoop_$REMOTE_IP.sh
echo "rm -fr h2o_one_node" >> /tmp/h2o_on_hadoop_$REMOTE_IP.sh
set +e
# remember to update this, to match whatever user kicks off the h2o on hadoop
echo "hadoop dfs -rmr /user/0xcustomer/$HDFS_OUTPUT" >> /tmp/h2o_on_hadoop_$REMOTE_IP.sh
set -e
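# Note: the set +e / set -e pair above only brackets the local echo that builds the script;
# the rmr itself runs remotely later, where a failure (e.g. no output dir yet) won't stop
# the generated script, since that script isn't run under set -e.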
echo "hadoop jar $MAPR_JAR water.hadoop.h2odriver -jt $MAPR_JOBTRACKER -libjars $H2O_JAR -mapperXmx $MAPR_HEAP -nodes $MAPR_NODES -output $HDFS_OUTPUT -notify h2o_one_node " >> /tmp/h2o_on_hadoop_$REMOTE_IP.sh
# copy the script, just so we have it there too
$REMOTE_SCP /tmp/h2o_on_hadoop_$REMOTE_IP.sh $REMOTE_USER:$REMOTE_HOME
# have to copy the downloaded h2o stuff over to the remote machine ($REMOTE_IP) to execute with the ssh
# it needs the right hadoop client setup. This is easier than installing hadoop client stuff here.
# do the jars last, so we can see the script without waiting for the copy
echo "scp some jars"
$REMOTE_SCP $H2O_HADOOP/$MAPR_JAR $REMOTE_USER:$REMOTE_HOME
$REMOTE_SCP $H2O_DOWNLOADED/$H2O_JAR $REMOTE_USER:$REMOTE_HOME
# exchange keys so jenkins can do this?
# background!
cat /tmp/h2o_on_hadoop_$REMOTE_IP.sh
cat /tmp/h2o_on_hadoop_$REMOTE_IP.sh | $REMOTE_SSH_USER &
#*********************************************************************************
CLOUD_PID=$!
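# $! is the PID of the backgrounded pipeline's ssh; it's what we kill at the end if the
# hadoop shutdown doesn't take the driver down on its own.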
jobs -l
source ./wait_for_h2o_on_hadoop.sh
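# (wait_for_h2o_on_hadoop.sh presumably blocks until the -notify file appears and lands a
# local copy of h2o_one_node for the loop below to parse.)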
# use these args when we do Runit
while IFS=';' read -r CLOUD_IP CLOUD_PORT
do
    echo "$CLOUD_IP, $CLOUD_PORT"
done < h2o_one_node
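# Since the redirect is on 'done', the loop above ran in this shell, so CLOUD_IP/CLOUD_PORT
# still hold the last line's values (handy for the sanity check a few lines down).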
rm -fr h2o-nodes.json
# NOTE: keep this hdfs info in sync with the json used to build the cloud above
../find_cloud.py -f h2o_one_node -hdfs_version mapr3.0.1 -hdfs_name_node 192.168.1.161 -expected_size $MAPR_NODES
echo "h2o-nodes.json should now exist"
ls -ltr h2o-nodes.json
# cp it to sandbox? not sure if anything uses it there, for this setup
cp -f h2o-nodes.json sandbox
cp -f h2o_one_node sandbox
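# Hedged, optional sanity check of the cloud size mused about up top. Assumptions: the
# h2o-2 REST API serves /Cloud.json with a "cloud_size" field, and CLOUD_IP/CLOUD_PORT
# still hold the last node parsed from h2o_one_node. Non-fatal on purpose; the tests
# below do their own cloud-stabilize.
curl -s "http://$CLOUD_IP:$CLOUD_PORT/Cloud.json" | grep -o '"cloud_size": *[0-9]*' || true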
#***********************************************************************************
echo "Touch all the 0xcustomer-datasets mnt points, to get autofs to mount them."
echo "Permission rights extend to the top level now, so only 0xcustomer can automount them"
echo "okay to ls the top level here...no secret info..do all the machines hadoop (cdh3) might be using"
for mr in 171 172 173 174 175 176 177 178 179 180
do
ssh -i $HOME/.0xcustomer/0xcustomer_id_rsa 0xcustomer@192.168.1.$mr 'cd /mnt/0xcustomer-datasets'
done
# We now have h2o-nodes.json, which means the jvms were started.
# Shouldn't need to wait for the h2o cloud here...
# the test should do the normal cloud-stabilize before it does anything.
# n0.doit uses nosetests so the xml gets created on completion. (n0.doit is a single-test thing)
# A little '|| true' hack to make sure we don't fail out if this subtest fails
# test_c1_rel has 1 subtest
# This could become a runner that loops through a list of tests (sketched after myPy below).
# belt and suspenders...for resolving bucket path names
export H2O_REMOTE_BUCKETS_ROOT=/home/0xcustomer
echo "If it exists, pytest_config-<username>.json in this dir will be used"
echo "i.e. pytest_config-jenkins.json"
echo "Used to run as 0xcust.., with multi-node targets (possibly)"
myPy() {
    DOIT=../testdir_single_jvm/n0.doit
    $DOIT "$1/$2" || true
    # try moving all the logs created by this test in sandbox to a subdir to isolate test failures
    # think of h2o.check_sandbox_for_errors()
    rm -f -r "sandbox/$1"
    mkdir -p "sandbox/$1"
    cp -f sandbox/*log "sandbox/$1"
    # rm -f sandbox/*log
}
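# Usage: myPy <subdir-label> <test-file.py>. Runs the test via n0.doit (never failing the
# script, because of the '|| true') and snapshots the sandbox logs into sandbox/<subdir-label>.
# The loop-style runner mused about above could look like this hypothetical, commented-out
# sketch (word-splitting of $t into the two args is intentional):
# for t in "c6 test_c6_maprfs.py" "c2 test_c2_rel.py" "shutdown test_shutdown.py"; do
#     myPy $t
# done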
# do first
myPy c6 test_c6_maprfs.py
# myPy c5 test_c5_KMeans_sphere15_180GB.py
# don't run this until we know whether 0xcustomer permissions also exist for the hadoop job
# myPy c1 test_c1_rel.py
myPy c2 test_c2_rel.py
# myPy c3 test_c3_rel.py
# myPy c4 test_c4_four_billion_rows.py
# If this one fails, fail this script so the bash dies
# We don't want to hang waiting for the cloud to terminate.
myPy shutdown test_shutdown.py
echo "Maybe it takes some time for hadoop to shut it down? sleep 10"
sleep 10
if ps -p $CLOUD_PID > /dev/null
then
echo "$CLOUD_PID is still running after shutdown. Will kill"
kill $CLOUD_PID
# may take a second?
sleep 1
fi
ps aux | grep h2odriver
jobs -l
echo ""
echo "The h2odriver job should be gone. It was pid $CLOUD_PID"
echo "The hadoop job(s) should be gone?"
$REMOTE_SSH_USER "hadoop job -list"
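# Hedged extra check: "hadoop job -list" output format varies across hadoop versions, so
# this is just a grep heuristic for anything H2O-looking still listed. Non-fatal.
$REMOTE_SSH_USER "hadoop job -list" | grep -i h2o && echo "WARNING: a hadoop job that looks like H2O is still listed" || true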