move_pump_runs2opendata_bucket.sh
#!/usr/bin/env bash
set -exo pipefail
#assumes the AWS Open Data credentials are available and are being used for these operations
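#usage: move_pump_runs2opendata_bucket.sh [aws_profile] [human|mouse]
#loops forever: copies finished pump runs from the source bucket to the Open Data
#target bucket, verifies each copy, then removes the verified runs from the source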
export LC_ALL=C
export LOG_DIR=../transfer_logs
mkdir -p $LOG_DIR
export threads=10
#pause between polling cycles, in seconds
SLEEP_TIME=60
#source bucket
SB="s3://monorail-batch"
#target bucket
TB="s3://recount-opendata"
awsprofile=$1
if [[ -z $awsprofile ]]; then
awsprofile="default"
fi
#human or mouse
org0=$2
if [[ -z $org0 ]]; then
org0="human"
fi
pSB=$SB/pump-outputs/$org0
pTB=$TB/recount3expansion/pump/$org0
pDONES="s3://monorail-batch/PUMP_DONES"
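#each .DONE marker file under PUMP_DONES contains the S3 path of a finished pump run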
#REFACTORED to do parallel copies/removals
while true; do
date=$(date +%Y%m%d_%H%M%S) #timestamp used in this cycle's log filenames
#remove any previous done files to avoid confusion about what's already been copied
rm -f ./*.DONE
#get latest set of finished samples (from pump)
aws s3 --profile $awsprofile cp --request-payer requester --recursive $pDONES/ ./
echo -n "" > $LOG_DIR/all_copies.${date}
echo -n "" > copy_finished_pump_runs2opendata.jobs
#for each sample (.DONE file):
for f in *.DONE; do
[[ -e "$f" ]] || continue
#get the full S3 source path to copy the pump results from
s3path=$(cat $f)
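#the destination/log suffix is the last four components of the source path (lo/study/lo2/sdate), kept in path order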
suffix=$(echo "$s3path" | perl -ne 'chomp; $f=$_; @f=split(/\//,$f,-1); $sdate=pop(@f); $lo2=pop(@f); $study=pop(@f); $lo=pop(@f); print join("/",($lo,$study,$lo2,$sdate))."\n";')
#use this for local logging purposes
logd=$LOG_DIR/$suffix
mkdir -p $logd
#queue a sync of the full pump results directory to the target bucket
echo "/usr/bin/time -v aws s3 --profile $awsprofile sync --request-payer requester $s3path/ $pTB/$suffix/ > $logd/s3copy 2>&1" >> copy_finished_pump_runs2opendata.jobs
echo "$f|$s3path|$suffix" >> $LOG_DIR/all_copies.${date}
done
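#run the queued sync jobs via GNU parallel, $threads at a time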
/usr/bin/time -v parallel -j$threads < copy_finished_pump_runs2opendata.jobs > copy_finished_pump_runs2opendata.jobs.run${threads} 2>&1
echo -n "" > $LOG_DIR/bad_copies.${date}
echo -n "" > $LOG_DIR/all_removals.${date}
echo -n "" > remove_finished_pump_runs2opendata.jobs
while IFS= read -r copyline; do
f=$(echo "$copyline" | cut -d'|' -f 1)
s3path=$(echo "$copyline" | cut -d'|' -f 2)
suffix=$(echo "$copyline" | cut -d'|' -f 3)
logd=$LOG_DIR/$suffix
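#/usr/bin/time -v prints "Exit status: N"; a match on 0 means the sync exited cleanly
#(relax -e/pipefail so a non-matching grep doesn't abort the loop)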
set +eo pipefail
good=$(fgrep "Exit status: 0" $logd/s3copy)
set -eo pipefail
if [[ -n $good ]]; then
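#copy succeeded: record the DONE marker under the target bucket's DONES/ prefix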
/usr/bin/time -v aws s3 --profile $awsprofile cp --request-payer requester $pDONES/$f $pTB/DONES/ > $logd/s3copy.done 2>&1
#then remove path from source bucket
echo "/usr/bin/time -v aws s3 --profile $awsprofile rm --request-payer requester --recursive $s3path/ > $logd/s3rm 2>&1" >> remove_finished_pump_runs2opendata.jobs
echo "$f|$s3path|$suffix" >> $LOG_DIR/all_removals.${date}
else
echo "$copyline" >> $LOG_DIR/bad_copies.${date}
fi
done < $LOG_DIR/all_copies.${date}
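#run the queued removal jobs via GNU parallel, $threads at a time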
/usr/bin/time -v parallel -j$threads < remove_finished_pump_runs2opendata.jobs > remove_finished_pump_runs2opendata.jobs.run${threads} 2>&1
echo -n "" > $LOG_DIR/bad_removals.${date}
while IFS= read -r copyline; do
f=$(echo "$copyline" | cut -d'|' -f 1)
s3path=$(echo "$copyline" | cut -d'|' -f 2)
suffix=$(echo "$copyline" | cut -d'|' -f 3)
logd=$LOG_DIR/$suffix
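#same check as above: "Exit status: 0" in the removal log means the rm completed cleanly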
set +eo pipefail
good=$(fgrep "Exit status: 0" $logd/s3rm)
set -eo pipefail
if [[ -n $good ]]; then
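#source path removed: delete the DONE marker so this sample isn't picked up again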
/usr/bin/time -v aws s3 --profile $awsprofile rm --request-payer requester $pDONES/$f > $logd/s3rm.done 2>&1
else
echo "$copyline" >> $LOG_DIR/bad_removals.${date}
fi
done < $LOG_DIR/all_removals.${date}
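#wait before polling for newly finished samples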
sleep $SLEEP_TIME
done