-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcoor2expr
More file actions
executable file
·195 lines (177 loc) · 7.9 KB
/
coor2expr
File metadata and controls
executable file
·195 lines (177 loc) · 7.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/bin/bash
#PBS -l nodes=1:ppn=4
OPTION=0
GENOME="mm9"
#### usage ####
usage() {
echo
echo Program: "coor2expr (determine normalized expression for a genomic coordinate)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: coor2expr -i <coor> -j <file> -k <float> [OPTIONS]"
echo "Options:"
echo " -i <coor> [input genomic coordinate (chr:start-end)]"
echo " -j <file> [input mapped reads in BAM format]"
echo " [if multiple seperate them by a comma]"
echo "[OPTIONS]"
echo " -k <float> [normalize expression by input size factor]"
echo " [if multiple seperate them by a comma]"
echo " -l <int> [normalize expression by input length]"
echo " -m [normalize expression by counts per million mapped reads]"
echo " -d [remove duplicate reads]"
echo " -e <int> [extend 3' end of reads by input number of bases (useful for ChIP-seq data)]"
echo " [if multiple seperate them by a comma]"
echo " -g <string> [genome (default: mm9)]"
echo " -p <int> [options to compute expression (default: 0)]"
echo " [0 -> total normalized expression]"
echo " [1 -> median normalized expression of block groups]"
echo " [2 -> median normalized expression of block]"
echo " [3 -> maximum normalized expression of block groups]"
echo " [4 -> maximum normalized expression of block]"
echo " -s [define block groups for reads from + and - strand separately]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:j:k:l:mde:g:p:sh ARG; do
case "$ARG" in
i) COOR=$OPTARG;;
j) BAMFILE=$OPTARG;;
k) SIZEFACTOR=$OPTARG;;
l) LENFACTOR=$OPTARG;;
m) CPM=1;;
d) REMOVE_DUPLICATE=1;;
e) EXTEND=$OPTARG;;
g) GENOME=$OPTARG;;
p) OPTION=$OPTARG;;
s) STRANDSPECIFIC=1;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$COOR" -o -z "$BAMFILE" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
## populating files based on input genome
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
else
echo "Presently the program only support analysis for mm9 or hg19"
echo
usage
fi
<<"COMMENT"
COMMENT
## parse input bam files in an array
IFS=","
BAMFILES=($BAMFILE)
BAMFILES_COUNT=${#BAMFILES[@]}
IFS=""
## initialize size factors, if both size factors and total reads not provided
if [ -z "$SIZEFACTOR" -a -z "$CPM" ]; then
SIZEFACTOR=""
for(( i=0; i<$BAMFILES_COUNT; i++ )); do
SIZEFACTOR="$SIZEFACTOR,1"
done
SIZEFACTOR=`echo $SIZEFACTOR | perl -ane '$_=~s/^\,//g; print $_;'`;
fi
## initialize extend parameter, if not provided
if [ -z "$EXTEND" ]; then
EXTEND=""
for(( i=0; i<$BAMFILES_COUNT; i++ )); do
EXTEND="$EXTEND,0"
done
EXTEND=`echo $EXTEND | perl -ane '$_=~s/^\,//g; print $_;'`;
fi
## parse input size factors in an array
IFS=","
SIZEFACTORS=($SIZEFACTOR)
SIZEFACTORS_COUNT=${#SIZEFACTORS[@]}
IFS=""
## parse input extensions in an array
IFS=","
EXTENDS=($EXTEND)
EXTENDS_COUNT=${#EXTENDS[@]}
IFS=""
if [ ! -z "$SIZEFACTOR" -a "$BAMFILES_COUNT" -ne "$SIZEFACTORS_COUNT" -o "$BAMFILES_COUNT" -ne "$EXTENDS_COUNT" ]; then
echo -n "Please provide size factor and extend parameter for each input bam file";
usage
fi
## start the analysis
if [ "$OPTION" -gt 0 -a "$OPTION" -le 4 ]; then
ID1=`echo $COOR | sed 's/[\:\-]/_/g'`;
ID2=`echo $BAMFILE | sed 's/^.*\///g'`;
ID=$ID1"_"$RANDOM"_"$ID2".tmp"
## retrieve normalized reads (CPM or RLE)
if [ ! -z "$CPM" ]; then
if [ ! -z "$REMOVE_DUPLICATE" ]; then
coor2reads -c $COOR -b $BAMFILE -m -d -e $EXTEND -g $GENOME | sort -k 6,6 > /tmp/$ID;
else
coor2reads -c $COOR -b $BAMFILE -m -e $EXTEND -g $GENOME | sort -k 6,6 > /tmp/$ID;
fi
else
if [ ! -z "$REMOVE_DUPLICATE" ]; then
coor2reads -c $COOR -b $BAMFILE -s $SIZEFACTOR -d -e $EXTEND -g $GENOME | sort -k 6,6 > /tmp/$ID;
else
coor2reads -c $COOR -b $BAMFILE -s $SIZEFACTOR -e $EXTEND -g $GENOME | sort -k 6,6 > /tmp/$ID;
fi
fi
## put strand to "+" for all reads, unless specified not to do so by -s parameter
if [ -z "$STRANDSPECIFIC" ]; then
perl -ane 'print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t+\n";' /tmp/$ID > /tmp/$ID.tmp
mv /tmp/$ID.tmp /tmp/$ID
fi
## define block groups or blocks using blockbuster
if [ "$OPTION" -eq 1 ]; then
EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep "^>" | cut -f 6 | bedStat.R -i stdin -l 2>/dev/null | grep Median | sed 's/Median=//g'`;
elif [ "$OPTION" -eq 2 ]; then
EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep -v "^>" | perl -ane 'print "$F[1]\t$F[2]\t$F[3]\t$F[5]\t$F[6]\t$F[4]\n";' | sortBed -i stdin | mergeBed -i stdin -c 4 -o sum | cut -f 4 | bedStat.R -i stdin -l 2>/dev/null | grep Median | sed 's/Median=//g'`;
#EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep -v "^>" | perl -ane 'print "$F[1]\t$F[2]\t$F[3]\t$F[5]\t$F[6]\t$F[4]\n";' | cut -f 4 | bedStat.R -i stdin -l 2>/dev/null | grep Median | sed 's/Median=//g'`;
elif [ "$OPTION" -eq 3 ]; then
EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep "^>" | cut -f 6 | bedStat.R -i stdin -l 2>/dev/null | grep Max | sed 's/Max=//g'`;
elif [ "$OPTION" -eq 4 ]; then
EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep -v "^>" | perl -ane 'print "$F[1]\t$F[2]\t$F[3]\t$F[5]\t$F[6]\t$F[4]\n";' | sortBed -i stdin | mergeBed -i stdin -c 4 -o sum | cut -f 4 | bedStat.R -i stdin -l 2>/dev/null | grep Max | sed 's/Max=//g'`;
#EXPR=`blockbuster.x -minClusterHeight 2 -minBlockHeight 2 -distance 70 -scale 0.6 -blockHeight abs /tmp/$ID | grep -v "^>" | perl -ane 'print "$F[1]\t$F[2]\t$F[3]\t$F[5]\t$F[6]\t$F[4]\n";' | cut -f 4 | bedStat.R -i stdin -l 2>/dev/null | grep Max | sed 's/Max=//g'`;
fi
rm /tmp/$ID
else
## retrieve normalized reads (CPM or RLE)
#echo "coor2reads -c $COOR -b $BAMFILE -m -e $EXTEND -g $GENOME | perl -ane 'BEGIN{$sum=0;} $sum+=$F[4]; END { printf("%0.2f\n", $sum);}"; exit;
if [ ! -z "$CPM" ]; then
if [ ! -z "$REMOVE_DUPLICATE" ]; then
EXPR=`coor2reads -c $COOR -b $BAMFILE -m -d -e $EXTEND -g $GENOME | perl -ane 'BEGIN{$sum=0;} $sum+=$F[4]; END { printf("%0.2f\n", $sum);}'`
else
EXPR=`coor2reads -c $COOR -b $BAMFILE -m -e $EXTEND -g $GENOME | perl -ane 'BEGIN{$sum=0;} $sum+=$F[4]; END { printf("%0.2f\n", $sum);}'`
fi
else
if [ ! -z "$REMOVE_DUPLICATE" ]; then
EXPR=`coor2reads -c $COOR -b $BAMFILE -s $SIZEFACTOR -d -e $EXTEND -g $GENOME | perl -ane 'BEGIN{$sum=0;} $sum+=$F[4]; END { printf("%0.2f\n", $sum);}'`
else
EXPR=`coor2reads -c $COOR -b $BAMFILE -s $SIZEFACTOR -e $EXTEND -g $GENOME | perl -ane 'BEGIN{$sum=0;} $sum+=$F[4]; END { printf("%0.2f\n", $sum);}'`
fi
fi
fi
## perform length normalization
if [ ! -z "$LENFACTOR" ]; then
NORM_LENGTH=`echo $COOR | perl -ane '@t=split(/[\:\-]+/, $_); $len=($t[2]-$t[1])+1; $len=$len/'$LENFACTOR'; print $len;'`
EXPR=`echo $EXPR | perl -ane '$expr=$_/'$NORM_LENGTH'; printf("%0.2f\n", $expr);'`
fi
if [ -z "$EXPR" ]; then
EXPR=0;
fi
echo $EXPR