forked from BGI-Qingdao/stlfr2supernova_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
threshold_calc.sh
executable file
·62 lines (51 loc) · 2.12 KB
/
threshold_calc.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#/bin/bash
if [[ $# != 3 ]] ; then
echo "Usage : $0 your-barcode-freq.txt your-genome-size(in bp) your-expect-cov"
echo "please check your parameters .exit ... "
exit 1
fi
BarcodeFreq=$1
GenomeSize=$2
ExpectCov=$3
if [[ ! -f $BarcodeFreq ]] ; then
echo "Usage : $0 your-barcode-freq.txt your-genome-size(in bp) your-expect-cov"
echo "your-barcode-freq.txt [ $BarcodeFreq ] is not exist !!! "
echo "exit ..."
exit 1
fi
echo "INFO : run with BarcodeFreq=$BarcodeFreq GenomeSize=$GenomeSize ExpectCov=$ExpectCov "
ExpectBP=$((GenomeSize*ExpectCov))
ExpectReadPair=$((ExpectBP/200))
TMP=tmp.pair_num_per_barcode_freq.txt
echo "INFO : expect reads_pair=$ExpectReadPair"
awk '{print $2}' < $BarcodeFreq | sort -n | uniq -c | \
awk 'BEGIN{a=0;}{ freq=$1; pair_num=$2; if( pair_num != 0 ) { a=a+freq*pair_num; printf("%d\t%d\t%d\t%d\n",freq,pair_num,freq*pair_num,a) ; } }' \
>$TMP
TotalPair=`awk 'BEGIN{a=0;}{a=a+$3;}END{print a;}' <$TMP`
echo "INFO : total reads_pair=$TotalPair"
if [[ $TotalPair -lt $ExpectReadPair ]] ; then
echo "FATAL : total < expect !! no need for filter !!! exit ..."
exit 1
fi
BIG=500
BigPair=`awk 'BEGIN{a=0;}{if($2>$BIG) { a=a+$3;} } END {print a; }' <$TMP`
echo "INFO : delete barcode with too much reads : big_pair=$BigPair"
LeftPair=$((TotalPair-BigPair))
if [[ $LeftPair -lt $ExpectReadPair ]] ; then
echo "RESULT : high threshold is $BIG and low threshold is 1"
echo "RESULT : delete $BigPair reads total."
echo "Done ..."
exit 0
fi
NeedCut=$((LeftPair-ExpectReadPair))
echo "{if( \$4 > $NeedCut) { print \$0;} }" >tmp.awk
awk -f tmp.awk <$TMP | head -n 1 1>tmp.low.result
LOW=`awk '{print $2}' <tmp.low.result`
Low_the_pair=`awk '{print $3}' <tmp.low.result`
LowPair=`awk '{print $4}' <tmp.low.result`
NowLeftPair=$((LeftPair- LowPair))
echo "RESULT : high threshold is $BIG and low threshold is $((LOW+1))"
echo "RESULT : delete $BigPair from barcodes that contain reads-pair > $BIG."
echo "RESULT : delete $LowPair from barcodes that contain reads-pair < $((LOW+1))."
echo "RESULT : left $NowLeftPair reads = $((NowLeftPair/GenomeSize)) cov"
echo "Done ..."