Skip to content

Commit ab6be4e

Browse files
committed
* added systematic comparison of simulated circ-read sampling and find_circ output. merge_bed.py now adheres to -6 BED6 mode also in --verbatim. find_circ.py --test mode fixed to handle broken segments correctly.
1 parent 51fff34 commit ab6be4e

File tree

3 files changed

+71
-13
lines changed

3 files changed

+71
-13
lines changed

find_circ.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,10 @@ def get_dummy(self,chrom,start,end,sense):
459459

460460
if args:
461461
logger.info('reading from {0}'.format(args[0]))
462-
sam_input = pysam.Samfile(args[0],'rb')
462+
if args[0].endswith('sam'):
463+
sam_input = pysam.Samfile(args[0],'r')
464+
else:
465+
sam_input = pysam.Samfile(args[0],'rb')
463466
else:
464467
logger.info('reading from stdin')
465468
sam_input = pysam.Samfile('-','r')
@@ -1227,7 +1230,7 @@ def validate_hits_for_test_fragment(frag_name, lin_coords, circ_coords, unsplice
12271230
unspliced_str = "N/A"
12281231

12291232
if seg_broken:
1230-
broken = ";".join(sorted(seg_broken))
1233+
broken = ";".join([str(b) for b in sorted(seg_broken)])
12311234
broken_str = 'BROKEN_SEGMENTS:{0}'.format(broken)
12321235
else:
12331236
broken_str = "N/A"
@@ -1342,7 +1345,8 @@ def extract_coords(align):
13421345
return chrom, align.pos, align.aend, "*"
13431346

13441347
unspliced_coords = set([extract_coords(mate) for mate in unspliced_mates])
1345-
validate_hits_for_test_fragment(frag_name, lin_coords, circ_coords, unspliced_coords, seg_broken)
1348+
seg_broken_coords = set([extract_coords(seg) for seg in seg_broken])
1349+
validate_hits_for_test_fragment(frag_name, lin_coords, circ_coords, unspliced_coords, seg_broken_coords)
13461350

13471351
if circ_coords:
13481352
# investigate unspliced mates

merge_bed.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,22 @@ def src(fname):
3636
if line.startswith("#"):
3737
continue
3838
line = line.strip()
39-
chrom,start,end,name,score,sense = line.split('\t')[:6]
39+
parts = line.split('\t')
40+
if options.bed6:
41+
parts = parts[:6]
42+
43+
chrom,start,end,name,score,sense = parts[:6]
4044
start,end = int(start)+ds,int(end)+de
4145

4246
#print (chrom,start,end,sense)
43-
pos[(chrom,start,end,sense)] = line
47+
pos[(chrom,start,end,sense)] = parts
4448

4549
if flank:
4650
for x in xrange(flank):
47-
pos[(chrom,start-x,end,sense)] = line
48-
pos[(chrom,start+x,end,sense)] = line
49-
pos[(chrom,start,end-x,sense)] = line
50-
pos[(chrom,start,end+x,sense)] = line
51+
pos[(chrom,start-x,end,sense)] = parts
52+
pos[(chrom,start+x,end,sense)] = parts
53+
pos[(chrom,start,end-x,sense)] = parts
54+
pos[(chrom,start,end+x,sense)] = parts
5155

5256
#if cover:
5357
#for x in xrange
@@ -126,9 +130,7 @@ def append_uniq(values):
126130

127131
from itertools import izip_longest
128132
parts = []
129-
source = enumerate(izip_longest(*[l.rstrip().split('\t') for l in lines],fillvalue=""))
130-
if options.bed6:
131-
source = enumerate(izip_longest(*[l.rstrip().split('\t')[:6] for l in lines],fillvalue=""))
133+
source = enumerate(izip_longest(*[l for l in lines],fillvalue=""))
132134
for i,column in source:
133135
#print i,column
134136
if i in col_map:
@@ -145,7 +147,7 @@ def append_uniq(values):
145147
cols = [comstr]
146148
for name in com:
147149
cols.append("%s : " % name)
148-
cols.append(by_name[name][pos])
150+
cols.append("\t".join(by_name[name][pos]))
149151

150152
print "\t".join(cols)
151153

test_data/whatidid.md

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
## Testing the --test feature
2+
3+
# Using Marcel's C.elegans reads
4+
5+
```
6+
# in find_circ2/test_data
7+
PREFIX=/data/BIO3/home/mschilli/repo/global/external/marvin/find_circ2/test_data/
8+
INDEX=/data/BIO3/indices/WBcel235_bwa_0.7.5a-r405/WBcel235.fa
9+
10+
bwa mem -t16 -k 15 -T 1 $INDEX \
11+
${PREFIX}/synthetic_reads.R1.fa.gz \
12+
${PREFIX}/synthetic_reads.R2.fa.gz \
13+
> /scratch/circdetection_test_data/marcel_test.sam
14+
15+
cat /scratch/circdetection_test_data/marcel_test.sam | \
16+
../find_circ.py --test -G $INDEX --stdout=test | les
17+
```
18+
19+
# Using my own dm6 reads
20+
21+
```
22+
# in find_circ2/test_data
23+
INDEX=/data/rajewsky/indices/dm6_bwa_0.7.12-r1039/dm6.fa
24+
25+
time cat ~/circpeptides/fly/dm6.ribocircs_mar2016.ucsc | grep ANNOTATED | \
26+
../simulate_reads.py -G $INDEX -o sim_dm6 --fpk &
27+
28+
time cat ~/circpeptides/fly/dm6.ribocircs_mar2016.ucsc | grep ANNOTATED | \
29+
../simulate_reads.py -G $INDEX -o sim_dm6_0.01 --mutate=0.01 --fpk &
30+
31+
time cat ~/circpeptides/fly/dm6.ribocircs_mar2016.ucsc | grep ANNOTATED | \
32+
../simulate_reads.py -G $INDEX -o sim_dm6_0.05 --mutate=0.05 --fpk &
33+
34+
time cat ~/circpeptides/fly/dm6.ribocircs_mar2016.ucsc | grep ANNOTATED | \
35+
../simulate_reads.py -G $INDEX -o sim_dm6_0.1 --mutate=0.1 --fpk &
36+
37+
38+
for SAMPLE in sim_dm6 sim_dm6_0.01 sim_dm6_0.05 sim_dm6_0.1
39+
do {
40+
bwa mem -t16 -k 15 -T 1 -p $INDEX ${SAMPLE}/simulated_reads.fa.gz > ${SAMPLE}.sam
41+
42+
../find_circ.py ${SAMPLE}.sam -o ${SAMPLE}_run --test -G $INDEX \
43+
--known-lin=${SAMPLE}/lin_splice_sites.bed \
44+
--known-circ=${SAMPLE}/circ_splice_sites.bed
45+
} done;
46+
47+
for ERR in "" _0.01 _0.05 _0.1
48+
do {
49+
../merge_bed.py -6 -V sim_dm6${ERR}/circ_splice_sites.bed sim_dm6${ERR}_run/circ_splice_sites.bed | grep '(in0,in1)' | cut -f 7,14 | histogram.py -s -q -b0 --ofs-fit -x "simulated junction reads" -y "recovered junction reads" -t "circRNA recovery error=${ERR}" --pdf=scatter_sim${ERR}.pdf
50+
} done;
51+
52+
```

0 commit comments

Comments
 (0)