Commit v1.60
Simon van Heeringen committed Mar 10, 2014
1 parent 6136c6f commit dfb2511
Showing 6 changed files with 76 additions and 46 deletions.
82 changes: 52 additions & 30 deletions pita/collection.py
@@ -1,6 +1,7 @@
 from pita.exon import *
 from pita.util import read_statistics
 from pita.util import longest_orf,exons_to_seq,model_to_bed
+from pita.config import SEP
 import numpy
 import sys
 import logging
@@ -76,16 +77,20 @@ def add_exon(self, chrom, start, end, strand):

         # Otherwise create new exon and return it
         e = Exon(chrom, start, end, strand)

+
+        self.graph.add_node(e)

         if self.index:
             e.seq = self.index.get_sequence(chrom, start, end, strand)

         self.exons[chrom][to_sloc(start, end, strand)] = e
         return e

     def remove_exon(self, e):
-        self.graph.remove_node(e)
-        del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]
+        if e in self.graph:
+            self.logger.info("Removing exon {0}".format(e))
+            self.graph.remove_node(e)
+            del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]

     def get_exons(self, chrom=None):
         """ Return exon objects
@@ -117,8 +122,9 @@ def add_transcript(self, name, source, exons):
                 raise ValueError, "strands don't match"

         # First add all exons
+        self.logger.debug("Adding transcript: {0} {1} {2}".format(name, source, exons))
         exons = [self.add_exon(*exon) for exon in exons]

         # Add transcript model to the graph
         self.graph.add_path(exons)

@@ -193,31 +199,47 @@ def filter_long(self, l=1000):
         exons = self.get_exons()
         for exon in self.get_exons():
             if len(exon) >= l:
-                self.logger.info("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
-                if len(exon.evidence) < 2:
+                self.logger.debug("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
+                if len([e.split(SEP)[0] for e in exon.evidence]) < 2:
                     out_edges = len(self.graph.out_edges([exon]))
                     in_edges = len(self.graph.in_edges([exon]))
-                    self.logger.info("In {0} Out {1}".format(in_edges,out_edges))
+                    self.logger.debug("In {0} Out {1}".format(in_edges,out_edges))

                     if in_edges >= 0 and out_edges >= 1 and exon.strand == "+" or in_edges >= 1 and out_edges >= 0 and exon.strand == "-":
                         self.logger.info("Removing long exon {0}".format(exon))
                         self.graph.remove_node(exon)
                         del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]

+    def filter_and_merge(self, nodes, l):
+        for e1, e2 in self.graph.edges_iter(nodes):
+            if e2.start - e1.end <= l:
+                new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
+                self.logger.info("Adding {0}".format(new_exon))
+                for e_in in [e[0] for e in self.graph.in_edges([e1])]:
+                    self.graph.add_edge(e_in, new_exon)
+                for e_out in [e[1] for e in self.graph.out_edges([e2])]:
+                    self.graph.add_edge(new_exon, e2)
+
+                for e in (e1, e2):
+                    self.remove_exon(e)
+
+                return new_exon
+        return None
+
     def filter_short_introns(self, l=10, mode='merge'):
+        filter_nodes = []
         for intron in self.graph.edges_iter():
             e1,e2 = intron
             if e2.start - e1.end <= l:
                 if mode == "merge":
-                    print "merging {0} {1}".format(e1, e2)
-                    new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
-                    for e_in in [e[0] for e in self.graph.in_edges([e1])]:
-                        self.graph.add_edge(e_in, new_exon)
-                    for e_out in [e[1] for e in self.graph.out_edges([e2])]:
-                        self.graph.add_edge(new_exon, e2)
-
-                    for e in (e1, e2):
-                        self.remove_exon(e)
+                    filter_nodes += [e1, e2]

+        if mode == "merge":
+            exon = self.filter_and_merge(filter_nodes, l)
+            while exon:
+                exon = self.filter_and_merge(filter_nodes + [exon], l)
+        else:
+            for e in filter_nodes:
+                self.remove_exon(e)

     def all_simple_paths(self, exon1, exon2):
         return nx.all_simple_paths(self.graph, exon1, exon2)
@@ -278,7 +300,7 @@ def get_read_statistics(self, fnames, name, span="exon", extend=(0,0), nreads=None):
         for i, fname in enumerate(fnames):
             if fname.endswith("bam") and (not nreads or not nreads[i]):
                 rmrepeats = True
-                self.logger.info("Counting reads in {0}".format(fname))
+                self.logger.debug("Counting reads in {0}".format(fname))
                 self.nreads[name] += read_statistics(fname)
             else:
                 rmrepeats = False
@@ -392,13 +414,13 @@ def get_weight(self, transcript, identifier, idtype):
     def max_weight(self, transcripts, identifier_weight):
         #identifier_weight = []

-        for transcript in transcripts:
-            tw = self.get_weight(transcript, None, "evidence")
-            self.logger.debug("Weight: {0} - {1} - {2}".format(
-                model_to_bed(transcript),
-                "evidence",
-                tw,
-            ))
+        #for transcript in transcripts:
+        #    tw = self.get_weight(transcript, None, "evidence")
+        #    self.logger.debug("Weight: {0} - {1} - {2}".format(
+        #        model_to_bed(transcript),
+        #        "evidence",
+        #        tw,
+        #    ))

         if not identifier_weight or len(identifier_weight) == 0:
             w = [len(t) for t in transcripts]
@@ -414,11 +436,11 @@ def max_weight(self, transcripts, identifier_weight):
             idw = []
             for transcript in transcripts:
                 tw = self.get_weight(transcript, identifier, idtype)
-                self.logger.debug("Weight: {0} - {1} - {2}".format(
-                    model_to_bed(transcript),
-                    identifier,
-                    tw
-                ))
+                #self.logger.debug("Weight: {0} - {1} - {2}".format(
+                #    model_to_bed(transcript),
+                #    identifier,
+                #    tw
+                #    ))
                 idw.append(pseudo + tw)

             idw = numpy.array(idw)
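
The main change in pita/collection.py moves short-intron merging out of filter_short_introns into the new filter_and_merge helper, which performs at most one merge per call and returns the new exon (or None). The caller then loops until a fixed point is reached, so chains of short introns collapse one pair at a time. A minimal standalone sketch of that fixed-point pattern on plain (start, end) tuples — illustrative helper names, not the pita API:

# Sketch: merge intervals separated by a gap of at most l, one pair per
# pass, repeating until no merge is possible -- the same loop shape as
# filter_short_introns calling filter_and_merge until it returns None.
def merge_once(intervals, l):
    intervals = sorted(intervals)
    for i, ((s1, e1), (s2, e2)) in enumerate(zip(intervals, intervals[1:])):
        if s2 - e1 <= l:  # a short "intron" between two exon blocks
            return intervals[:i] + [(s1, max(e1, e2))] + intervals[i + 2:]
    return None  # fixed point: nothing left to merge

def merge_all(intervals, l=10):
    result = list(intervals)
    merged = merge_once(result, l)
    while merged is not None:
        result = merged
        merged = merge_once(result, l)
    return result

print(merge_all([(0, 100), (105, 200), (205, 300), (500, 600)], l=10))
# [(0, 300), (500, 600)]
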
1 change: 1 addition & 0 deletions pita/config.py
@@ -3,3 +3,4 @@
 TSS_UPSTREAM = "u"
 TSS_DOWNSTREAM = "a"
 TSS_NOTFOUND = "x"
+SEP = ":::"
15 changes: 9 additions & 6 deletions pita/io.py
@@ -4,10 +4,10 @@
 import sys
 import logging

-def merge_exons(starts, sizes):
+def merge_exons(starts, sizes, l=0):
     merge = []
     for i, (start1, start2, size) in enumerate(zip(starts[:-1], starts[1:], sizes[:-1])):
-        if start1 + size >= start2:
+        if start1 + size + l >= start2:
             merge.append(i + 1)

     if len(merge) == 0:
@@ -41,11 +41,14 @@ def _gff_type_iterator(feature, ftypes):
         for f in _gff_type_iterator(feature, ftypes):
             yield f

-def read_gff_transcripts(fobj, fname="", min_exons=1):
+def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):

     # Setup logging
     logger = logging.getLogger('pita')

+    if merge > 0:
+        logger.warning("Merging exons not yet implemented for GFF files!")
+
     #limits = dict(gff_type = ["mRNA", "exon"])
     smap = {"1":"+",1:"+","-1":"-",-1:"-", None:"+"}
     transcripts = []
@@ -73,7 +76,7 @@ def read_gff_transcripts(fobj, fname="", min_exons=1):



-def read_bed_transcripts(fobj, fname="", min_exons=1):
+def read_bed_transcripts(fobj, fname="", min_exons=1, merge=0):

     # Setup logging
     logger = logging.getLogger('pita')
@@ -92,7 +95,7 @@ def read_bed_transcripts(fobj, fname="", min_exons=1):
             sizes = [int(x) for x in vals[10].strip(",").split(",")]
             starts = [int(x) for x in vals[11].strip(",").split(",")]

-            starts, sizes = merge_exons(starts, sizes)
+            starts, sizes = merge_exons(starts, sizes, l=merge)

             i = 1
             name = "%s_%s_%s" % (vals[0], vals[3], i)
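
merge_exons now takes a tolerance l, so BED blocks separated by at most l bases are collapsed into a single exon; read_bed_transcripts forwards its new merge argument as l, and pita/model.py calls it with merge=10. A usage sketch — the expected outputs assume the merged block runs from the first start to the end of the last merged block, which the visible part of the function suggests but does not show:

from pita.io import merge_exons

# BED-style block starts and sizes, relative to the transcript start
starts = [0, 100, 205]
sizes = [100, 100, 50]

# Without tolerance only the touching blocks merge (0 + 100 >= 100)
print(merge_exons(starts, sizes))        # expected: ([0, 205], [200, 50])

# With l=10 the 5 bp gap between the last two blocks is also closed
print(merge_exons(starts, sizes, l=10))  # expected: ([0], [255])
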
21 changes: 12 additions & 9 deletions pita/model.py
@@ -6,27 +6,28 @@
 import itertools
 import sys
 from tempfile import NamedTemporaryFile
+from pita.config import SEP

 def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
-    sep = ":::"

     logger = logging.getLogger("pita")

     try:
         # Read annotation files
         mc = Collection(index)
+        logger.info("Reading annotation for {0}".format(chrom))
         for name, fname, ftype in anno_files:
             logger.debug("Reading annotation from {0}".format(fname))
             tabixfile = pysam.Tabixfile(fname)
             #tabixfile = fname
             if chrom in tabixfile.contigs:
                 fobj = TabixIteratorAsFile(tabixfile.fetch(chrom))
                 if ftype == "bed":
-                    it = read_bed_transcripts(fobj, fname, 2)
+                    it = read_bed_transcripts(fobj, fname, min_exons=2, merge=10)
                 elif ftype in ["gff", "gtf", "gff3"]:
-                    it = read_gff_transcripts(fobj, fname, 2)
+                    it = read_gff_transcripts(fobj, fname, min_exons=2, merge=10)
                 for tname, source, exons in it:
-                    mc.add_transcript("{0}{1}{2}".format(name, sep, tname), source, exons)
+                    mc.add_transcript("{0}{1}{2}".format(name, SEP, tname), source, exons)
                 del fobj
             tabixfile.close()
             del tabixfile
@@ -38,7 +39,8 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
         # Remove long exons with only one evidence source
         mc.filter_long(l=2000)
         # Remove short introns
-        mc.filter_short_introns()
+        #mc.filter_short_introns()
+        logger.info("Loading data for {0}".format(chrom))

         for name, fname, span, extend in data:
             if span == "splice":
@@ -50,12 +52,13 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):

         models = {}
         exons = {}
+        logger.info("Calling transcripts for {0}".format(chrom))
         for cluster in mc.get_connected_models():
             while len(cluster) > 0:
                 best_model = mc.max_weight(cluster, weight)
                 variants = [m for m in mc.all_simple_paths(best_model[0], best_model[-1])]
                 if len(variants) > 1:
-                    logger.info("Checking {0} extra variants".format(len(variants)))
+                    logger.debug("Checking {0} extra variants".format(len(variants)))
                     best_model = mc.max_weight(variants, weight)

@@ -87,17 +90,17 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
         best_ev = {}
         other_ev = {}
         for e in best_model:
-            for ev in set([x.split(sep)[0] for x in e.evidence]):
+            for ev in set([x.split(SEP)[0] for x in e.evidence]):
                 best_ev[ev] = best_ev.setdefault(ev, 0) + 1

         # Fast way to collapse
         for e in other_exons:
-            for ev in set([x.split(sep)[0] for x in e.evidence]):
+            for ev in set([x.split(SEP)[0] for x in e.evidence]):
                 other_ev[ev] = other_ev.setdefault(ev, 0) + 1
         ev = []
         for e in best_model + other_exons:
             for evidence in e.evidence:
-                ev.append(evidence.split(sep))
+                ev.append(evidence.split(SEP))

         ### End ugly logging stuff
         logger.debug("Best model: {0} with {1} exons".format(genename, len(best_model)))
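
In get_chrom_models, each connected cluster yields a best-scoring model via max_weight; the code then enumerates every simple path between that model's first and last exon with all_simple_paths and re-scores those variants. A toy networkx sketch of that enumerate-and-rescore step, with a stand-in weight function (longest path) instead of pita's evidence-based scoring:

import networkx as nx

# Toy exon graph: two alternative paths from A to D
g = nx.DiGraph()
g.add_edges_from([("A", "B"), ("B", "D"), ("A", "C"), ("C", "D")])

def max_weight(paths):
    # Stand-in for Collection.max_weight: prefer the longest variant
    return max(paths, key=len)

best_model = ["A", "B", "D"]  # e.g. the initial cluster-level best model
variants = list(nx.all_simple_paths(g, best_model[0], best_model[-1]))
# variants contains ['A', 'B', 'D'] and ['A', 'C', 'D']
if len(variants) > 1:
    best_model = max_weight(variants)
print(best_model)
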
2 changes: 1 addition & 1 deletion scripts/flatbread
@@ -62,7 +62,7 @@ for f in config["annotation"]:
         sys.exit(1)
     a = pybedtools.BedTool(path)
     b = pybedtools.BedTool(bedfile)
-    a.intersect(b).saveas(os.path.join(outname, os.path.basename(path)))
+    a.intersect(b, wa=True).saveas(os.path.join(outname, os.path.basename(path)))
     f['path'] = os.path.basename(path)

for f in config['data']:
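
The flatbread fix passes wa=True (bedtools -wa), so intersect reports each original annotation feature that overlaps the region of interest instead of clipping features to the overlap. A quick before/after illustration with made-up intervals:

import pybedtools

a = pybedtools.BedTool("chr1\t10\t100\tfeat1", from_string=True)
b = pybedtools.BedTool("chr1\t50\t60", from_string=True)

print(a.intersect(b))           # chr1  50   60  feat1 -- clipped to the overlap
print(a.intersect(b, wa=True))  # chr1  10  100  feat1 -- whole feature kept
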
1 change: 1 addition & 0 deletions setup.py
@@ -30,6 +30,7 @@ def run_tests(self):
     scripts=[
         "scripts/pita",
         "scripts/bed12togff3",
+        "scripts/flatbread",
     ],
     data_files=[],
     tests_require=['pytest'],
