Commit v1.60
Simon van Heeringen committed Mar 10, 2014
1 parent 6136c6f commit dfb2511
Showing 6 changed files with 76 additions and 46 deletions.
82 changes: 52 additions & 30 deletions pita/collection.py
@@ -1,6 +1,7 @@
 from pita.exon import *
 from pita.util import read_statistics
 from pita.util import longest_orf,exons_to_seq,model_to_bed
+from pita.config import SEP
 import numpy
 import sys
 import logging
@@ -76,16 +77,20 @@ def add_exon(self, chrom, start, end, strand):

         # Otherwise create new exon and return it
         e = Exon(chrom, start, end, strand)

+
+        self.graph.add_node(e)

         if self.index:
             e.seq = self.index.get_sequence(chrom, start, end, strand)

         self.exons[chrom][to_sloc(start, end, strand)] = e
         return e

     def remove_exon(self, e):
-        self.graph.remove_node(e)
-        del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]
+        if e in self.graph:
+            self.logger.info("Removing exon {0}".format(e))
+            self.graph.remove_node(e)
+            del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]

     def get_exons(self, chrom=None):
         """ Return exon objects
@@ -117,8 +122,9 @@ def add_transcript(self, name, source, exons):
                 raise ValueError, "strands don't match"

         # First add all exons
+        self.logger.debug("Adding transcript: {0} {1} {2}".format(name, source, exons))
         exons = [self.add_exon(*exon) for exon in exons]

         # Add transcript model to the graph
         self.graph.add_path(exons)

@@ -193,31 +199,47 @@ def filter_long(self, l=1000):
         exons = self.get_exons()
         for exon in self.get_exons():
             if len(exon) >= l:
-                self.logger.info("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
-                if len(exon.evidence) < 2:
+                self.logger.debug("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
+                if len([e.split(SEP)[0] for e in exon.evidence]) < 2:
                     out_edges = len(self.graph.out_edges([exon]))
                     in_edges = len(self.graph.in_edges([exon]))
-                    self.logger.info("In {0} Out {1}".format(in_edges,out_edges))
+                    self.logger.debug("In {0} Out {1}".format(in_edges,out_edges))

                     if in_edges >= 0 and out_edges >= 1 and exon.strand == "+" or in_edges >= 1 and out_edges >= 0 and exon.strand == "-":
                         self.logger.info("Removing long exon {0}".format(exon))
                         self.graph.remove_node(exon)
                         del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]

+    def filter_and_merge(self, nodes, l):
+        for e1, e2 in self.graph.edges_iter(nodes):
+            if e2.start - e1.end <= l:
+                new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
+                self.logger.info("Adding {0}".format(new_exon))
+                for e_in in [e[0] for e in self.graph.in_edges([e1])]:
+                    self.graph.add_edge(e_in, new_exon)
+                for e_out in [e[1] for e in self.graph.out_edges([e2])]:
+                    self.graph.add_edge(new_exon, e2)
+
+                for e in (e1, e2):
+                    self.remove_exon(e)
+
+                return new_exon
+        return None
+
     def filter_short_introns(self, l=10, mode='merge'):
+        filter_nodes = []
         for intron in self.graph.edges_iter():
             e1,e2 = intron
             if e2.start - e1.end <= l:
                 if mode == "merge":
-                    print "merging {0} {1}".format(e1, e2)
-                    new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
-                    for e_in in [e[0] for e in self.graph.in_edges([e1])]:
-                        self.graph.add_edge(e_in, new_exon)
-                    for e_out in [e[1] for e in self.graph.out_edges([e2])]:
-                        self.graph.add_edge(new_exon, e2)
-
-                    for e in (e1, e2):
-                        self.remove_exon(e)
+                    filter_nodes += [e1, e2]

+        if mode == "merge":
+            exon = self.filter_and_merge(filter_nodes, l)
+            while exon:
+                exon = self.filter_and_merge(filter_nodes + [exon], l)
+        else:
+            for e in filter_nodes:
+                self.remove_exon(e)

     def all_simple_paths(self, exon1, exon2):
         return nx.all_simple_paths(self.graph, exon1, exon2)
@@ -278,7 +300,7 @@ def get_read_statistics(self, fnames, name, span="exon", extend=(0,0), nreads=None):
         for i, fname in enumerate(fnames):
             if fname.endswith("bam") and (not nreads or not nreads[i]):
                 rmrepeats = True
-                self.logger.info("Counting reads in {0}".format(fname))
+                self.logger.debug("Counting reads in {0}".format(fname))
                 self.nreads[name] += read_statistics(fname)
             else:
                 rmrepeats = False
@@ -392,13 +414,13 @@ def get_weight(self, transcript, identifier, idtype):
     def max_weight(self, transcripts, identifier_weight):
         #identifier_weight = []

-        for transcript in transcripts:
-            tw = self.get_weight(transcript, None, "evidence")
-            self.logger.debug("Weight: {0} - {1} - {2}".format(
-                model_to_bed(transcript),
-                "evidence",
-                tw,
-            ))
+        #for transcript in transcripts:
+        #    tw = self.get_weight(transcript, None, "evidence")
+        #    self.logger.debug("Weight: {0} - {1} - {2}".format(
+        #        model_to_bed(transcript),
+        #        "evidence",
+        #        tw,
+        #    ))

         if not identifier_weight or len(identifier_weight) == 0:
             w = [len(t) for t in transcripts]
@@ -414,11 +436,11 @@ def max_weight(self, transcripts, identifier_weight):
             idw = []
             for transcript in transcripts:
                 tw = self.get_weight(transcript, identifier, idtype)
-                self.logger.debug("Weight: {0} - {1} - {2}".format(
-                    model_to_bed(transcript),
-                    identifier,
-                    tw
-                ))
+                #self.logger.debug("Weight: {0} - {1} - {2}".format(
+                #    model_to_bed(transcript),
+                #    identifier,
+                #    tw
+                #    ))
                 idw.append(pseudo + tw)

             idw = numpy.array(idw)
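
The main change in pita/collection.py moves short-intron merging out of filter_short_introns into the new filter_and_merge helper, which performs at most one merge per call and returns the new exon (or None). The caller then loops until a fixed point is reached, so chains of short introns collapse one pair at a time. A minimal standalone sketch of that fixed-point pattern on plain (start, end) tuples — illustrative helper names, not the pita API:

# Sketch: merge intervals separated by a gap of at most l, one pair per
# pass, repeating until no merge is possible -- the same loop shape as
# filter_short_introns calling filter_and_merge until it returns None.
def merge_once(intervals, l):
    intervals = sorted(intervals)
    for i, ((s1, e1), (s2, e2)) in enumerate(zip(intervals, intervals[1:])):
        if s2 - e1 <= l:  # a short "intron" between two exon blocks
            return intervals[:i] + [(s1, max(e1, e2))] + intervals[i + 2:]
    return None  # fixed point: nothing left to merge

def merge_all(intervals, l=10):
    result = list(intervals)
    merged = merge_once(result, l)
    while merged is not None:
        result = merged
        merged = merge_once(result, l)
    return result

print(merge_all([(0, 100), (105, 200), (205, 300), (500, 600)], l=10))
# [(0, 300), (500, 600)]
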
1 change: 1 addition & 0 deletions pita/config.py
@@ -3,3 +3,4 @@
 TSS_UPSTREAM = "u"
 TSS_DOWNSTREAM = "a"
 TSS_NOTFOUND = "x"
+SEP = ":::"
15 changes: 9 additions & 6 deletions pita/io.py
@@ -4,10 +4,10 @@
 import sys
 import logging

-def merge_exons(starts, sizes):
+def merge_exons(starts, sizes, l=0):
     merge = []
     for i, (start1, start2, size) in enumerate(zip(starts[:-1], starts[1:], sizes[:-1])):
-        if start1 + size >= start2:
+        if start1 + size + l >= start2:
             merge.append(i + 1)

     if len(merge) == 0:
@@ -41,11 +41,14 @@ def _gff_type_iterator(feature, ftypes):
         for f in _gff_type_iterator(feature, ftypes):
             yield f

-def read_gff_transcripts(fobj, fname="", min_exons=1):
+def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):

     # Setup logging
     logger = logging.getLogger('pita')

+    if merge > 0:
+        logger.warning("Merging exons not yet implemented for GFF files!")
+
     #limits = dict(gff_type = ["mRNA", "exon"])
     smap = {"1":"+",1:"+","-1":"-",-1:"-", None:"+"}
     transcripts = []
@@ -73,7 +76,7 @@ def read_gff_transcripts(fobj, fname="", min_exons=1):



-def read_bed_transcripts(fobj, fname="", min_exons=1):
+def read_bed_transcripts(fobj, fname="", min_exons=1, merge=0):

     # Setup logging
     logger = logging.getLogger('pita')
@@ -92,7 +95,7 @@ def read_bed_transcripts(fobj, fname="", min_exons=1):
             sizes = [int(x) for x in vals[10].strip(",").split(",")]
             starts = [int(x) for x in vals[11].strip(",").split(",")]

-            starts, sizes = merge_exons(starts, sizes)
+            starts, sizes = merge_exons(starts, sizes, l=merge)

             i = 1
             name = "%s_%s_%s" % (vals[0], vals[3], i)
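
merge_exons now takes a tolerance l, so BED blocks separated by at most l bases are collapsed into a single exon; read_bed_transcripts forwards its new merge argument as l, and pita/model.py calls it with merge=10. A usage sketch — the expected outputs assume the merged block runs from the first start to the end of the last merged block, which the visible part of the function suggests but does not show:

from pita.io import merge_exons

# BED-style block starts and sizes, relative to the transcript start
starts = [0, 100, 205]
sizes = [100, 100, 50]

# Without tolerance only the touching blocks merge (0 + 100 >= 100)
print(merge_exons(starts, sizes))        # expected: ([0, 205], [200, 50])

# With l=10 the 5 bp gap between the last two blocks is also closed
print(merge_exons(starts, sizes, l=10))  # expected: ([0], [255])
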
21 changes: 12 additions & 9 deletions pita/model.py
@@ -6,27 +6,28 @@
 import itertools
 import sys
 from tempfile import NamedTemporaryFile
+from pita.config import SEP

 def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
-    sep = ":::"

     logger = logging.getLogger("pita")

     try:
         # Read annotation files
         mc = Collection(index)
+        logger.info("Reading annotation for {0}".format(chrom))
         for name, fname, ftype in anno_files:
             logger.debug("Reading annotation from {0}".format(fname))
             tabixfile = pysam.Tabixfile(fname)
             #tabixfile = fname
             if chrom in tabixfile.contigs:
                 fobj = TabixIteratorAsFile(tabixfile.fetch(chrom))
                 if ftype == "bed":
-                    it = read_bed_transcripts(fobj, fname, 2)
+                    it = read_bed_transcripts(fobj, fname, min_exons=2, merge=10)
                 elif ftype in ["gff", "gtf", "gff3"]:
-                    it = read_gff_transcripts(fobj, fname, 2)
+                    it = read_gff_transcripts(fobj, fname, min_exons=2, merge=10)
                 for tname, source, exons in it:
-                    mc.add_transcript("{0}{1}{2}".format(name, sep, tname), source, exons)
+                    mc.add_transcript("{0}{1}{2}".format(name, SEP, tname), source, exons)
                 del fobj
             tabixfile.close()
             del tabixfile
@@ -38,7 +39,8 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
         # Remove long exons with only one evidence source
         mc.filter_long(l=2000)
         # Remove short introns
-        mc.filter_short_introns()
+        #mc.filter_short_introns()
+        logger.info("Loading data for {0}".format(chrom))

         for name, fname, span, extend in data:
             if span == "splice":
@@ -50,12 +52,13 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):

         models = {}
         exons = {}
+        logger.info("Calling transcripts for {0}".format(chrom))
         for cluster in mc.get_connected_models():
             while len(cluster) > 0:
                 best_model = mc.max_weight(cluster, weight)
                 variants = [m for m in mc.all_simple_paths(best_model[0], best_model[-1])]
                 if len(variants) > 1:
-                    logger.info("Checking {0} extra variants".format(len(variants)))
+                    logger.debug("Checking {0} extra variants".format(len(variants)))
                     best_model = mc.max_weight(variants, weight)

@@ -87,17 +90,17 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
         best_ev = {}
         other_ev = {}
         for e in best_model:
-            for ev in set([x.split(sep)[0] for x in e.evidence]):
+            for ev in set([x.split(SEP)[0] for x in e.evidence]):
                 best_ev[ev] = best_ev.setdefault(ev, 0) + 1

         # Fast way to collapse
         for e in other_exons:
-            for ev in set([x.split(sep)[0] for x in e.evidence]):
+            for ev in set([x.split(SEP)[0] for x in e.evidence]):
                 other_ev[ev] = other_ev.setdefault(ev, 0) + 1
         ev = []
         for e in best_model + other_exons:
             for evidence in e.evidence:
-                ev.append(evidence.split(sep))
+                ev.append(evidence.split(SEP))

         ### End ugly logging stuff
         logger.debug("Best model: {0} with {1} exons".format(genename, len(best_model)))
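
In get_chrom_models, each connected cluster yields a best-scoring model via max_weight; the code then enumerates every simple path between that model's first and last exon with all_simple_paths and re-scores those variants. A toy networkx sketch of that enumerate-and-rescore step, with a stand-in weight function (longest path) instead of pita's evidence-based scoring:

import networkx as nx

# Toy exon graph: two alternative paths from A to D
g = nx.DiGraph()
g.add_edges_from([("A", "B"), ("B", "D"), ("A", "C"), ("C", "D")])

def max_weight(paths):
    # Stand-in for Collection.max_weight: prefer the longest variant
    return max(paths, key=len)

best_model = ["A", "B", "D"]  # e.g. the initial cluster-level best model
variants = list(nx.all_simple_paths(g, best_model[0], best_model[-1]))
# variants contains ['A', 'B', 'D'] and ['A', 'C', 'D']
if len(variants) > 1:
    best_model = max_weight(variants)
print(best_model)
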
2 changes: 1 addition & 1 deletion scripts/flatbread
@@ -62,7 +62,7 @@ for f in config["annotation"]:
         sys.exit(1)
     a = pybedtools.BedTool(path)
     b = pybedtools.BedTool(bedfile)
-    a.intersect(b).saveas(os.path.join(outname, os.path.basename(path)))
+    a.intersect(b, wa=True).saveas(os.path.join(outname, os.path.basename(path)))
     f['path'] = os.path.basename(path)

for f in config['data']:
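
The flatbread fix passes wa=True (bedtools -wa), so intersect reports each original annotation feature that overlaps the region of interest instead of clipping features to the overlap. A quick before/after illustration with made-up intervals:

import pybedtools

a = pybedtools.BedTool("chr1\t10\t100\tfeat1", from_string=True)
b = pybedtools.BedTool("chr1\t50\t60", from_string=True)

print(a.intersect(b))           # chr1  50   60  feat1 -- clipped to the overlap
print(a.intersect(b, wa=True))  # chr1  10  100  feat1 -- whole feature kept
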
1 change: 1 addition & 0 deletions setup.py
@@ -30,6 +30,7 @@ def run_tests(self):
     scripts=[
         "scripts/pita",
         "scripts/bed12togff3",
+        "scripts/flatbread",
     ],
     data_files=[],
     tests_require=['pytest'],
