Various tweaks

simonvh · Mar 10, 2014 · 085f266 · 085f266
1 parent 4af03ca
commit 085f266
Show file tree

Hide file tree

Showing 8 changed files with 210 additions and 45 deletions.
diff --git a/pita/collection.py b/pita/collection.py
@@ -59,6 +59,8 @@ def __init__(self, index=None):
         else:
             self.index = None
 
+
+
     def add_exon(self, chrom, start, end, strand):
         """ 
         Create Exon object if does not exist.
@@ -81,6 +83,10 @@ def add_exon(self, chrom, start, end, strand):
         self.exons[chrom][to_sloc(start, end, strand)] = e
         return e
 
+    def remove_exon(self, e):
+        self.graph.remove_node(e)
+        del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]
+
     def get_exons(self, chrom=None):
         """ Return exon objects
         Optional argument is the chromosome, otherwise return all
@@ -186,16 +192,53 @@ def prune(self):
     def filter_long(self, l=1000):
         exons = self.get_exons()
         for exon in self.get_exons():
-            if len(exon) >= l and len(exon.evidence) < 2:
-                out_edges = len(self.graph.out_edges([exon]))
-                in_edges = len(self.graph.in_edges([exon]))
-                if in_edges >= 1 and out_edges >= 1:
-                    self.logger.info("Removing long exon {0}".format(exon))
-                    self.graph.remove_node(exon)
-                    del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]
-
+            if len(exon) >= l:
+                self.logger.info("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
+                if len(exon.evidence) < 2:
+                    out_edges = len(self.graph.out_edges([exon]))
+                    in_edges = len(self.graph.in_edges([exon]))
+                    self.logger.info("In {0} Out {1}".format(in_edges,out_edges))
+
+                    if in_edges >= 0 and out_edges >= 1 and exon.strand == "+" or in_edges >= 1 and out_edges >= 0 and exon.strand == "-":
+                        self.logger.info("Removing long exon {0}".format(exon))
+                        self.graph.remove_node(exon)
+                        del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]
+
+    def filter_short_introns(self, l=10, mode='merge'):
+        for intron in self.graph.edges_iter():
+            e1,e2 = intron
+            if e2.start - e1.end <= l:
+                if mode == "merge":
+                    print "merging {0} {1}".format(e1, e2)
+                    new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
+                    for e_in in [e[0] for e in self.graph.in_edges([e1])]:
+                        self.graph.add_edge(e_in, new_exon)
+                    for e_out in [e[1] for e in self.graph.out_edges([e2])]:
+                        self.graph.add_edge(new_exon, e2)
+
+                for e in (e1, e2):
+                    self.remove_exon(e)                         
+
+    def all_simple_paths(self, exon1, exon2):
+        return nx.all_simple_paths(self.graph, exon1, exon2)
+
+    def get_alt_splicing_exons(self):
+        for exon in self.get_exons():
+            out_exons = [e[1] for e in self.graph.out_edges([exon]) if len(self.graph.out_edges([e[1]])) > 0]
+            if len(out_exons) > 1:
+
+                out_exon = out_exons[0]
+
+                self.logger.info("ALT SPLICING {0} {1}".format(exon, out_exon))
+
+            #in_exons = [e[0] for e in self.graph.in_edges([exon])]
+            #for in_exon in in_exons:
+            #    my_in_exons = [e[0] for e in self.graph.in_edges([in_exon])]
+            #    for my_in_exon in my_in_exons:
+            #        if my_in_exon in in_exons:
+            #            self.logger.info("{0} is alternative exon".format(in_exon)) 
 
-    def get_read_statistics(self, fnames, name, span="exon", extend=(0,0)):
+    def get_read_statistics(self, fnames, name, span="exon", extend=(0,0), nreads=None):
         from fluff.fluffio import get_binned_stats
         from tempfile import NamedTemporaryFile
 
@@ -232,9 +275,10 @@ def get_read_statistics(self, fnames, name, span="exon", extend=(0,0)):
         if not self.nreads.has_key(name):
             self.nreads[name] = 0
 
-        for fname in fnames:
-            if fname.endswith("bam"):
+        for i, fname in enumerate(fnames):
+            if fname.endswith("bam") and (not nreads or not nreads[i]):
                 rmrepeats = True
+                self.logger.info("Counting reads in {0}".format(fname))
                 self.nreads[name] += read_statistics(fname) 
             else:
                 rmrepeats = False

diff --git a/pita/model.py b/pita/model.py
@@ -44,13 +44,19 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
                 mc.get_splice_statistics(fname, name=name)
             else:
                 logger.debug("Reading BAM data {0} from {1}".format(name, fname))
-                mc.get_read_statistics(fname, name=name, span=span, extend=extend)
+                mc.get_read_statistics(fname, name=name, span=span, extend=extend, nreads=None)
 
         models = {}
         exons = {}
         for cluster in mc.get_connected_models():
             while len(cluster) > 0:
                 best_model = mc.max_weight(cluster, weight)
+                variants = [m for m in mc.all_simple_paths(best_model[0], best_model[-1])]
+                if len(variants) > 1:
+                    logger.info("Checking {0} extra variants".format(len(variants)))
+                    best_model = mc.max_weight(variants, weight)
+
+
                 genename = "{0}:{1}-{2}_".format(
                                             best_model[0].chrom,
                                             best_model[0].start,
@@ -115,8 +121,13 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
                     w2 = 0.0
                     for d in prune:
                         logger.debug("Pruning overlap: {0}".format(d))
-                        w1 += mc.get_weight(m1, d["name"], d["type"])
-                        w2 += mc.get_weight(m2, d["name"], d["type"])
+                        tmp_w1 = mc.get_weight(m1, d["name"], d["type"])
+                        tmp_w2 = mc.get_weight(m2, d["name"], d["type"])
+                        m = max((tmp_w1, tmp_w2))
+                        if m > 0:
+                            w1 += tmp_w1 / max((tmp_w1, tmp_w2))
+                            w2 += tmp_w2 / max((tmp_w1, tmp_w2))
+
                     if w1 >= w2:
                         discard[gene2] = 1
                     else:

diff --git a/pita/util.py b/pita/util.py
@@ -19,7 +19,7 @@ def read_statistics(fname, rmrepeat=False, rmdup=False, mapped=False):
              )
 
     stdout,stderr = p.communicate()
-
+    
     n = int(stdout.strip())
 
     return n
@@ -48,24 +48,28 @@ def exons_to_seq(exons):
         seq = "".join((seq, e.seq))
     return seq
 
-
-def longest_orf(seq):
+def longest_orf(seq, do_prot=False):
     if type(seq) == type([]):
         seq = exons_to_seq(seq)
     dna = Seq(seq, IUPAC.ambiguous_dna)
 
     my_cmp = lambda x,y: cmp(len(x), len(y))
 
     orfs = []
+    prots = []
     for i in range(3):
         prot = str(dna[i:].translate())
-        putative_orfs = [re.sub(r'^[^M]*', "", o) for o in prot.split("*")]
-        longest_orf = sorted(putative_orfs, cmp=my_cmp)[-1]
-        start = prot.find(longest_orf) * 3 + i
-        end = start + (len(longest_orf) + 1) * 3
+        #putative_orfs = [re.sub(r'^[^M]*', "", o) for o in prot.split("*")]
+        putative_orfs = [o for o in prot.split("*")]
+        longest = sorted(putative_orfs, cmp=my_cmp)[-1]
+        prots.append(longest)
+        start = prot.find(longest) * 3 + i
+        end = start + (len(longest) + 1) * 3
         orfs.append((start,end))
-
-    return sorted(orfs, cmp=lambda x,y: cmp(x[1] - x[0], y[1] - y[0]))[-1]
+    if do_prot:
+        return sorted(prots, cmp=lambda x,y: cmp(len(x), len(y)))[-1]
+    else:
+        return sorted(orfs, cmp=lambda x,y: cmp(x[1] - x[0], y[1] - y[0]))[-1]
 
 def find_genomic_pos(pos, exons):
     if exons[0].strand == "-":

diff --git a/scripts/flatbread b/scripts/flatbread
@@ -1,19 +1,10 @@
 #!/usr/bin/env python
-from pita.model import get_chrom_models
-from pita.log import AnnotationLog
-from pita.util import to_genomic_orf, longest_orf
+# Create test set for use with pita
 import os
 import sys
-import logging
 import yaml
 import argparse
-import pysam
-import subprocess
-import pp
-from tempfile import NamedTemporaryFile
-import multiprocessing as mp
-from functools import partial
-import signal
+import pybedtools
 
 p = argparse.ArgumentParser()
 p.add_argument("-c",
@@ -34,6 +25,10 @@ configfile = args.configfile
 bedfile = args.bedfile
 outname = args.output
 
+if not (configfile and bedfile and outname):
+    p.print_help()
+    sys.exit()
+
 # Parse YAML config file
 f = open(configfile, "r")
 config = yaml.load(f)
@@ -43,4 +38,41 @@ base = "."
 if config.has_key("data_path"):
     base = config["data_path"]
 
-print config["annotation"]
+# Output data directoy
+if not os.path.exists(outname):
+    os.mkdir(outname)
+else:
+    print("Output path {0} already exists!".format(outname))
+    sys.exit(1)
+
+# Get chromosomes from BED file
+chroms = {}
+a = pybedtools.BedTool(bedfile)
+for f in a:
+    chroms[f[0]] = 1
+config["chromosomes"] = chroms.keys()
+
+# Do all intersections with BED file
+for f in config["annotation"]:
+    path = f['path']
+    if not os.path.exists(path):
+        path = os.path.join(base, path)
+    if not os.path.exists(path):
+        print("File not found: {0}".format(path))
+        sys.exit(1)
+    a = pybedtools.BedTool(path)
+    b = pybedtools.BedTool(bedfile)
+    a.intersect(b).saveas(os.path.join(outname, os.path.basename(path)))
+    f['path'] = os.path.basename(path)
+
+for f in config['data']:
+    for p in f['path']:
+        if not os.path.exists(p):
+            os.symlink(os.path.join(base, p), os.path.join(outname, p))
+            if p.endswith("bam"):
+                os.symlink(os.path.join(base, p + ".bai"), os.path.join(outname, p + ".bai"))
+
+config["data_path"] = os.path.abspath(outname)
+
+with open("{0}.yaml".format(outname), "w") as f:
+    f.write(yaml.dump(config))