Skip to content

Commit

Permalink
Various tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon van Heeringen committed Mar 10, 2014
1 parent 4af03ca commit 085f266
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 45 deletions.
66 changes: 55 additions & 11 deletions pita/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def __init__(self, index=None):
else:
self.index = None



def add_exon(self, chrom, start, end, strand):
"""
Create Exon object if does not exist.
Expand All @@ -81,6 +83,10 @@ def add_exon(self, chrom, start, end, strand):
self.exons[chrom][to_sloc(start, end, strand)] = e
return e

def remove_exon(self, e):
self.graph.remove_node(e)
del self.exons[e.chrom][to_sloc(e.start, e.end, e.strand)]

def get_exons(self, chrom=None):
""" Return exon objects
Optional argument is the chromosome, otherwise return all
Expand Down Expand Up @@ -186,16 +192,53 @@ def prune(self):
def filter_long(self, l=1000):
exons = self.get_exons()
for exon in self.get_exons():
if len(exon) >= l and len(exon.evidence) < 2:
out_edges = len(self.graph.out_edges([exon]))
in_edges = len(self.graph.in_edges([exon]))
if in_edges >= 1 and out_edges >= 1:
self.logger.info("Removing long exon {0}".format(exon))
self.graph.remove_node(exon)
del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]

if len(exon) >= l:
self.logger.info("Checking exon {0} length {1} evidence {2}".format(exon, l, exon.evidence))
if len(exon.evidence) < 2:
out_edges = len(self.graph.out_edges([exon]))
in_edges = len(self.graph.in_edges([exon]))
self.logger.info("In {0} Out {1}".format(in_edges,out_edges))

if in_edges >= 0 and out_edges >= 1 and exon.strand == "+" or in_edges >= 1 and out_edges >= 0 and exon.strand == "-":
self.logger.info("Removing long exon {0}".format(exon))
self.graph.remove_node(exon)
del self.exons[exon.chrom][to_sloc(exon.start, exon.end, exon.strand)]

def filter_short_introns(self, l=10, mode='merge'):
for intron in self.graph.edges_iter():
e1,e2 = intron
if e2.start - e1.end <= l:
if mode == "merge":
print "merging {0} {1}".format(e1, e2)
new_exon = self.add_exon(e1.chrom, e1.start, e2.end, e1.strand)
for e_in in [e[0] for e in self.graph.in_edges([e1])]:
self.graph.add_edge(e_in, new_exon)
for e_out in [e[1] for e in self.graph.out_edges([e2])]:
self.graph.add_edge(new_exon, e2)

for e in (e1, e2):
self.remove_exon(e)

def all_simple_paths(self, exon1, exon2):
return nx.all_simple_paths(self.graph, exon1, exon2)

def get_alt_splicing_exons(self):
for exon in self.get_exons():
out_exons = [e[1] for e in self.graph.out_edges([exon]) if len(self.graph.out_edges([e[1]])) > 0]
if len(out_exons) > 1:

out_exon = out_exons[0]

self.logger.info("ALT SPLICING {0} {1}".format(exon, out_exon))

#in_exons = [e[0] for e in self.graph.in_edges([exon])]
#for in_exon in in_exons:
# my_in_exons = [e[0] for e in self.graph.in_edges([in_exon])]
# for my_in_exon in my_in_exons:
# if my_in_exon in in_exons:
# self.logger.info("{0} is alternative exon".format(in_exon))

def get_read_statistics(self, fnames, name, span="exon", extend=(0,0)):
def get_read_statistics(self, fnames, name, span="exon", extend=(0,0), nreads=None):
from fluff.fluffio import get_binned_stats
from tempfile import NamedTemporaryFile

Expand Down Expand Up @@ -232,9 +275,10 @@ def get_read_statistics(self, fnames, name, span="exon", extend=(0,0)):
if not self.nreads.has_key(name):
self.nreads[name] = 0

for fname in fnames:
if fname.endswith("bam"):
for i, fname in enumerate(fnames):
if fname.endswith("bam") and (not nreads or not nreads[i]):
rmrepeats = True
self.logger.info("Counting reads in {0}".format(fname))
self.nreads[name] += read_statistics(fname)
else:
rmrepeats = False
Expand Down
17 changes: 14 additions & 3 deletions pita/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,19 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
mc.get_splice_statistics(fname, name=name)
else:
logger.debug("Reading BAM data {0} from {1}".format(name, fname))
mc.get_read_statistics(fname, name=name, span=span, extend=extend)
mc.get_read_statistics(fname, name=name, span=span, extend=extend, nreads=None)

models = {}
exons = {}
for cluster in mc.get_connected_models():
while len(cluster) > 0:
best_model = mc.max_weight(cluster, weight)
variants = [m for m in mc.all_simple_paths(best_model[0], best_model[-1])]
if len(variants) > 1:
logger.info("Checking {0} extra variants".format(len(variants)))
best_model = mc.max_weight(variants, weight)


genename = "{0}:{1}-{2}_".format(
best_model[0].chrom,
best_model[0].start,
Expand Down Expand Up @@ -115,8 +121,13 @@ def get_chrom_models(chrom, anno_files, data, weight, prune=None, index=None):
w2 = 0.0
for d in prune:
logger.debug("Pruning overlap: {0}".format(d))
w1 += mc.get_weight(m1, d["name"], d["type"])
w2 += mc.get_weight(m2, d["name"], d["type"])
tmp_w1 = mc.get_weight(m1, d["name"], d["type"])
tmp_w2 = mc.get_weight(m2, d["name"], d["type"])
m = max((tmp_w1, tmp_w2))
if m > 0:
w1 += tmp_w1 / max((tmp_w1, tmp_w2))
w2 += tmp_w2 / max((tmp_w1, tmp_w2))

if w1 >= w2:
discard[gene2] = 1
else:
Expand Down
22 changes: 13 additions & 9 deletions pita/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def read_statistics(fname, rmrepeat=False, rmdup=False, mapped=False):
)

stdout,stderr = p.communicate()

n = int(stdout.strip())

return n
Expand Down Expand Up @@ -48,24 +48,28 @@ def exons_to_seq(exons):
seq = "".join((seq, e.seq))
return seq


def longest_orf(seq):
def longest_orf(seq, do_prot=False):
if type(seq) == type([]):
seq = exons_to_seq(seq)
dna = Seq(seq, IUPAC.ambiguous_dna)

my_cmp = lambda x,y: cmp(len(x), len(y))

orfs = []
prots = []
for i in range(3):
prot = str(dna[i:].translate())
putative_orfs = [re.sub(r'^[^M]*', "", o) for o in prot.split("*")]
longest_orf = sorted(putative_orfs, cmp=my_cmp)[-1]
start = prot.find(longest_orf) * 3 + i
end = start + (len(longest_orf) + 1) * 3
#putative_orfs = [re.sub(r'^[^M]*', "", o) for o in prot.split("*")]
putative_orfs = [o for o in prot.split("*")]
longest = sorted(putative_orfs, cmp=my_cmp)[-1]
prots.append(longest)
start = prot.find(longest) * 3 + i
end = start + (len(longest) + 1) * 3
orfs.append((start,end))

return sorted(orfs, cmp=lambda x,y: cmp(x[1] - x[0], y[1] - y[0]))[-1]
if do_prot:
return sorted(prots, cmp=lambda x,y: cmp(len(x), len(y)))[-1]
else:
return sorted(orfs, cmp=lambda x,y: cmp(x[1] - x[0], y[1] - y[0]))[-1]

def find_genomic_pos(pos, exons):
if exons[0].strand == "-":
Expand Down
56 changes: 44 additions & 12 deletions scripts/flatbread
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
#!/usr/bin/env python
from pita.model import get_chrom_models
from pita.log import AnnotationLog
from pita.util import to_genomic_orf, longest_orf
# Create test set for use with pita
import os
import sys
import logging
import yaml
import argparse
import pysam
import subprocess
import pp
from tempfile import NamedTemporaryFile
import multiprocessing as mp
from functools import partial
import signal
import pybedtools

p = argparse.ArgumentParser()
p.add_argument("-c",
Expand All @@ -34,6 +25,10 @@ configfile = args.configfile
bedfile = args.bedfile
outname = args.output

if not (configfile and bedfile and outname):
p.print_help()
sys.exit()

# Parse YAML config file
f = open(configfile, "r")
config = yaml.load(f)
Expand All @@ -43,4 +38,41 @@ base = "."
if config.has_key("data_path"):
base = config["data_path"]

print config["annotation"]
# Output data directoy
if not os.path.exists(outname):
os.mkdir(outname)
else:
print("Output path {0} already exists!".format(outname))
sys.exit(1)

# Get chromosomes from BED file
chroms = {}
a = pybedtools.BedTool(bedfile)
for f in a:
chroms[f[0]] = 1
config["chromosomes"] = chroms.keys()

# Do all intersections with BED file
for f in config["annotation"]:
path = f['path']
if not os.path.exists(path):
path = os.path.join(base, path)
if not os.path.exists(path):
print("File not found: {0}".format(path))
sys.exit(1)
a = pybedtools.BedTool(path)
b = pybedtools.BedTool(bedfile)
a.intersect(b).saveas(os.path.join(outname, os.path.basename(path)))
f['path'] = os.path.basename(path)

for f in config['data']:
for p in f['path']:
if not os.path.exists(p):
os.symlink(os.path.join(base, p), os.path.join(outname, p))
if p.endswith("bam"):
os.symlink(os.path.join(base, p + ".bai"), os.path.join(outname, p + ".bai"))

config["data_path"] = os.path.abspath(outname)

with open("{0}.yaml".format(outname), "w") as f:
f.write(yaml.dump(config))
Loading

0 comments on commit 085f266

Please sign in to comment.