initial db version
simonvh committed Sep 24, 2014
1 parent a323299 commit 5454f56
Showing 4 changed files with 311 additions and 1 deletion.
205 changes: 205 additions & 0 deletions pita/db.py
@@ -0,0 +1,205 @@
import os
import sys
import logging
from gimmemotifs.genome_index import GenomeIndex
from sqlalchemy import Column, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship,sessionmaker
from sqlalchemy import create_engine, and_
from sqlalchemy.ext.associationproxy import association_proxy

Base = declarative_base()

class FeatureEvidence(Base):
__tablename__ = 'feature_evidence'
feature_id = Column(Integer, ForeignKey('feature.id'), primary_key=True)
evidence_id = Column(Integer, ForeignKey('evidence.id'), primary_key=True)
evidence = relationship("Evidence")
feature = relationship("Feature")

class Feature(Base):
__tablename__ = 'feature'
__table_args__ = (
UniqueConstraint(
'chrom',
'start',
'end',
'strand',
'ftype',
name='uix_1'),
)
id = Column(Integer, primary_key=True)
chrom = Column(String(250), nullable=False)
start = Column(Integer, nullable=False)
end = Column(Integer, nullable=False)
strand = Column(String(1), nullable=False)
ftype = Column(String(250), nullable=False)
_evidences = relationship('FeatureEvidence')
evidences = association_proxy('_evidences', 'evidence',
creator=lambda _i: FeatureEvidence(evidence=_i),
)
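    # The association proxy lets callers append Evidence objects directly,
    # e.g. feature.evidences.append(Evidence(name="t1", source="annotation"));
    # the creator lambda wraps each one in a FeatureEvidence link row.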

class Evidence(Base):
__tablename__ = "evidence"
id = Column(Integer, primary_key=True)
name = Column(String(50))
source = Column(String(50))

class ReadSource(Base):
__tablename__ = "read_source"
id = Column(Integer, primary_key=True)
name = Column(String(250))
source = Column(String(250))

class FeatureReadcount(Base):
__tablename__ = "read_count"
id = Column(Integer, primary_key=True)
read_source_id = Column(Integer, ForeignKey('read_source.id'), primary_key=True)
feature_id = Column(Integer, ForeignKey('feature.id'), primary_key=True)
read_source = relationship("ReadSource")
feature = relationship("Feature")
count = Column(Integer)
span = Column(String(50))
extend_up = Column(Integer)
extend_down = Column(Integer)
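    # Stores the number of reads from a ReadSource that overlap a Feature;
    # span and extend_up/extend_down presumably record how the counted
    # interval was derived (cf. get_read_statistics below).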

#class Read(Base):
# __tablename__ = 'read'
# id = Column(Integer, primary_key=True)
# name = Column(String(250), nullable=False)

#class SplicedRead(Base):
# __tablename__ = 'spliced_read'
# id = Column(Integer, primary_key=True)
# splice_id = Column(Integer, ForeignKey('splice.id'))
# read_id = Column(Integer, ForeignKey('read.id'))

def get_or_create(session, model, **kwargs):
    """Return the first model instance matching kwargs, creating one
    (added to the session, but not committed) if none exists."""
    instance = session.query(model).filter_by(**kwargs).first()
if instance:
return instance
else:
instance = model(**kwargs)
session.add(instance)
return instance

class AnnotationDb():
def __init__(self, new=False):
self.logger = logging.getLogger("pita")

self.engine = create_engine('sqlite:///pita_test.db')
if new:
Base.metadata.drop_all(self.engine)
Base.metadata.create_all(self.engine)
Base.metadata.bind = self.engine
DBSession = sessionmaker(bind=self.engine)
self.session = DBSession()

    def add_transcript(self, name, source, exons):
        """
        Add a transcript to the database.

        exons must be a list of [chrom, start, end, strand] entries, all on
        the same chromosome and strand, sorted by position.
        """

        # Sanity checks: consecutive exons must be on the same chromosome and
        # strand, non-overlapping and sorted by position
        for e1, e2 in zip(exons[:-1], exons[1:]):
            if e1[0] != e2[0]:
                sys.stderr.write("{0} - {1}\n".format(e1, e2))
                raise ValueError("Different chromosomes!")
            if e2[1] <= e1[2]:
                sys.stderr.write("{0} - {1}\n".format(e1, e2))
                raise ValueError("Exons overlap or are in the wrong order")
            if e1[3] != e2[3]:
                sys.stderr.write("{0} - {1}\n".format(e1, e2))
                raise ValueError("Strands don't match")

chrom = exons[0][0]
strand = exons[0][-1]

for exon in exons:
exon = get_or_create(self.session, Feature,
chrom = chrom,
start = exon[1],
end = exon[2],
strand = strand,
ftype = "exon"
)
exon.evidences.append(Evidence(name=name, source=source))

for start,end in [(e1[2], e2[1]) for e1, e2 in zip(exons[0:-1], exons[1:])]:
sj = get_or_create(self.session, Feature,
chrom = chrom,
start = start,
end = end,
strand = strand,
ftype = "splice_junction"
)
sj.evidences.append(Evidence(name=name, source=source))
self.session.commit()

def get_exons(self, chrom=None):
if chrom:
it = self.session.query(Feature).filter(and_(Feature.chrom == chrom, Feature.ftype == "exon"))
else:
it = self.session.query(Feature).filter(Feature.ftype == "exon")
exons = [e for e in it]
return exons

def get_read_statistics(self, fnames, name, span="exon", extend=(0,0), nreads=None):
from fluff.fluffio import get_binned_stats
from tempfile import NamedTemporaryFile

tmp = NamedTemporaryFile()
estore = {}
self.logger.debug("Writing exons to file")
for exon in self.get_exons():
start = exon.start
end = exon.end
if exon.strand == "-":
start -= extend[1]
end += extend[0]
else:
start -= extend[0]
end += extend[1]
if start < 0:
start = 0

estore["%s:%s-%s" % (exon.chrom, start, end)] = exon
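            # Write one BED6 line per (possibly extended) exon:
            # chrom, start, end, name, score, strand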
tmp.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
exon.chrom,
start,
end,
str(exon),
0,
exon.strand
))
tmp.flush()

        # Accept either a single filename or a list of filenames
        if isinstance(fnames, str):
            fnames = [fnames]

#if not self.nreads.has_key(name):
# self.nreads[name] = 0

        for i, fname in enumerate(fnames):
            if fname.endswith("bam") and (not nreads or not nreads[i]):
                rmrepeats = True
                self.logger.debug("Counting reads in {0}".format(fname))
                #self.nreads[name] += read_statistics(fname)
            else:
                rmrepeats = False

            self.logger.debug("Getting overlap from {0}".format(fname))
            result = get_binned_stats(tmp.name, fname, 1,
                                      rpkm=False, rmdup=False, rmrepeats=rmrepeats)

self.logger.debug("Reading results, save to exon stats")

for row in result:
vals = row.strip().split("\t")
            print(vals)
# e = "%s:%s-%s" % (vals[0], vals[1], vals[2])
# c = float(vals[3])
# estore[e].stats[name] = estore[e].stats.setdefault(name, 0) + c
# self.graph.node[estore[e]][name] = -estore[e].stats[name]

tmp.close()
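
For reference, a minimal usage sketch of the AnnotationDb API added above, based on the calls exercised in tests/test_db.py below (note that the sqlite path pita_test.db is currently hardcoded in __init__):

    from pita.db import AnnotationDb

    # new=True drops and recreates all tables
    db = AnnotationDb(new=True)

    # each exon is [chrom, start, end, strand], sorted by position
    db.add_transcript("t1", "annotation", [
        ["chr1", 100, 200, "+"],
        ["chr1", 300, 400, "+"],
    ])

    # stored exon features, optionally restricted to one chromosome
    exons = db.get_exons("chr1")

    # per-exon read counts from a BAM file; the aggregation of the
    # results is still commented out at this point
    db.get_read_statistics("tests/data/H3K4me3.bam", "test")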

2 changes: 1 addition & 1 deletion setup.py
@@ -38,7 +38,7 @@ def run_tests(self):
tests_require=['pytest'],
    install_requires=[
        "gimmemotifs",
-       "pysam >= 0.7.4",
+       "pysam < 0.8",
"pyyaml",
"HTSeq",
"bcbio-gff",
5 changes: 5 additions & 0 deletions tests/data/read_stats.bed
@@ -0,0 +1,5 @@
scaffold_1 18070000 18080000
scaffold_1 18080000 18200000
scaffold_1 18200000 18200100
scaffold_1 18200100 18250000
scaffold_1 18250000 18300000
100 changes: 100 additions & 0 deletions tests/test_db.py
@@ -0,0 +1,100 @@
import pytest

@pytest.fixture
def three_exons():
exons = [
["chr1", 100, 200, "+"],
["chr1", 300, 400, "+"],
["chr2", 100, 200, "-"],
]
return exons

@pytest.fixture
def three_transcripts():
transcripts = [
["t1", "annotation",
[
["chr1", 100, 200, "+"],
["chr1", 300, 400, "+"],
["chr1", 500, 600, "+"],
]
],
["t2", "annotation",
[
["chr1", 50, 200, "+"],
["chr1", 300, 400, "+"],
]
],
["t3", "annotation",
[
["chr1", 300, 400, "+"],
["chr1", 500, 800, "+"],
]
],
]
return transcripts

@pytest.fixture
def two_transcripts():
transcripts = [
["t1", "annotation",
[
["chr1", 700, 900, "+"],
["chr1", 1100, 1200, "+"],
]
],
["t2", "annotation",
[
["chr1", 1100, 1200, "+"],
["chr1", 1400, 1600, "+"],
]
],
]
return transcripts

def test_create_db():
from pita.db import AnnotationDb
d = AnnotationDb()

def test_add_transcripts(three_transcripts):
from pita.db import AnnotationDb
d = AnnotationDb(new=True)
for name, source, exons in three_transcripts:
d.add_transcript(name, source, exons)

assert 5 == len(d.get_exons())
assert 5 == len(d.get_exons("chr1"))
assert 0 == len(d.get_exons("chr2"))
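    # The three transcripts share the (300, 400) exon, so chr1 ends up with
    # five distinct exon features: (50, 200), (100, 200), (300, 400),
    # (500, 600) and (500, 800).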

@pytest.fixture
def bam_file():
return "tests/data/H3K4me3.bam"

@pytest.fixture
def transcripts():
transcripts = [
["t1", "annotation",
[
["scaffold_1", 18070000, 18080000, "+"],
["scaffold_1", 18200000, 18200100, "+"],
["scaffold_1", 18250000, 18300000, "+"],
]
]

]
return transcripts

#scaffold_1 18070000 18080000 64
#scaffold_1 18080000 18200000 1092
#scaffold_1 18200000 18200100 1
#scaffold_1 18200100 18250000 318
#scaffold_1 18250000 18300000 300

def test_read_statistics(bam_file, transcripts):
from pita.db import AnnotationDb
d = AnnotationDb(new=True)
for name, source, exons in transcripts:
d.add_transcript(name, source, exons)

d.get_read_statistics(bam_file, "test")
    assert 1 == 0  # placeholder: checks against the expected counts above still to be added
