Skip to content

Commit

Permalink
Script to convert pita bed12 output to gff3
Browse files Browse the repository at this point in the history
  • Loading branch information
simonvh committed Jan 9, 2014
1 parent 2580b84 commit 6a2cab6
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 0 deletions.
86 changes: 86 additions & 0 deletions scripts/bed12togff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python
# Copyright (c) 2013 Simon van Heeringen <s.vanheeringen@ncmls.ru.nl>
#
# This script is free software. You can redistribute it and/or modify it under
# the terms of the MIT License, see the file COPYING included with this
# distribution.

import sys

GFF_LINE = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n"

if not len(sys.argv) == 2:
sys.stderr.write("Usage: bed12togff3 bed12\n")
sys.exit(1)

fh = sys.stdout
fh.write("##gff-version 3\n")
for line in open(sys.argv[1]):
vals = line.strip().split("\t")
if len(vals) != 12:
fh.write("#{0}\n".format(line.strip()))
else:
chrom = vals[0]
start,end = int(vals[1]), int(vals[2])
thickStart, thickEnd = int(vals[6]), int(vals[7])
strand = vals[5]
if strand not in "+-":
strand = "?" # ? can be used for features whose strandedness is relevant, but unknown.
name = vals[3]

exonStarts = [int(x) for x in vals[11].strip(",").split(",")]
exonSizes = [int(x) for x in vals[10].strip(",").split(",")]

# Gene
fh.write(GFF_LINE.format(
chrom,
".",
"mRNA",
start + 1,
end,
".",
strand,
".",
"ID={0}".format(name)
))

# Exons
for i, (estart, esize) in enumerate(zip(exonStarts, exonSizes)):
fh.write(GFF_LINE.format(
chrom,
".",
"exon",
start + 1 + estart,
start + estart + esize,
".",
strand,
".",
"ID={0}.Exon.{1};Parent={0}".format(name, i + 1)
))
# CDS
phase = 0
for i, (estart, esize) in enumerate(zip(exonStarts, exonSizes)):
if thickStart <= start + estart + esize and thickEnd >= start + estart:
cds_start = start + 1 + estart
cds_end = start + estart + esize
if thickStart >= start + estart:
cds_start = thickStart + 1
if thickEnd <= start + estart + esize:
cds_end = thickEnd

# First CDS exon
fh.write(GFF_LINE.format(
chrom,
".",
"CDS",
cds_start,
cds_end,
".",
strand,
phase,
"ID={0}.CDS;Parent={0}".format(name)
))
phase = ((cds_end - cds_start + 1) - phase) % 3



1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def run_tests(self):
],
scripts=[
"scripts/pita",
"scripts/bed12togff3",
],
data_files=[],
tests_require=['pytest'],
Expand Down

0 comments on commit 6a2cab6

Please sign in to comment.