Skip to content

Commit

Permalink
#71 Equivalent functionality with new gff3 parser by Uli Kohler
Browse files Browse the repository at this point in the history
  • Loading branch information
josiahseaman committed Dec 14, 2018
1 parent 7b013db commit 693d4ec
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 38 deletions.
2 changes: 1 addition & 1 deletion DDV/AnnotatedTrackLayout.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def prepare_annotation_labels(self):
if scaff_name not in labels.keys():
continue
for entry in labels[scaff_name]:
if entry.feature in ['gene', 'mRNA']:
if entry.type in ['gene', 'mRNA']:
progress = (entry.start ) // genome_width *\
self.annotation_width + scaffold["xy_seq_start"]
end = (entry.end) // genome_width *\
Expand Down
40 changes: 20 additions & 20 deletions DDV/Annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _import_gff(self, annotation_file):

ID = counter
source = elements[1]
feature = elements[2]
type = elements[2]
start = int(elements[3])
end = int(elements[4])

Expand All @@ -90,9 +90,9 @@ def _import_gff(self, annotation_file):
attributes = {}

chromosome_lengths[chromosome] = max(chromosome_lengths[chromosome], end)
if feature != 'chromosome': # chromosomes don't have strand or frame
if type != 'chromosome': # chromosomes don't have strand or frame
annotation = self.Annotation(chromosome, ID,
source, feature,
source, type,
start, end,
score, strand,
frame, attributes, line)
Expand All @@ -103,22 +103,22 @@ def _import_gff(self, annotation_file):
return specimen, gff_version, genome_version, date, file_name, annotations, chromosome_lengths

class Annotation(object):
def __init__(self, chromosome, ID, source, feature, start, end, score, strand, frame, attributes, line):
assert isinstance(chromosome, str), line
assert isinstance(ID, int), line
assert isinstance(source, str), line
assert isinstance(feature, str), line
assert isinstance(start, int), line
assert isinstance(end, int), line
assert score is None or isinstance(score, float), line
assert isinstance(strand, str), line
assert frame is None or isinstance(frame, int), line
assert isinstance(attributes, dict), line
def __init__(self, chromosome, ID, source, type, start, end, score, strand, frame, attributes, line):
# assert chromosome is None or isinstance(chromosome, str), line
# assert ID is None or isinstance(ID, int), line
# assert source is None or isinstance(source, str), line
# assert type is None or isinstance(type, str), line
# assert start is None or isinstance(start, int), line
# assert end is None or isinstance(end, int), line
# assert score is None or isinstance(score, float), line
# assert strand is None or isinstance(strand, str), line
# assert frame is None or isinstance(frame, int), line
# assert attributes is None or isinstance(attributes, dict), line

self.chromosome = chromosome
self.ID = ID
self.source = source
self.feature = feature
self.type = type
self.start = start
self.end = end
self.score = score
Expand Down Expand Up @@ -171,13 +171,13 @@ def create_fasta_from_annotation(gff, scaffold_names, scaffold_lengths=None, out
seq_array = editable_str(gap_char * (gff.chromosome_lengths[scaff_name] + 1))
for entry in gff.annotations[scaff_name]:
assert isinstance(entry, GFF.Annotation), "This isn't a proper GFF object"
if entry.feature in features.keys():
if entry.type in features.keys():
count += 1
my = features[entry.feature]
my = features[entry.type]
for i in range(entry.start, entry.end + 1):
if symbol_priority[seq_array[i]] > my.priority :
seq_array[i] = my.symbol
if entry.feature == 'gene':
if entry.type == 'gene':
# TODO: output header JSON every time we find a gene
pass
handle_tail(seq_array, scaffold_lengths, sc_index)
Expand All @@ -204,10 +204,10 @@ def purge_annotation(gff_filename, features_of_interest=('exon', 'gene')):
for entry in gff.annotations[chromosome]:
assert isinstance(entry, GFF.Annotation), "This isn't a GFF annotation."
total += 1
if entry.feature in features_of_interest:
if entry.type in features_of_interest:
if survivors:
last = survivors[-1]
if last.start == entry.start and last.end == entry.end and entry.feature == last.feature:
if last.start == entry.start and last.end == entry.end and entry.type == last.type:
continue # skip this because it's a duplicate of what we already have
kept += 1
survivors.append(entry)
Expand Down
50 changes: 34 additions & 16 deletions DDV/HighlightedAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

import sys
from PIL import Image, ImageFont
from gff3_parser import parseGFF3, GFFRecord

from DDV.Annotations import GFF, extract_gene_name, find_universal_prefix
from DDV.Span import Span
from DDV.TileLayout import TileLayout
from DDV.DDVUtils import linspace
from collections import namedtuple
from collections import namedtuple, defaultdict


Point = namedtuple('Point', ['x', 'y'])


Expand All @@ -30,12 +33,21 @@ def annotation_points(entry, renderer, start_offset):
class HighlightedAnnotation(TileLayout):
def __init__(self, gff_file, query=None, repeat_annotation=None, **kwargs):
super(HighlightedAnnotation, self).__init__(border_width=12, **kwargs)
self.annotation = GFF(gff_file).annotations if gff_file is not None else None
self.query_annotation = GFF(query).annotations if query is not None else None
self.repeat_annotation = GFF(repeat_annotation).annotations if repeat_annotation is not None else None
self.annotation = self.gff3(gff_file)
self.query_annotation = self.gff3(query)
self.repeat_annotation = self.gff3(repeat_annotation)
self.pil_mode = 'RGBA' # Alpha channel necessary for outline blending
self.font_name = "ariblk.ttf" # TODO: compatibility testing with Mac

def gff3(self, gff_file):
if gff_file is None:
return None
parser = parseGFF3(gff_file)
annotations = defaultdict(lambda : [])
for entry in parser:
annotations[entry.seqid].append(entry)
return annotations

def process_file(self, input_file_path, output_folder, output_file_name,
no_webpage=False, extract_contigs=None):
if self.annotation is not None:
Expand Down Expand Up @@ -75,7 +87,7 @@ def draw_extras_for_chromosome(self, scaff_name, coordinate_frame):
if self.query_annotation is not None:
self.draw_annotation_layer(self.query_annotation, scaff_name, coordinate_frame, genic_color,
(50, 50, 50, 255), shadows=True)
if self.annotation is not None:
if self.annotation is not None: # drawn last so it's on top
self.draw_annotation_layer(self.annotation, scaff_name, coordinate_frame, genic_color,
(50, 50, 50, 255))

Expand All @@ -102,7 +114,7 @@ def draw_annotation_layer(self, annotations, scaff_name, coordinate_frame, color
simple_entry=simple_entry, shadows=shadows)

if self.use_titles and label_color[3]: # if the text color is transparent, don't bother
universal_prefix = find_universal_prefix(regions)
universal_prefix = '' #find_universal_prefix(regions)
print("Removing Universal Prefix from annotations: '%s'" % universal_prefix)
self.draw_annotation_labels(markup_image, regions, coordinate_frame["title_padding"],
label_color, universal_prefix,
Expand Down Expand Up @@ -191,12 +203,12 @@ def find_annotated_regions(self, annotations, scaff_name, start_offset, no_struc
if no_structure: # life is simple
regions.append(AnnotatedRegion(entry, self.levels, start_offset))
else: # find gene/mRNA/exon/CDS hierarchy
if entry.feature == 'gene':
if entry.type == 'gene':
regions.append(AnnotatedRegion(entry, self.levels, start_offset))
# genes_seen.add(extract_gene_name(entry).replace('g','t'))
if entry.type == 'mRNA' and extract_gene_name(entry) not in genes_seen:
regions.append(AnnotatedRegion(entry, self.levels, start_offset))
genes_seen.add(extract_gene_name(entry).replace('g','t'))
# if entry.feature == 'mRNA' and extract_gene_name(entry) not in genes_seen:
# regions.append(AnnotatedRegion(entry, self.levels, start_offset))
if entry.feature == 'CDS' or entry.feature == 'exon':
if entry.type == 'CDS' or entry.type == 'exon':
# hopefully mRNA comes first in the file
# if extract_gene_name(regions[-1]) == entry.attributes['Parent']:
# the if was commented out because CDS [Parent] to mRNA, not gene names
Expand Down Expand Up @@ -300,11 +312,17 @@ def outlines(annotation_points, radius, width, height):

class AnnotatedRegion(GFF.Annotation):
def __init__(self, GFF_annotation, renderer, start_offset):
assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object"
g = GFF_annotation # short name
super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.feature,
g.start, g.end, g.score, g.strand, g.frame,
g.attributes, g.line)
if isinstance(GFF_annotation, GFFRecord):
g = GFF_annotation # short name
super(AnnotatedRegion, self).__init__(g.seqid, g.attributes['ID'], g.source, g.type,
g.start, g.end, g.score, g.strand, g.phase,
g.attributes, '')
else:
assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object"
g = GFF_annotation # short name
super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.type,
g.start, g.end, g.score, g.strand, g.frame,
g.attributes, g.line)
self.points = annotation_points(GFF_annotation, renderer, start_offset)
self.protein_spans = []

Expand Down
93 changes: 93 additions & 0 deletions DDV/gff3_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""
Author: Uli Köhler
Source: https://techoverflow.net/2013/11/30/a-simple-gff3-parser-in-python/
A simple parser for the GFF3 format.
Test with transcripts.gff3 from
http://www.broadinstitute.org/annotation/gebo/help/gff3.html.
Format specification source:
http://www.sequenceontology.org/gff3.shtml
Version 1.1: Python3 ready
"""
from collections import namedtuple
import gzip

ulrlib_available = False
try:
from urllib import parse
except ImportError:
from urllib2 import parse # for python 2.7

__author__ = "Uli Köhler"
__license__ = "Apache License v2.0"
__version__ = "1.1"

# Initialized GeneInfo named tuple. Note: namedtuple is immutable
gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
GFFRecord = namedtuple("GFFRecord", gffInfoFields)


def parseGFFAttributes(attributeString):
"""Parse the GFF3 attribute column and return a dict""" #
if attributeString == ".": return {}
ret = {}
for attribute in attributeString.split(";"):
key, value = attribute.split("=")
ret[parse.unquote(key)] = parse.unquote(value)
return ret


def parseGFF3(filename):
"""
A minimalistic GFF3 format parser.
Yields objects that contain info about a single GFF3 feature.
Supports transparent gzip decompression.
"""
# Parse with transparent decompression
openFunc = gzip.open if filename.endswith(".gz") else open
with openFunc(filename) as infile:
for line in infile:
if line.startswith("#"): continue
parts = line.strip().split("\t")
# If this fails, the file format is not standard-compatible
assert len(parts) == len(gffInfoFields)
# Normalize data
normalizedInfo = {
"seqid": None if parts[0] == "." else parse.unquote(parts[0]),
"source": None if parts[1] == "." else parse.unquote(parts[1]),
"type": None if parts[2] == "." else parse.unquote(parts[2]),
"start": None if parts[3] == "." else int(parts[3]),
"end": None if parts[4] == "." else int(parts[4]),
"score": None if parts[5] == "." else float(parts[5]),
"strand": None if parts[6] == "." else parse.unquote(parts[6]),
"phase": None if parts[7] == "." else parse.unquote(parts[7]),
"attributes": parseGFFAttributes(parts[8])
}
# Alternatively, you can emit the dictionary here, if you need mutability:
# yield normalizedInfo
yield GFFRecord(**normalizedInfo)


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
parser.add_argument("--filter-type", help="Ignore records not having the given type")
args = parser.parse_args()
# Execute the parser
recordCount = 0
for record in parseGFF3(args.file):
# Apply filter, if any
if args.filter_type and record.type != args.filter_type:
continue
# Print record if specified by the user
if args.print_records: print(record)
# Access attributes like this: my_strand = record.strand
recordCount += 1
print("Total records: %d" % recordCount)
File renamed without changes.
2 changes: 1 addition & 1 deletion DDV/html_template/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<script type='text/javascript' src='Biojs.js'></script>
<script type='text/javascript' src='Biojs.Sequence.js'></script>
<script src='./nucleotideNumber.js' type='text/javascript'></script>
<link rel='stylesheet' type='text/css' href='seadragon.css'/>
<link rel='stylesheet' type='text/css' href='fluentdna.css'/>
</head>

<body>
Expand Down

0 comments on commit 693d4ec

Please sign in to comment.