From 693d4ecd32b358af4c6bb2dd87e666036ae89f9c Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Fri, 14 Dec 2018 14:34:29 +0000 Subject: [PATCH] #71 Equivalent functionality with new gff3 parser by Uli Kohler --- DDV/AnnotatedTrackLayout.py | 2 +- DDV/Annotations.py | 40 ++++---- DDV/HighlightedAnnotation.py | 50 ++++++---- DDV/gff3_parser.py | 93 +++++++++++++++++++ .../{seadragon.css => fluentdna.css} | 0 DDV/html_template/index.html | 2 +- 6 files changed, 149 insertions(+), 38 deletions(-) create mode 100644 DDV/gff3_parser.py rename DDV/html_template/{seadragon.css => fluentdna.css} (100%) diff --git a/DDV/AnnotatedTrackLayout.py b/DDV/AnnotatedTrackLayout.py index c9cd3da..e5d4de3 100644 --- a/DDV/AnnotatedTrackLayout.py +++ b/DDV/AnnotatedTrackLayout.py @@ -82,7 +82,7 @@ def prepare_annotation_labels(self): if scaff_name not in labels.keys(): continue for entry in labels[scaff_name]: - if entry.feature in ['gene', 'mRNA']: + if entry.type in ['gene', 'mRNA']: progress = (entry.start ) // genome_width *\ self.annotation_width + scaffold["xy_seq_start"] end = (entry.end) // genome_width *\ diff --git a/DDV/Annotations.py b/DDV/Annotations.py index 040f375..96b9499 100644 --- a/DDV/Annotations.py +++ b/DDV/Annotations.py @@ -63,7 +63,7 @@ def _import_gff(self, annotation_file): ID = counter source = elements[1] - feature = elements[2] + type = elements[2] start = int(elements[3]) end = int(elements[4]) @@ -90,9 +90,9 @@ def _import_gff(self, annotation_file): attributes = {} chromosome_lengths[chromosome] = max(chromosome_lengths[chromosome], end) - if feature != 'chromosome': # chromosomes don't have strand or frame + if type != 'chromosome': # chromosomes don't have strand or frame annotation = self.Annotation(chromosome, ID, - source, feature, + source, type, start, end, score, strand, frame, attributes, line) @@ -103,22 +103,22 @@ def _import_gff(self, annotation_file): return specimen, gff_version, genome_version, date, file_name, annotations, chromosome_lengths class Annotation(object): - def __init__(self, chromosome, ID, source, feature, start, end, score, strand, frame, attributes, line): - assert isinstance(chromosome, str), line - assert isinstance(ID, int), line - assert isinstance(source, str), line - assert isinstance(feature, str), line - assert isinstance(start, int), line - assert isinstance(end, int), line - assert score is None or isinstance(score, float), line - assert isinstance(strand, str), line - assert frame is None or isinstance(frame, int), line - assert isinstance(attributes, dict), line + def __init__(self, chromosome, ID, source, type, start, end, score, strand, frame, attributes, line): + # assert chromosome is None or isinstance(chromosome, str), line + # assert ID is None or isinstance(ID, int), line + # assert source is None or isinstance(source, str), line + # assert type is None or isinstance(type, str), line + # assert start is None or isinstance(start, int), line + # assert end is None or isinstance(end, int), line + # assert score is None or isinstance(score, float), line + # assert strand is None or isinstance(strand, str), line + # assert frame is None or isinstance(frame, int), line + # assert attributes is None or isinstance(attributes, dict), line self.chromosome = chromosome self.ID = ID self.source = source - self.feature = feature + self.type = type self.start = start self.end = end self.score = score @@ -171,13 +171,13 @@ def create_fasta_from_annotation(gff, scaffold_names, scaffold_lengths=None, out seq_array = editable_str(gap_char * (gff.chromosome_lengths[scaff_name] + 1)) for entry in gff.annotations[scaff_name]: assert isinstance(entry, GFF.Annotation), "This isn't a proper GFF object" - if entry.feature in features.keys(): + if entry.type in features.keys(): count += 1 - my = features[entry.feature] + my = features[entry.type] for i in range(entry.start, entry.end + 1): if symbol_priority[seq_array[i]] > my.priority : seq_array[i] = my.symbol - if entry.feature == 'gene': + if entry.type == 'gene': # TODO: output header JSON every time we find a gene pass handle_tail(seq_array, scaffold_lengths, sc_index) @@ -204,10 +204,10 @@ def purge_annotation(gff_filename, features_of_interest=('exon', 'gene')): for entry in gff.annotations[chromosome]: assert isinstance(entry, GFF.Annotation), "This isn't a GFF annotation." total += 1 - if entry.feature in features_of_interest: + if entry.type in features_of_interest: if survivors: last = survivors[-1] - if last.start == entry.start and last.end == entry.end and entry.feature == last.feature: + if last.start == entry.start and last.end == entry.end and entry.type == last.type: continue # skip this because it's a duplicate of what we already have kept += 1 survivors.append(entry) diff --git a/DDV/HighlightedAnnotation.py b/DDV/HighlightedAnnotation.py index 1de65c0..8842663 100644 --- a/DDV/HighlightedAnnotation.py +++ b/DDV/HighlightedAnnotation.py @@ -2,12 +2,15 @@ import sys from PIL import Image, ImageFont +from gff3_parser import parseGFF3, GFFRecord from DDV.Annotations import GFF, extract_gene_name, find_universal_prefix from DDV.Span import Span from DDV.TileLayout import TileLayout from DDV.DDVUtils import linspace -from collections import namedtuple +from collections import namedtuple, defaultdict + + Point = namedtuple('Point', ['x', 'y']) @@ -30,12 +33,21 @@ def annotation_points(entry, renderer, start_offset): class HighlightedAnnotation(TileLayout): def __init__(self, gff_file, query=None, repeat_annotation=None, **kwargs): super(HighlightedAnnotation, self).__init__(border_width=12, **kwargs) - self.annotation = GFF(gff_file).annotations if gff_file is not None else None - self.query_annotation = GFF(query).annotations if query is not None else None - self.repeat_annotation = GFF(repeat_annotation).annotations if repeat_annotation is not None else None + self.annotation = self.gff3(gff_file) + self.query_annotation = self.gff3(query) + self.repeat_annotation = self.gff3(repeat_annotation) self.pil_mode = 'RGBA' # Alpha channel necessary for outline blending self.font_name = "ariblk.ttf" # TODO: compatibility testing with Mac + def gff3(self, gff_file): + if gff_file is None: + return None + parser = parseGFF3(gff_file) + annotations = defaultdict(lambda : []) + for entry in parser: + annotations[entry.seqid].append(entry) + return annotations + def process_file(self, input_file_path, output_folder, output_file_name, no_webpage=False, extract_contigs=None): if self.annotation is not None: @@ -75,7 +87,7 @@ def draw_extras_for_chromosome(self, scaff_name, coordinate_frame): if self.query_annotation is not None: self.draw_annotation_layer(self.query_annotation, scaff_name, coordinate_frame, genic_color, (50, 50, 50, 255), shadows=True) - if self.annotation is not None: + if self.annotation is not None: # drawn last so it's on top self.draw_annotation_layer(self.annotation, scaff_name, coordinate_frame, genic_color, (50, 50, 50, 255)) @@ -102,7 +114,7 @@ def draw_annotation_layer(self, annotations, scaff_name, coordinate_frame, color simple_entry=simple_entry, shadows=shadows) if self.use_titles and label_color[3]: # if the text color is transparent, don't bother - universal_prefix = find_universal_prefix(regions) + universal_prefix = '' #find_universal_prefix(regions) print("Removing Universal Prefix from annotations: '%s'" % universal_prefix) self.draw_annotation_labels(markup_image, regions, coordinate_frame["title_padding"], label_color, universal_prefix, @@ -191,12 +203,12 @@ def find_annotated_regions(self, annotations, scaff_name, start_offset, no_struc if no_structure: # life is simple regions.append(AnnotatedRegion(entry, self.levels, start_offset)) else: # find gene/mRNA/exon/CDS hierarchy - if entry.feature == 'gene': + if entry.type == 'gene': + regions.append(AnnotatedRegion(entry, self.levels, start_offset)) + # genes_seen.add(extract_gene_name(entry).replace('g','t')) + if entry.type == 'mRNA' and extract_gene_name(entry) not in genes_seen: regions.append(AnnotatedRegion(entry, self.levels, start_offset)) - genes_seen.add(extract_gene_name(entry).replace('g','t')) - # if entry.feature == 'mRNA' and extract_gene_name(entry) not in genes_seen: - # regions.append(AnnotatedRegion(entry, self.levels, start_offset)) - if entry.feature == 'CDS' or entry.feature == 'exon': + if entry.type == 'CDS' or entry.type == 'exon': # hopefully mRNA comes first in the file # if extract_gene_name(regions[-1]) == entry.attributes['Parent']: # the if was commented out because CDS [Parent] to mRNA, not gene names @@ -300,11 +312,17 @@ def outlines(annotation_points, radius, width, height): class AnnotatedRegion(GFF.Annotation): def __init__(self, GFF_annotation, renderer, start_offset): - assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object" - g = GFF_annotation # short name - super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.feature, - g.start, g.end, g.score, g.strand, g.frame, - g.attributes, g.line) + if isinstance(GFF_annotation, GFFRecord): + g = GFF_annotation # short name + super(AnnotatedRegion, self).__init__(g.seqid, g.attributes['ID'], g.source, g.type, + g.start, g.end, g.score, g.strand, g.phase, + g.attributes, '') + else: + assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object" + g = GFF_annotation # short name + super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.type, + g.start, g.end, g.score, g.strand, g.frame, + g.attributes, g.line) self.points = annotation_points(GFF_annotation, renderer, start_offset) self.protein_spans = [] diff --git a/DDV/gff3_parser.py b/DDV/gff3_parser.py new file mode 100644 index 0000000..eec0c15 --- /dev/null +++ b/DDV/gff3_parser.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +""" +Author: Uli Köhler +Source: https://techoverflow.net/2013/11/30/a-simple-gff3-parser-in-python/ +A simple parser for the GFF3 format. + +Test with transcripts.gff3 from +http://www.broadinstitute.org/annotation/gebo/help/gff3.html. + +Format specification source: +http://www.sequenceontology.org/gff3.shtml + +Version 1.1: Python3 ready +""" +from collections import namedtuple +import gzip + +ulrlib_available = False +try: + from urllib import parse +except ImportError: + from urllib2 import parse # for python 2.7 + +__author__ = "Uli Köhler" +__license__ = "Apache License v2.0" +__version__ = "1.1" + +# Initialized GeneInfo named tuple. Note: namedtuple is immutable +gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"] +GFFRecord = namedtuple("GFFRecord", gffInfoFields) + + +def parseGFFAttributes(attributeString): + """Parse the GFF3 attribute column and return a dict""" # + if attributeString == ".": return {} + ret = {} + for attribute in attributeString.split(";"): + key, value = attribute.split("=") + ret[parse.unquote(key)] = parse.unquote(value) + return ret + + +def parseGFF3(filename): + """ + A minimalistic GFF3 format parser. + Yields objects that contain info about a single GFF3 feature. + + Supports transparent gzip decompression. + """ + # Parse with transparent decompression + openFunc = gzip.open if filename.endswith(".gz") else open + with openFunc(filename) as infile: + for line in infile: + if line.startswith("#"): continue + parts = line.strip().split("\t") + # If this fails, the file format is not standard-compatible + assert len(parts) == len(gffInfoFields) + # Normalize data + normalizedInfo = { + "seqid": None if parts[0] == "." else parse.unquote(parts[0]), + "source": None if parts[1] == "." else parse.unquote(parts[1]), + "type": None if parts[2] == "." else parse.unquote(parts[2]), + "start": None if parts[3] == "." else int(parts[3]), + "end": None if parts[4] == "." else int(parts[4]), + "score": None if parts[5] == "." else float(parts[5]), + "strand": None if parts[6] == "." else parse.unquote(parts[6]), + "phase": None if parts[7] == "." else parse.unquote(parts[7]), + "attributes": parseGFFAttributes(parts[8]) + } + # Alternatively, you can emit the dictionary here, if you need mutability: + # yield normalizedInfo + yield GFFRecord(**normalizedInfo) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("file", help="The GFF3 input file (.gz allowed)") + parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only") + parser.add_argument("--filter-type", help="Ignore records not having the given type") + args = parser.parse_args() + # Execute the parser + recordCount = 0 + for record in parseGFF3(args.file): + # Apply filter, if any + if args.filter_type and record.type != args.filter_type: + continue + # Print record if specified by the user + if args.print_records: print(record) + # Access attributes like this: my_strand = record.strand + recordCount += 1 + print("Total records: %d" % recordCount) \ No newline at end of file diff --git a/DDV/html_template/seadragon.css b/DDV/html_template/fluentdna.css similarity index 100% rename from DDV/html_template/seadragon.css rename to DDV/html_template/fluentdna.css diff --git a/DDV/html_template/index.html b/DDV/html_template/index.html index bc137d8..005eca9 100644 --- a/DDV/html_template/index.html +++ b/DDV/html_template/index.html @@ -20,7 +20,7 @@ - +