#71 Equivalent functionality with new gff3 parser by Uli Kohler

josiahseaman · Dec 14, 2018 · 693d4ec · 693d4ec
1 parent 7b013db
commit 693d4ec
Show file tree

Hide file tree

Showing 6 changed files with 149 additions and 38 deletions.
diff --git a/DDV/AnnotatedTrackLayout.py b/DDV/AnnotatedTrackLayout.py
@@ -82,7 +82,7 @@ def prepare_annotation_labels(self):
             if scaff_name not in labels.keys():
                 continue
             for entry in labels[scaff_name]:
-                if entry.feature in ['gene', 'mRNA']:
+                if entry.type in ['gene', 'mRNA']:
                     progress = (entry.start ) // genome_width *\
                                self.annotation_width + scaffold["xy_seq_start"]
                     end = (entry.end) // genome_width *\

diff --git a/DDV/Annotations.py b/DDV/Annotations.py
@@ -63,7 +63,7 @@ def _import_gff(self, annotation_file):
 
                         ID = counter
                         source = elements[1]
-                        feature = elements[2]
+                        type = elements[2]
                         start = int(elements[3])
                         end = int(elements[4])
 
@@ -90,9 +90,9 @@ def _import_gff(self, annotation_file):
                             attributes = {}
 
                         chromosome_lengths[chromosome] = max(chromosome_lengths[chromosome], end)
-                        if feature != 'chromosome':  # chromosomes don't have strand or frame
+                        if type != 'chromosome':  # chromosomes don't have strand or frame
                             annotation = self.Annotation(chromosome, ID,
-                                                         source, feature,
+                                                         source, type,
                                                          start, end,
                                                          score, strand,
                                                          frame, attributes, line)
@@ -103,22 +103,22 @@ def _import_gff(self, annotation_file):
         return specimen, gff_version, genome_version, date, file_name, annotations, chromosome_lengths
 
     class Annotation(object):
-        def __init__(self, chromosome, ID, source, feature, start, end, score, strand, frame, attributes, line):
-            assert isinstance(chromosome, str), line
-            assert isinstance(ID, int), line
-            assert isinstance(source, str), line
-            assert isinstance(feature, str), line
-            assert isinstance(start, int), line
-            assert isinstance(end, int), line
-            assert score is None or isinstance(score, float), line
-            assert isinstance(strand, str), line
-            assert frame is None or isinstance(frame, int), line
-            assert isinstance(attributes, dict), line
+        def __init__(self, chromosome, ID, source, type, start, end, score, strand, frame, attributes, line):
+            # assert chromosome is None or isinstance(chromosome, str), line
+            # assert ID is None or isinstance(ID, int), line
+            # assert source is None or isinstance(source, str), line
+            # assert type is None or isinstance(type, str), line
+            # assert start is None or isinstance(start, int), line
+            # assert end is None or isinstance(end, int), line
+            # assert score is None or isinstance(score, float), line
+            # assert strand is None or isinstance(strand, str), line
+            # assert frame is None or isinstance(frame, int), line
+            # assert attributes is None or isinstance(attributes, dict), line
 
             self.chromosome = chromosome
             self.ID = ID
             self.source = source
-            self.feature = feature
+            self.type = type
             self.start = start
             self.end = end
             self.score = score
@@ -171,13 +171,13 @@ def create_fasta_from_annotation(gff, scaffold_names, scaffold_lengths=None, out
             seq_array = editable_str(gap_char * (gff.chromosome_lengths[scaff_name] + 1))
             for entry in gff.annotations[scaff_name]:
                 assert isinstance(entry, GFF.Annotation), "This isn't a proper GFF object"
-                if entry.feature in features.keys():
+                if entry.type in features.keys():
                     count += 1
-                    my = features[entry.feature]
+                    my = features[entry.type]
                     for i in range(entry.start, entry.end + 1):
                         if symbol_priority[seq_array[i]] > my.priority :
                             seq_array[i] = my.symbol
-                if entry.feature == 'gene':
+                if entry.type == 'gene':
                     # TODO: output header JSON every time we find a gene
                     pass
             handle_tail(seq_array, scaffold_lengths, sc_index)
@@ -204,10 +204,10 @@ def purge_annotation(gff_filename, features_of_interest=('exon', 'gene')):
         for entry in gff.annotations[chromosome]:
             assert isinstance(entry, GFF.Annotation), "This isn't a GFF annotation."
             total += 1
-            if entry.feature in features_of_interest:
+            if entry.type in features_of_interest:
                 if survivors:
                     last = survivors[-1]
-                    if last.start == entry.start and last.end == entry.end and entry.feature == last.feature:
+                    if last.start == entry.start and last.end == entry.end and entry.type == last.type:
                         continue  # skip this because it's a duplicate of what we already have
                 kept += 1
                 survivors.append(entry)

diff --git a/DDV/HighlightedAnnotation.py b/DDV/HighlightedAnnotation.py
@@ -2,12 +2,15 @@
 
 import sys
 from PIL import Image, ImageFont
+from gff3_parser import parseGFF3, GFFRecord
 
 from DDV.Annotations import GFF, extract_gene_name, find_universal_prefix
 from DDV.Span import Span
 from DDV.TileLayout import TileLayout
 from DDV.DDVUtils import linspace
-from collections import namedtuple
+from collections import namedtuple, defaultdict
+
+
 Point = namedtuple('Point', ['x', 'y'])
 
 
@@ -30,12 +33,21 @@ def annotation_points(entry, renderer, start_offset):
 class HighlightedAnnotation(TileLayout):
     def __init__(self, gff_file, query=None, repeat_annotation=None, **kwargs):
         super(HighlightedAnnotation, self).__init__(border_width=12, **kwargs)
-        self.annotation = GFF(gff_file).annotations if gff_file is not None else None
-        self.query_annotation = GFF(query).annotations if query is not None else None
-        self.repeat_annotation = GFF(repeat_annotation).annotations if repeat_annotation is not None else None
+        self.annotation = self.gff3(gff_file)
+        self.query_annotation = self.gff3(query)
+        self.repeat_annotation = self.gff3(repeat_annotation)
         self.pil_mode = 'RGBA'  # Alpha channel necessary for outline blending
         self.font_name = "ariblk.ttf"  # TODO: compatibility testing with Mac
 
+    def gff3(self, gff_file):
+        if gff_file is None:
+            return None
+        parser = parseGFF3(gff_file)
+        annotations = defaultdict(lambda : [])
+        for entry in parser:
+            annotations[entry.seqid].append(entry)
+        return annotations
+
     def process_file(self, input_file_path, output_folder, output_file_name,
                      no_webpage=False, extract_contigs=None):
         if self.annotation is not None:
@@ -75,7 +87,7 @@ def draw_extras_for_chromosome(self, scaff_name, coordinate_frame):
         if self.query_annotation is not None:
             self.draw_annotation_layer(self.query_annotation, scaff_name, coordinate_frame, genic_color,
                                        (50, 50, 50, 255), shadows=True)
-        if self.annotation is not None:
+        if self.annotation is not None:  # drawn last so it's on top
             self.draw_annotation_layer(self.annotation, scaff_name, coordinate_frame, genic_color,
                                        (50, 50, 50, 255))
 
@@ -102,7 +114,7 @@ def draw_annotation_layer(self, annotations, scaff_name, coordinate_frame, color
                                       simple_entry=simple_entry, shadows=shadows)
 
         if self.use_titles and label_color[3]:  # if the text color is transparent, don't bother
-            universal_prefix = find_universal_prefix(regions)
+            universal_prefix = '' #find_universal_prefix(regions)
             print("Removing Universal Prefix from annotations: '%s'" % universal_prefix)
             self.draw_annotation_labels(markup_image, regions, coordinate_frame["title_padding"],
                                         label_color, universal_prefix,
@@ -191,12 +203,12 @@ def find_annotated_regions(self, annotations, scaff_name, start_offset, no_struc
                     if no_structure:  # life is simple
                         regions.append(AnnotatedRegion(entry, self.levels, start_offset))
                     else:  # find gene/mRNA/exon/CDS hierarchy
-                        if entry.feature == 'gene':
+                        if entry.type == 'gene':
+                            regions.append(AnnotatedRegion(entry, self.levels, start_offset))
+                            # genes_seen.add(extract_gene_name(entry).replace('g','t'))
+                        if entry.type == 'mRNA' and extract_gene_name(entry) not in genes_seen:
                             regions.append(AnnotatedRegion(entry, self.levels, start_offset))
-                            genes_seen.add(extract_gene_name(entry).replace('g','t'))
-                        # if entry.feature == 'mRNA' and extract_gene_name(entry) not in genes_seen:
-                        #     regions.append(AnnotatedRegion(entry, self.levels, start_offset))
-                        if entry.feature == 'CDS' or entry.feature == 'exon':
+                        if entry.type == 'CDS' or entry.type == 'exon':
                             # hopefully mRNA comes first in the file
                             # if extract_gene_name(regions[-1]) == entry.attributes['Parent']:
                             # the if was commented out because CDS [Parent] to mRNA, not gene names
@@ -300,11 +312,17 @@ def outlines(annotation_points, radius, width, height):
 
 class AnnotatedRegion(GFF.Annotation):
     def __init__(self, GFF_annotation, renderer, start_offset):
-        assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object"
-        g = GFF_annotation  # short name
-        super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.feature,
-                                              g.start, g.end, g.score, g.strand, g.frame,
-                                              g.attributes, g.line)
+        if isinstance(GFF_annotation, GFFRecord):
+            g = GFF_annotation  # short name
+            super(AnnotatedRegion, self).__init__(g.seqid, g.attributes['ID'], g.source, g.type,
+                                                  g.start, g.end, g.score, g.strand, g.phase,
+                                                  g.attributes, '')
+        else:
+            assert isinstance(GFF_annotation, GFF.Annotation), "This isn't a proper GFF object"
+            g = GFF_annotation  # short name
+            super(AnnotatedRegion, self).__init__(g.chromosome, g.ID, g.source, g.type,
+                                                  g.start, g.end, g.score, g.strand, g.frame,
+                                                  g.attributes, g.line)
         self.points = annotation_points(GFF_annotation, renderer, start_offset)
         self.protein_spans = []
 

diff --git a/DDV/gff3_parser.py b/DDV/gff3_parser.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""
+Author: Uli Köhler
+Source: https://techoverflow.net/2013/11/30/a-simple-gff3-parser-in-python/
+A simple parser for the GFF3 format.
+
+Test with transcripts.gff3 from
+http://www.broadinstitute.org/annotation/gebo/help/gff3.html.
+
+Format specification source:
+http://www.sequenceontology.org/gff3.shtml
+
+Version 1.1: Python3 ready
+"""
+from collections import namedtuple
+import gzip
+
+ulrlib_available = False
+try:
+    from urllib import parse
+except ImportError:
+    from urllib2 import parse  # for python 2.7
+
+__author__ = "Uli Köhler"
+__license__ = "Apache License v2.0"
+__version__ = "1.1"
+
+# Initialized GeneInfo named tuple. Note: namedtuple is immutable
+gffInfoFields = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
+GFFRecord = namedtuple("GFFRecord", gffInfoFields)
+
+
+def parseGFFAttributes(attributeString):
+    """Parse the GFF3 attribute column and return a dict"""  #
+    if attributeString == ".": return {}
+    ret = {}
+    for attribute in attributeString.split(";"):
+        key, value = attribute.split("=")
+        ret[parse.unquote(key)] = parse.unquote(value)
+    return ret
+
+
+def parseGFF3(filename):
+    """
+    A minimalistic GFF3 format parser.
+    Yields objects that contain info about a single GFF3 feature.
+
+    Supports transparent gzip decompression.
+    """
+    # Parse with transparent decompression
+    openFunc = gzip.open if filename.endswith(".gz") else open
+    with openFunc(filename) as infile:
+        for line in infile:
+            if line.startswith("#"): continue
+            parts = line.strip().split("\t")
+            # If this fails, the file format is not standard-compatible
+            assert len(parts) == len(gffInfoFields)
+            # Normalize data
+            normalizedInfo = {
+                "seqid": None if parts[0] == "." else parse.unquote(parts[0]),
+                "source": None if parts[1] == "." else parse.unquote(parts[1]),
+                "type": None if parts[2] == "." else parse.unquote(parts[2]),
+                "start": None if parts[3] == "." else int(parts[3]),
+                "end": None if parts[4] == "." else int(parts[4]),
+                "score": None if parts[5] == "." else float(parts[5]),
+                "strand": None if parts[6] == "." else parse.unquote(parts[6]),
+                "phase": None if parts[7] == "." else parse.unquote(parts[7]),
+                "attributes": parseGFFAttributes(parts[8])
+            }
+            # Alternatively, you can emit the dictionary here, if you need mutability:
+            #    yield normalizedInfo
+            yield GFFRecord(**normalizedInfo)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("file", help="The GFF3 input file (.gz allowed)")
+    parser.add_argument("--print-records", action="store_true", help="Print all GeneInfo objects, not only")
+    parser.add_argument("--filter-type", help="Ignore records not having the given type")
+    args = parser.parse_args()
+    # Execute the parser
+    recordCount = 0
+    for record in parseGFF3(args.file):
+        # Apply filter, if any
+        if args.filter_type and record.type != args.filter_type:
+            continue
+        # Print record if specified by the user
+        if args.print_records: print(record)
+        # Access attributes like this: my_strand = record.strand
+        recordCount += 1
+    print("Total records: %d" % recordCount)
diff --git a/DDV/html_template/seadragon.css → DDV/html_template/fluentdna.css b/DDV/html_template/seadragon.css → DDV/html_template/fluentdna.css
diff --git a/DDV/html_template/index.html b/DDV/html_template/index.html
@@ -20,7 +20,7 @@
     <script type='text/javascript' src='Biojs.js'></script>
     <script type='text/javascript' src='Biojs.Sequence.js'></script>
     <script src='./nucleotideNumber.js' type='text/javascript'></script>
-    <link rel='stylesheet' type='text/css' href='seadragon.css'/>
+    <link rel='stylesheet' type='text/css' href='fluentdna.css'/>
 </head>
 
 <body>