nextstrain · jameshadfield · Aug 20, 2023 · Aug 16, 2023 · Aug 16, 2023 · Aug 16, 2023
diff --git a/augur/data/schema-annotations.json b/augur/data/schema-annotations.json
@@ -1,34 +1,90 @@
 {
     "type" : "object",
     "$schema": "http://json-schema.org/draft-06/schema#",
-    "title": "JSON object for the `annotations` key, typically produced by `augur translate`",
-    "description": "Coordinates etc of genes / genome",
+    "$id": "https://nextstrain.org/schemas/augur/annotations",
+    "title": "Schema for the 'annotations' property (node-data JSON) or the 'genome_annotations' property (auspice JSON)",
+    "properties": {
+        "nuc": {
+            "type": "object",
+            "allOf": [{ "$ref": "#/$defs/startend" }],
+            "properties": {
+                "start": {
+                    "enum": [1],
+                    "$comment": "nuc must begin at 1"
+                },
+                "strand": {
+                    "type": "string",
+                    "enum":["+"],
+                    "description": "Strand is optional for nuc, as it should be +ve for all genomes (-ve strand genomes are reverse complemented)",
+                    "$comment": "Auspice will not proceed if the JSON has strand='-'"
+                }
+            },
+            "additionalProperties": true,
+            "$comment": "All other properties are unused by Auspice."
+        }
+    },
+    "required": ["nuc"],
     "patternProperties": {
-        "^[a-zA-Z0-9*_-]+$": {
+        "^(?!nuc)[a-zA-Z0-9*_-]+$": {
+            "$comment": "Each object here defines a single CDS",
             "type": "object",
+            "oneOf": [{ "$ref": "#/$defs/startend" }, { "$ref": "#/$defs/segments" }],
+            "additionalProperties": true,
+            "required": ["strand"],
             "properties": {
-                "seqid":{
-                    "description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc",
-                    "$comment": "Unused by Auspice 2.0",
-                    "type": "string"
+                "gene": {
+                    "type": "string",
+                    "description": "The name of the gene the CDS is from. Optional.",
+                    "$comment": "Shown in on-hover infobox & influences default CDS colors"
                 },
-                "type": {
-                    "description": "Type of the feature. could be mRNA, CDS, or similar",
-                    "$comment": "Unused by Auspice 2.0",
-                    "type": "string"
+                "strand": {
+                    "description": "Strand of the CDS",
+                    "type": "string",
+                    "enum": ["-", "+"]
                 },
-                "start": {
-                    "description": "Gene start position (one-based, following GFF format)",
-                    "type": "number"
+                "color": {
+                    "type": "string",
+                    "description": "A CSS color or a color hex code. Optional."
                 },
-                "end": {
-                    "description": "Gene end position (one-based closed, last position of feature, following GFF format)",
-                    "type": "number"
+                "display_name": {
+                    "type": "string",
+                    "$comment": "Shown in the on-hover info box"
                 },
-                "strand": {
-                    "description": "Positive or negative strand",
+                "description": {
                     "type": "string",
-                    "enum": ["-","+"]
+                    "$comment": "Shown in the on-hover info box"
+                }
+            }
+        }
+    },
+    "$defs": {
+        "startend": {
+            "type": "object",
+            "required": ["start", "end"],
+            "properties": {
+                "start": {
+                    "type": "integer",
+                    "minimum": 1,
+                    "description": "Start position (one-based, following GFF format)"
+                },
+                "end": {
+                    "type": "integer",
+                    "minimum": 2,
+                    "description": "End position (one-based, following GFF format). This value _must_ be greater than the start."
+                }
+            }
+        },
+        "segments": {
+            "type": "object",
+            "required": ["segments"],
+            "properties": {
+                "segments": {
+                    "type": "array",
+                    "minItems": 1,
+                    "items": {
+                        "type": "object",
+                        "allOf": [{ "$ref": "#/$defs/startend" }]
+                    }
                 }
             }
         }

diff --git a/augur/data/schema-export-v2.json b/augur/data/schema-export-v2.json
@@ -51,44 +51,7 @@
                     }
                 },
                 "genome_annotations": {
-                    "description": "Genome annotations (e.g. genes), relative to the reference genome",
-                    "$comment": "Required for the entropy panel",
-                    "type": "object",
-                    "required": ["nuc"],
-                    "additionalProperties": false,
-                    "properties": {
-                        "nuc": {
-                            "type": "object",
-                            "properties": {
-                                "seqid":{
-                                    "description": "Sequence on which the coordinates below are valid. Could be viral segment, bacterial contig, etc",
-                                    "$comment": "currently unused by Auspice",
-                                    "type": "string"
-                                },
-                                "type": {
-                                    "description": "Type of the feature. could be mRNA, CDS, or similar",
-                                    "$comment": "currently unused by Auspice",
-                                    "type": "string"
-                                },
-                                "start": {
-                                    "description": "Gene start position (one-based, following GFF format)",
-                                    "type": "number"
-                                },
-                                "end": {
-                                    "description": "Gene end position (one-based closed, last position of feature, following GFF format)",
-                                    "type": "number"
-                                },
-                                "strand": {
-                                    "description": "Positive or negative strand",
-                                    "type": "string",
-                                    "enum": ["-","+"]
-                                }
-                            }
-                        }
-                    },
-                    "patternProperties": {
-                        "^[a-zA-Z0-9*_-]+$": {"$ref": "#/properties/meta/properties/genome_annotations/properties/nuc"}
-                    }
+                    "$ref": "https://nextstrain.org/schemas/augur/annotations"
                 },
                 "filters": {
                     "description": "These appear as filters in the footer of Auspice (which populates the displayed values based upon the tree)",

diff --git a/augur/validate.py b/augur/validate.py
@@ -25,7 +25,7 @@ class ValidateError(Exception):
     pass
 
 
-def load_json_schema(path):
+def load_json_schema(path, refs=None):
     '''
     Load a JSON schema from the augur included set of schemas
     (located in augur/data)
@@ -40,7 +40,28 @@ def load_json_schema(path):
         Validator.check_schema(schema)
     except jsonschema.exceptions.SchemaError as err:
         raise ValidateError(f"Schema {path} is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}")
-    return Validator(schema)
+
+    if refs:
+        # Make the validator aware of additional schemas
+        schema_store = {k: json.loads(resource_string(__package__, os.path.join("data", v))) for k,v in refs.items()}
+        resolver = jsonschema.RefResolver.from_schema(schema,store=schema_store)
+        schema_validator = Validator(schema, resolver=resolver)
+    else:
+        schema_validator = Validator(schema)
+
+    # By default $ref URLs which we don't define in a schema_store are fetched
+    # by jsonschema. This often indicates a typo (the $ref doesn't match the key
+    # of the schema_store) or we forgot to add a local mapping for a new $ref.
+    # Either way, Augur should not be accessing the network. 
+    def resolve_remote(url):
+        # The exception type is not important as jsonschema will catch & re-raise as a RefResolutionError
+        raise Exception(f"The schema used for validation attempted to fetch the remote URL '{url!r}'. " +
+                        "Augur should resolve schema references to local files, please check the schema used " +
+                        "and update the appropriate schema_store as needed." )
+    schema_validator.resolver.resolve_remote = resolve_remote
+
+    return schema_validator
+
 
 def load_json(path):
     with open(path, 'rb') as fh:
@@ -163,7 +184,15 @@ def auspice_config_v2(config_json, **kwargs):
     validate(config, schema, config_json)
 
 def export_v2(main_json, **kwargs):
-    main_schema = load_json_schema("schema-export-v2.json")
+    # The main_schema uses references to other schemas, and the suggested use is
+    # to define these refs as valid URLs. Augur itself should not access schemas
+    # over the wire so we provide a mapping between URLs and filepaths here. The
+    # filepath is specified relative to ./augur/data (where all the schemas
+    # live).
+    refs = {
+        'https://nextstrain.org/schemas/augur/annotations': "schema-annotations.json"
+    }
+    main_schema = load_json_schema("schema-export-v2.json", refs)
 
     if main_json.endswith("frequencies.json") or main_json.endswith("entropy.json") or main_json.endswith("sequences.json"):
         raise ValidateError("This validation subfunction is for the main `augur export v2` JSON only.")

diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -4,7 +4,10 @@
 from augur.validate import (
     validate_collection_config_fields,
     validate_collection_display_defaults,
-    validate_measurements_config
+    validate_measurements_config,
+    load_json_schema,
+    validate_json,
+    ValidateError
 )
 
 
@@ -88,3 +91,71 @@ def test_validate_measurements_config_invalid_default_collection(self, example_m
         }
         assert not validate_measurements_config(measurements)
         assert capsys.readouterr().err == "ERROR: The default collection key 'invalid_collection' does not match any of the collections' keys.\n"
+
+
+@pytest.fixture
+def genome_annotation_schema():
+    return load_json_schema("schema-annotations.json")
+
+class TestValidateGenomeAnnotations():
+    def test_negative_strand_nuc(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 200, "strand": "-"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_nuc_not_starting_at_one(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 100, "end": 200, "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_missing_nuc(self, capsys, genome_annotation_schema):
+        d = {"cds": {"start": 100, "end": 200, "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_missing_properties(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20, "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_not_stranded_cds(self, capsys, genome_annotation_schema):
+        # Strand . is for features that are not stranded (as per GFF spec), and thus they're not CDSs
+        d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 18, "end": 20, "strand": "."}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_negative_coordinates(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": -2, "end": 10, "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_valid_genome(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100}, "cds": {"start": 20,  "end": 28, "strand": "+"}}
+        validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_valid_segmented_genome(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100},
+             "cds": {"segments": [{"start": 20,  "end": 28}], "strand": "+"}}
+        validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_invalid_segmented_genome(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100},
+             "cds": {"segments": [{"start": 20,  "end": 28}, {"start": 27}], "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
+
+    def test_string_coordinates(self, capsys, genome_annotation_schema):
+        d = {"nuc": {"start": 1, "end": 100},
+             "cds": {"segments": [{"start": 20,  "end": 28}, {"start": "27", "end": "29"}], "strand": "+"}}
+        with pytest.raises(ValidateError):
+            validate_json(d, genome_annotation_schema, "<test-json>")
+        capsys.readouterr() # suppress validation error printing
diff --git a/tests/util_support/test_node_data_file.py b/tests/util_support/test_node_data_file.py
@@ -38,7 +38,7 @@ def test_validate_valid(self, build_node_data_file):
         build_node_data_file(
             f"""
             {{
-                "annotations": {{ "a": {{ "start": 5 }} }},
+                "annotations": {{ "nuc": {{ "start": 1, "end": 100 }} }},
                 "generated_by": {{ "program": "augur", "version": "{__version__}" }},
                 "nodes": {{ "a": 5 }}
             }}