Adjustments for new Plugins #1351

Draft · wants to merge 16 commits into main

4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,8 @@
 # Changelog
 
-## v0.0.25 (in progress)
+## v0.0.25
 - Update GeoMx NGS directory schema
+- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP
 
 ## v0.0.24
 - Release MERFISH
@@ -21,6 +22,7 @@
 - Update Visium with probes directory schema
 - Update Visium no probes directory schema
 - Change to EntityTypeInfo constraint format to support constraints endpoint
+- Adding support for EPIC's new plugin
 
 ## v0.0.23
 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib
5 changes: 3 additions & 2 deletions examples/dataset-examples/bad-no-assay-type/README.md
@@ -1,4 +1,5 @@
 ```
 Preflight Errors:
-- No assay_type or dataset_type in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
-```
+- 'Required dataset field not present in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
+One of the following is required: assay_type, dataset_type, derived_dataset_type.'
+```
15 changes: 15 additions & 0 deletions src/ingest_validation_tools/enums.py
@@ -208,3 +208,18 @@ class Sample(EntityTypes):
     @classmethod
     def with_parent_type(cls):
         return [*[entity_type for entity_type in cls], OtherTypes.SAMPLE]
+
+
+# These should all be considered to be mutually exclusive,
+# even within the same type
+UNIQUE_FIELDS_MAP = {
+    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
+    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
+    DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"},
+    OtherTypes.SOURCE: {"strain_rrid"},
+    OtherTypes.ORGAN: {"organ_id"},  # Deprecated
+    OtherTypes.SAMPLE: {"sample_id"},
+}
+OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
+    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
+}
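
For context, a quick sanity check of the new map's shape. This is a hypothetical usage sketch, not part of the PR; the asserts just restate what the hunk above adds:

```python
from ingest_validation_tools.enums import (
    OTHER_FIELDS_UNIQUE_FIELDS_MAP,
    UNIQUE_FIELDS_MAP,
    DatasetType,
)

# The dataset entry now carries three mutually exclusive fields, including
# the new EPIC field derived_dataset_type.
assert UNIQUE_FIELDS_MAP[DatasetType.DATASET] == {
    "assay_type",
    "dataset_type",
    "derived_dataset_type",
}

# The derived map drops only the dataset entry, so callers that care about
# non-dataset TSVs (antibodies, contributors, etc.) can use it directly.
assert DatasetType.DATASET not in OTHER_FIELDS_UNIQUE_FIELDS_MAP
assert UNIQUE_FIELDS_MAP.keys() - OTHER_FIELDS_UNIQUE_FIELDS_MAP.keys() == {DatasetType.DATASET}
```
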
27 changes: 24 additions & 3 deletions src/ingest_validation_tools/plugin_validator.py
@@ -3,7 +3,7 @@
 from collections.abc import Iterator
 from importlib import util
 from pathlib import Path
-from typing import List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union
 
 from ingest_validation_tools.schema_loader import SchemaVersion
 
@@ -53,6 +53,9 @@ def __init__(
         assay_type: str,
         contains: List = [],
         verbose: bool = False,
+        metadata_tsv: SchemaVersion = None,
+        globus_token: str = None,
+        app_context: Dict[str, str] = {},
         **kwargs,
     ):
         """
@@ -72,6 +75,9 @@
         self.assay_type = assay_type
         self.contains = contains
         self.verbose = verbose
+        self.metadata_tsv = metadata_tsv
+        self.token = globus_token
+        self.app_context = app_context
 
     def _log(self, message):
         if self.verbose:
@@ -99,6 +105,8 @@ def run_plugin_validators_iter(
     plugin_dir: PathOrStr,
     is_shared_upload: bool,
     verbose: bool = True,
+    globus_token: str = None,
+    app_context: Dict[str, str] = {},
     **kwargs,
 ) -> Iterator[KeyValuePair]:
     """
@@ -134,7 +142,15 @@
             raise ValidatorError(f"{data_path} should be the base directory of a dataset")
         data_paths.append(data_path)
         for k, v in validation_error_iter(
-            data_paths, sv.dataset_type, plugin_dir, sv.contains, verbose=verbose, **kwargs
+            data_paths,
+            sv.dataset_type,
+            plugin_dir,
+            sv.contains,
+            verbose=verbose,
+            metadata_tsv=sv,
+            globus_token=globus_token,
+            app_context=app_context,
+            **kwargs,
         ):
             yield k, v
     else:
@@ -179,6 +195,9 @@ def validation_error_iter(
     plugin_dir: PathOrStr,
     contains: List,
     verbose: bool = False,
+    metadata_tsv: SchemaVersion = None,
+    globus_token: str = None,
+    app_context: Dict[str, str] = {},
     **kwargs,
 ) -> Iterator[KeyValuePair]:
     """
@@ -195,6 +214,8 @@
         error messages
     """
     for cls in validation_class_iter(plugin_dir):
-        validator = cls(paths, assay_type, contains, verbose)
+        validator = cls(
+            paths, assay_type, contains, verbose, metadata_tsv, globus_token, app_context
+        )
         for err in validator.collect_errors(**kwargs):
             yield cls, err
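
The three new constructor arguments surface on the validator as `self.metadata_tsv`, `self.token`, and `self.app_context`. A minimal sketch of a plugin that consumes them, assuming the module's base class is named `Validator` and plugins override `collect_errors` as above; the class name, checks, and error strings are illustrative only:

```python
from typing import List, Optional

from ingest_validation_tools.plugin_validator import Validator  # assumed base class name


class ContextCheckValidator(Validator):
    """Hypothetical plugin relying on the context added in this PR."""

    description = "Verify token and app context reached the plugin layer"

    def collect_errors(self, **kwargs) -> List[Optional[str]]:
        errors: List[Optional[str]] = []
        # self.metadata_tsv holds the SchemaVersion for the metadata TSV,
        # self.token the forwarded globus_token, and self.app_context the
        # URL map assembled by Upload.get_app_context.
        if self.token is None:
            errors.append("No globus_token was forwarded to this plugin.")
        if not self.app_context.get("uuid_url"):
            errors.append("app_context is missing a uuid_url entry.")
        return errors
```
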
21 changes: 15 additions & 6 deletions src/ingest_validation_tools/schema_loader.py
@@ -8,6 +8,7 @@
 from typing import Dict, List, Optional, Sequence, Set, Union
 
 from ingest_validation_tools.enums import (
+    UNIQUE_FIELDS_MAP,
     DatasetType,
     EntityTypes,
     OtherTypes,
@@ -91,13 +92,8 @@ def get_row_data(self):
             self.is_cedar = True
         else:
             self.is_cedar = False
+        self.get_dataset_type_value()
         self.version = self.rows[0].get("version")
-        assay_type = self.rows[0].get("assay_type")
-        dataset_type = self.rows[0].get("dataset_type")
-        if assay_type is not None and dataset_type is not None:
-            raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!")
-        else:
-            self.dataset_type = assay_type if assay_type else dataset_type
 
     def get_assayclassifier_data(self):
         self.vitessce_hints = self.soft_assay_data.get("vitessce-hints", [])
@@ -109,6 +105,19 @@
         contains = self.soft_assay_data.get("must-contain", [])
         self.contains = [schema.lower() for schema in contains]
 
+    def get_dataset_type_value(self):
+        dataset_fields = {
+            k: v for k, v in self.rows[0].items() if k in UNIQUE_FIELDS_MAP[DatasetType.DATASET]
+        }
+        values_found = list(dataset_fields.values())
+        if len(values_found) == 0:
+            return
+        elif len(values_found) > 1:
+            raise PreflightError(
+                f"Found multiple dataset fields for path {self.path}: {dataset_fields}"
+            )
+        self.dataset_type = values_found[0]
+
 
 @dataclass
 class EntityTypeInfo:
Expand Down
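To make the new selection rule concrete, here is a standalone restatement of `get_dataset_type_value` covering its three outcomes. The helper and example rows are illustrative; only the field names and error text come from the hunk above:

```python
from typing import Optional

DATASET_FIELDS = {"assay_type", "dataset_type", "derived_dataset_type"}


def pick_dataset_type(row: dict, path: str = "example.tsv") -> Optional[str]:
    """Mirrors the selection logic in SchemaVersion.get_dataset_type_value."""
    dataset_fields = {k: v for k, v in row.items() if k in DATASET_FIELDS}
    values_found = list(dataset_fields.values())
    if len(values_found) == 0:
        return None  # the method just returns; dataset_type stays unset
    elif len(values_found) > 1:
        # the real method raises PreflightError with the same message
        raise ValueError(f"Found multiple dataset fields for path {path}: {dataset_fields}")
    return values_found[0]


assert pick_dataset_type({"dataset_type": "RNAseq"}) == "RNAseq"
assert pick_dataset_type({"sample_id": "ABC123"}) is None
# Two of the mutually exclusive fields in one row raises:
# pick_dataset_type({"assay_type": "CODEX", "derived_dataset_type": "epic"})
```
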
5 changes: 4 additions & 1 deletion src/ingest_validation_tools/upload.py
@@ -181,7 +181,7 @@ def get_app_context(self, submitted_app_context: Dict):
         Ensure that all default values are present, but privilege any
         submitted values (after making a basic validity check).
         """
-        for url_type in ["entities_url", "ingest_url", "constraints_url"]:
+        for url_type in ["entities_url", "ingest_url", "constraints_url", "uuid_url"]:
             if submitted_app_context.get(url_type):
                 split_url = urlsplit(submitted_app_context[url_type])
                 assert (
@@ -193,6 +193,7 @@
             "request_header": {"X-Hubmap-Application": "ingest-pipeline"},
             # TODO: does not work in HuBMAP currently
             "constraints_url": None,
+            "uuid_url": "https://uuid.api.hubmapconsortium.org/uuid/",
         } | submitted_app_context
 
     def validation_routine(
@@ -444,6 +445,8 @@ def _get_plugin_errors(self, **kwargs) -> dict:
                 plugin_path,
                 self.is_shared_upload,
                 verbose=self.verbose,
+                globus_token=self.globus_token,
+                app_context=self.app_context,
                 **kwargs,
             ):
                 if v is None:
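
`get_app_context` merges defaults with submitted values via dict union, so a caller-supplied `uuid_url` overrides the new default. A minimal standalone illustration; the override URL is hypothetical, while the defaults are the ones visible in the hunk above:

```python
# Dict union (PEP 584): the right-hand operand wins on key collisions.
defaults = {
    "request_header": {"X-Hubmap-Application": "ingest-pipeline"},
    "constraints_url": None,  # TODO in the source: not working in HuBMAP yet
    "uuid_url": "https://uuid.api.hubmapconsortium.org/uuid/",
}
submitted = {"uuid_url": "https://uuid.test.example/uuid/"}  # hypothetical override

merged = defaults | submitted
assert merged["uuid_url"] == "https://uuid.test.example/uuid/"  # submitted wins
assert merged["constraints_url"] is None  # untouched default survives
```
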
25 changes: 11 additions & 14 deletions src/ingest_validation_tools/validation_utils.py
@@ -12,7 +12,14 @@
     DirectoryValidationErrors,
     validate_directory,
 )
-from ingest_validation_tools.enums import DatasetType, EntityTypes, OtherTypes, Sample
+from ingest_validation_tools.enums import (
+    OTHER_FIELDS_UNIQUE_FIELDS_MAP,
+    UNIQUE_FIELDS_MAP,
+    DatasetType,
+    EntityTypes,
+    OtherTypes,
+    Sample,
+)
 from ingest_validation_tools.schema_loader import (
     EntityTypeInfo,
     PreflightError,
@@ -21,18 +28,6 @@
 )
 from ingest_validation_tools.table_validator import ReportType
 
-UNIQUE_FIELDS_MAP = {
-    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
-    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
-    DatasetType.DATASET: {"assay_type", "dataset_type"},
-    OtherTypes.SOURCE: {"strain_rrid"},
-    OtherTypes.ORGAN: {"organ_id"},  # Deprecated?
-    OtherTypes.SAMPLE: {"sample_id"},
-}
-OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
-    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
-}
-
 
 def match_field_in_unique_fields(
     match_fields: list, path: str, dataset=True
@@ -86,7 +81,9 @@ def get_schema_version(
         return other_type
     message = []
     if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]:
-        message.append(f"No assay_type or dataset_type in {path}.")
+        message.append(
+            f"Required dataset field not present in {path}. One of the following is required: {', '.join(sorted(UNIQUE_FIELDS_MAP[DatasetType.DATASET]))}"
+        )
     if "channel_id" in rows[0]:
         message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.')
     elif "orcid_id" in rows[0]:
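
The rewritten preflight message is assembled from the same `UNIQUE_FIELDS_MAP` entry, so it stays in sync if more dataset fields are added later. Rendering it standalone (the path is illustrative; compare the README fixture earlier in this diff):

```python
fields = sorted({"assay_type", "dataset_type", "derived_dataset_type"})
path = "upload/bad-metadata.tsv"

msg = (
    f"Required dataset field not present in {path}. "
    f"One of the following is required: {', '.join(fields)}"
)
assert msg == (
    "Required dataset field not present in upload/bad-metadata.tsv. "
    "One of the following is required: assay_type, dataset_type, derived_dataset_type"
)
```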