Adjustments for new Plugins #1351

Draft · wants to merge 16 commits into main

4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,8 @@
 # Changelog
 
-## v0.0.25 (in progress)
+## v0.0.25
 - Update GeoMx NGS directory schema
+- Added EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP
 
 ## v0.0.24
 - Release MERFISH
@@ -21,6 +22,7 @@
 - Update Visium with probes directory schema
 - Update Visium no probes directory schema
 - Change to EntityTypeInfo constraint format to support constraints endpoint
+- Adding support for EPIC's new plugin
 
 ## v0.0.23
 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib
5 changes: 3 additions & 2 deletions examples/dataset-examples/bad-no-assay-type/README.md
@@ -1,4 +1,5 @@
 ```
 Preflight Errors:
-- No assay_type or dataset_type in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
-```
+- 'Required dataset field not present in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
+One of the following is required: assay_type, dataset_type, derived_dataset_type.'
+```
15 changes: 15 additions & 0 deletions src/ingest_validation_tools/enums.py
@@ -208,3 +208,18 @@ class Sample(EntityTypes):
     @classmethod
     def with_parent_type(cls):
         return [*[entity_type for entity_type in cls], OtherTypes.SAMPLE]
+
+
+# These should all be considered to be mutually exclusive,
+# even within the same type
+UNIQUE_FIELDS_MAP = {
+    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
+    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
+    DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"},
+    OtherTypes.SOURCE: {"strain_rrid"},
+    OtherTypes.ORGAN: {"organ_id"},  # Deprecated
+    OtherTypes.SAMPLE: {"sample_id"},
+}
+OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
+    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
+}
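
For context, a quick sanity check of the new map's shape. This is a hypothetical usage sketch, not part of the PR; the asserts just restate what the hunk above adds:

```python
from ingest_validation_tools.enums import (
    OTHER_FIELDS_UNIQUE_FIELDS_MAP,
    UNIQUE_FIELDS_MAP,
    DatasetType,
)

# The dataset entry now carries three mutually exclusive fields, including
# the new EPIC field derived_dataset_type.
assert UNIQUE_FIELDS_MAP[DatasetType.DATASET] == {
    "assay_type",
    "dataset_type",
    "derived_dataset_type",
}

# The derived map drops only the dataset entry, so callers that care about
# non-dataset TSVs (antibodies, contributors, etc.) can use it directly.
assert DatasetType.DATASET not in OTHER_FIELDS_UNIQUE_FIELDS_MAP
assert UNIQUE_FIELDS_MAP.keys() - OTHER_FIELDS_UNIQUE_FIELDS_MAP.keys() == {DatasetType.DATASET}
```
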
27 changes: 24 additions & 3 deletions src/ingest_validation_tools/plugin_validator.py
@@ -3,7 +3,7 @@
 from collections.abc import Iterator
 from importlib import util
 from pathlib import Path
-from typing import List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union
 
 from ingest_validation_tools.schema_loader import SchemaVersion
 
@@ -53,6 +53,9 @@ def __init__(
         assay_type: str,
         contains: List = [],
         verbose: bool = False,
+        metadata_tsv: SchemaVersion = None,
+        globus_token: str = None,
+        app_context: Dict[str, str] = {},
         **kwargs,
     ):
         """
@@ -72,6 +75,9 @@
         self.assay_type = assay_type
         self.contains = contains
         self.verbose = verbose
+        self.metadata_tsv = metadata_tsv
+        self.token = globus_token
+        self.app_context = app_context
 
     def _log(self, message):
         if self.verbose:
@@ -99,6 +105,8 @@ def run_plugin_validators_iter(
     plugin_dir: PathOrStr,
     is_shared_upload: bool,
     verbose: bool = True,
+    globus_token: str = None,
+    app_context: Dict[str, str] = {},
     **kwargs,
 ) -> Iterator[KeyValuePair]:
     """
@@ -134,7 +142,15 @@
             raise ValidatorError(f"{data_path} should be the base directory of a dataset")
         data_paths.append(data_path)
         for k, v in validation_error_iter(
-            data_paths, sv.dataset_type, plugin_dir, sv.contains, verbose=verbose, **kwargs
+            data_paths,
+            sv.dataset_type,
+            plugin_dir,
+            sv.contains,
+            verbose=verbose,
+            metadata_tsv=sv,
+            globus_token=globus_token,
+            app_context=app_context,
+            **kwargs,
         ):
             yield k, v
     else:
@@ -179,6 +195,9 @@ def validation_error_iter(
     plugin_dir: PathOrStr,
     contains: List,
     verbose: bool = False,
+    metadata_tsv: SchemaVersion = None,
+    globus_token: str = None,
+    app_context: Dict[str, str] = {},
     **kwargs,
 ) -> Iterator[KeyValuePair]:
     """
@@ -195,6 +214,8 @@
         error messages
     """
     for cls in validation_class_iter(plugin_dir):
-        validator = cls(paths, assay_type, contains, verbose)
+        validator = cls(
+            paths, assay_type, contains, verbose, metadata_tsv, globus_token, app_context
+        )
         for err in validator.collect_errors(**kwargs):
             yield cls, err
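
The three new constructor arguments surface on the validator as `self.metadata_tsv`, `self.token`, and `self.app_context`. A minimal sketch of a plugin that consumes them, assuming the module's base class is named `Validator` and plugins override `collect_errors` as above; the class name, checks, and error strings are illustrative only:

```python
from typing import List, Optional

from ingest_validation_tools.plugin_validator import Validator  # assumed base class name


class ContextCheckValidator(Validator):
    """Hypothetical plugin relying on the context added in this PR."""

    description = "Verify token and app context reached the plugin layer"

    def collect_errors(self, **kwargs) -> List[Optional[str]]:
        errors: List[Optional[str]] = []
        # self.metadata_tsv holds the SchemaVersion for the metadata TSV,
        # self.token the forwarded globus_token, and self.app_context the
        # URL map assembled by Upload.get_app_context.
        if self.token is None:
            errors.append("No globus_token was forwarded to this plugin.")
        if not self.app_context.get("uuid_url"):
            errors.append("app_context is missing a uuid_url entry.")
        return errors
```
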
21 changes: 15 additions & 6 deletions src/ingest_validation_tools/schema_loader.py
@@ -8,6 +8,7 @@
 from typing import Dict, List, Optional, Sequence, Set, Union
 
 from ingest_validation_tools.enums import (
+    UNIQUE_FIELDS_MAP,
     DatasetType,
     EntityTypes,
     OtherTypes,
@@ -91,13 +92,8 @@ def get_row_data(self):
             self.is_cedar = True
         else:
             self.is_cedar = False
+        self.get_dataset_type_value()
         self.version = self.rows[0].get("version")
-        assay_type = self.rows[0].get("assay_type")
-        dataset_type = self.rows[0].get("dataset_type")
-        if assay_type is not None and dataset_type is not None:
-            raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!")
-        else:
-            self.dataset_type = assay_type if assay_type else dataset_type
 
     def get_assayclassifier_data(self):
         self.vitessce_hints = self.soft_assay_data.get("vitessce-hints", [])
@@ -109,6 +105,19 @@
         contains = self.soft_assay_data.get("must-contain", [])
         self.contains = [schema.lower() for schema in contains]
 
+    def get_dataset_type_value(self):
+        dataset_fields = {
+            k: v for k, v in self.rows[0].items() if k in UNIQUE_FIELDS_MAP[DatasetType.DATASET]
+        }
+        values_found = list(dataset_fields.values())
+        if len(values_found) == 0:
+            return
+        elif len(values_found) > 1:
+            raise PreflightError(
+                f"Found multiple dataset fields for path {self.path}: {dataset_fields}"
+            )
+        self.dataset_type = values_found[0]
+
 
 @dataclass
 class EntityTypeInfo:
Expand Down
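To make the new selection rule concrete, here is a standalone restatement of `get_dataset_type_value` covering its three outcomes. The helper and example rows are illustrative; only the field names and error text come from the hunk above:

```python
from typing import Optional

DATASET_FIELDS = {"assay_type", "dataset_type", "derived_dataset_type"}


def pick_dataset_type(row: dict, path: str = "example.tsv") -> Optional[str]:
    """Mirrors the selection logic in SchemaVersion.get_dataset_type_value."""
    dataset_fields = {k: v for k, v in row.items() if k in DATASET_FIELDS}
    values_found = list(dataset_fields.values())
    if len(values_found) == 0:
        return None  # the method just returns; dataset_type stays unset
    elif len(values_found) > 1:
        # the real method raises PreflightError with the same message
        raise ValueError(f"Found multiple dataset fields for path {path}: {dataset_fields}")
    return values_found[0]


assert pick_dataset_type({"dataset_type": "RNAseq"}) == "RNAseq"
assert pick_dataset_type({"sample_id": "ABC123"}) is None
# Two of the mutually exclusive fields in one row raises:
# pick_dataset_type({"assay_type": "CODEX", "derived_dataset_type": "epic"})
```
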
5 changes: 4 additions & 1 deletion src/ingest_validation_tools/upload.py
@@ -181,7 +181,7 @@ def get_app_context(self, submitted_app_context: Dict):
         Ensure that all default values are present, but privilege any
         submitted values (after making a basic validity check).
         """
-        for url_type in ["entities_url", "ingest_url", "constraints_url"]:
+        for url_type in ["entities_url", "ingest_url", "constraints_url", "uuid_url"]:
             if submitted_app_context.get(url_type):
                 split_url = urlsplit(submitted_app_context[url_type])
                 assert (
@@ -193,6 +193,7 @@
             "request_header": {"X-Hubmap-Application": "ingest-pipeline"},
             # TODO: does not work in HuBMAP currently
             "constraints_url": None,
+            "uuid_url": "https://uuid.api.hubmapconsortium.org/uuid/",
         } | submitted_app_context
 
     def validation_routine(
@@ -444,6 +445,8 @@ def _get_plugin_errors(self, **kwargs) -> dict:
                 plugin_path,
                 self.is_shared_upload,
                 verbose=self.verbose,
+                globus_token=self.globus_token,
+                app_context=self.app_context,
                 **kwargs,
             ):
                 if v is None:
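
`get_app_context` merges defaults with submitted values via dict union, so a caller-supplied `uuid_url` overrides the new default. A minimal standalone illustration; the override URL is hypothetical, while the defaults are the ones visible in the hunk above:

```python
# Dict union (PEP 584): the right-hand operand wins on key collisions.
defaults = {
    "request_header": {"X-Hubmap-Application": "ingest-pipeline"},
    "constraints_url": None,  # TODO in the source: not working in HuBMAP yet
    "uuid_url": "https://uuid.api.hubmapconsortium.org/uuid/",
}
submitted = {"uuid_url": "https://uuid.test.example/uuid/"}  # hypothetical override

merged = defaults | submitted
assert merged["uuid_url"] == "https://uuid.test.example/uuid/"  # submitted wins
assert merged["constraints_url"] is None  # untouched default survives
```
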
25 changes: 11 additions & 14 deletions src/ingest_validation_tools/validation_utils.py
@@ -12,7 +12,14 @@
     DirectoryValidationErrors,
     validate_directory,
 )
-from ingest_validation_tools.enums import DatasetType, EntityTypes, OtherTypes, Sample
+from ingest_validation_tools.enums import (
+    OTHER_FIELDS_UNIQUE_FIELDS_MAP,
+    UNIQUE_FIELDS_MAP,
+    DatasetType,
+    EntityTypes,
+    OtherTypes,
+    Sample,
+)
 from ingest_validation_tools.schema_loader import (
     EntityTypeInfo,
     PreflightError,
@@ -21,18 +28,6 @@
 )
 from ingest_validation_tools.table_validator import ReportType
 
-UNIQUE_FIELDS_MAP = {
-    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
-    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
-    DatasetType.DATASET: {"assay_type", "dataset_type"},
-    OtherTypes.SOURCE: {"strain_rrid"},
-    OtherTypes.ORGAN: {"organ_id"},  # Deprecated?
-    OtherTypes.SAMPLE: {"sample_id"},
-}
-OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
-    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
-}
-
 
 def match_field_in_unique_fields(
     match_fields: list, path: str, dataset=True
@@ -86,7 +81,9 @@ def get_schema_version(
         return other_type
     message = []
     if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]:
-        message.append(f"No assay_type or dataset_type in {path}.")
+        message.append(
+            f"Required dataset field not present in {path}. One of the following is required: {', '.join(sorted(UNIQUE_FIELDS_MAP[DatasetType.DATASET]))}"
+        )
     if "channel_id" in rows[0]:
         message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.')
     elif "orcid_id" in rows[0]:
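
The rewritten preflight message is assembled from the same `UNIQUE_FIELDS_MAP` entry, so it stays in sync if more dataset fields are added later. Rendering it standalone (the path is illustrative; compare the README fixture earlier in this diff):

```python
fields = sorted({"assay_type", "dataset_type", "derived_dataset_type"})
path = "upload/bad-metadata.tsv"

msg = (
    f"Required dataset field not present in {path}. "
    f"One of the following is required: {', '.join(fields)}"
)
assert msg == (
    "Required dataset field not present in upload/bad-metadata.tsv. "
    "One of the following is required: assay_type, dataset_type, derived_dataset_type"
)
```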