Skip to content

Commit

Permalink
Add static_checker to parser utils (#193)
Browse files Browse the repository at this point in the history
  • Loading branch information
GlassOfWhiskey authored Jan 31, 2023
1 parent e7c5fa0 commit 1ecaedc
Show file tree
Hide file tree
Showing 13 changed files with 1,502 additions and 100 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ include testdata/*.yaml
include testdata/*.input
include testdata/*.ttl
include testdata/*.owl
include testdata/checker_wf/*.cwl
include cwl_utils/py.typed
include docs/conf.py docs/Makefile docs/_static/favicon.ico docs/requirements.txt
include docs/*.rst
Expand Down
63 changes: 61 additions & 2 deletions cwl_utils/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,97 @@
from ..errors import GraphTargetMissingException
from . import cwl_v1_0, cwl_v1_1, cwl_v1_2


# Version-spanning aliases: each name below groups the equivalent class from
# every supported CWL v1.x parser module. ``Union`` aliases are meant for type
# annotations; the ``*Types`` tuples are meant for runtime ``isinstance`` checks.
LoadingOptions = Union[
    cwl_v1_0.LoadingOptions, cwl_v1_1.LoadingOptions, cwl_v1_2.LoadingOptions
]
"""Type union for a CWL v1.x LoadingOptions object."""
Saveable = Union[cwl_v1_0.Saveable, cwl_v1_1.Saveable, cwl_v1_2.Saveable]
"""Type union for a CWL v1.x Saveable object."""
InputParameter = Union[
    cwl_v1_0.InputParameter, cwl_v1_1.InputParameter, cwl_v1_2.InputParameter
]
"""Type union for a CWL v1.x InputParameter object."""
OutputParameter = Union[
    cwl_v1_0.OutputParameter, cwl_v1_1.OutputParameter, cwl_v1_2.OutputParameter
]
"""Type union for a CWL v1.x OutputParameter object."""
Workflow = Union[cwl_v1_0.Workflow, cwl_v1_1.Workflow, cwl_v1_2.Workflow]
"""Type union for a CWL v1.x Workflow object."""
WorkflowTypes = (cwl_v1_0.Workflow, cwl_v1_1.Workflow, cwl_v1_2.Workflow)
"""Tuple of the CWL v1.x Workflow classes, usable with isinstance()."""
# CWL v1.0 contributes its plain InputParameter to this union.
WorkflowInputParameter = Union[
    cwl_v1_0.InputParameter,
    cwl_v1_1.WorkflowInputParameter,
    cwl_v1_2.WorkflowInputParameter,
]
"""Type union for a CWL v1.x WorkflowInputParameter object."""
WorkflowOutputParameter = Union[
    cwl_v1_0.WorkflowOutputParameter,
    cwl_v1_1.WorkflowOutputParameter,
    cwl_v1_2.WorkflowOutputParameter,
]
"""Type union for a CWL v1.x WorkflowOutputParameter object."""
WorkflowStep = Union[
    cwl_v1_0.WorkflowStep, cwl_v1_1.WorkflowStep, cwl_v1_2.WorkflowStep
]
"""Type union for a CWL v1.x WorkflowStep object."""
WorkflowStepInput = Union[
    cwl_v1_0.WorkflowStepInput, cwl_v1_1.WorkflowStepInput, cwl_v1_2.WorkflowStepInput
]
"""Type union for a CWL v1.x WorkflowStepInput object."""
WorkflowStepOutput = Union[
    cwl_v1_0.WorkflowStepOutput,
    cwl_v1_1.WorkflowStepOutput,
    cwl_v1_2.WorkflowStepOutput,
]
"""Type union for a CWL v1.x WorkflowStepOutput object."""
CommandLineTool = Union[
    cwl_v1_0.CommandLineTool, cwl_v1_1.CommandLineTool, cwl_v1_2.CommandLineTool
]
"""Type union for a CWL v1.x CommandLineTool object."""
CommandLineBinding = Union[
    cwl_v1_0.CommandLineBinding,
    cwl_v1_1.CommandLineBinding,
    cwl_v1_2.CommandLineBinding,
]
"""Type union for a CWL v1.x CommandLineBinding object."""
CommandOutputParameter = Union[
    cwl_v1_0.CommandOutputParameter,
    cwl_v1_1.CommandOutputParameter,
    cwl_v1_2.CommandOutputParameter,
]
"""Type union for a CWL v1.x CommandOutputParameter object."""
ExpressionTool = Union[
    cwl_v1_0.ExpressionTool, cwl_v1_1.ExpressionTool, cwl_v1_2.ExpressionTool
]
"""Type union for a CWL v1.x ExpressionTool object."""
DockerRequirement = Union[
    cwl_v1_0.DockerRequirement, cwl_v1_1.DockerRequirement, cwl_v1_2.DockerRequirement
]
"""Type union for a CWL v1.x DockerRequirement object."""
DockerRequirementTypes = (
    cwl_v1_0.DockerRequirement,
    cwl_v1_1.DockerRequirement,
    cwl_v1_2.DockerRequirement,
)
"""Tuple of the CWL v1.x DockerRequirement classes, usable with isinstance()."""
# Operation is only taken from the v1.2 parser module.
Process = Union[Workflow, CommandLineTool, ExpressionTool, cwl_v1_2.Operation]
"""Type Union for a CWL v1.x Process object."""
ArraySchema = Union[cwl_v1_0.ArraySchema, cwl_v1_1.ArraySchema, cwl_v1_2.ArraySchema]
"""Type Union for a CWL v1.x ArraySchema object."""
EnumSchema = Union[cwl_v1_0.EnumSchema, cwl_v1_1.EnumSchema, cwl_v1_2.EnumSchema]
"""Type Union for a CWL v1.x EnumSchema object."""
RecordSchema = Union[
    cwl_v1_0.RecordSchema, cwl_v1_1.RecordSchema, cwl_v1_2.RecordSchema
]
"""Type Union for a CWL v1.x RecordSchema object."""
File = Union[cwl_v1_0.File, cwl_v1_1.File, cwl_v1_2.File]
"""Type Union for a CWL v1.x File object."""
SecondaryFileSchema = Union[cwl_v1_1.SecondaryFileSchema, cwl_v1_2.SecondaryFileSchema]
"""Type Union for a CWL v1.1 or v1.2 SecondaryFileSchema object."""
Directory = Union[cwl_v1_0.Directory, cwl_v1_1.Directory, cwl_v1_2.Directory]
"""Type Union for a CWL v1.x Directory object."""
Dirent = Union[cwl_v1_0.Dirent, cwl_v1_1.Dirent, cwl_v1_2.Dirent]
"""Type Union for a CWL v1.x Dirent object."""

# Private union of the per-version loader base classes.
_Loader = Union[cwl_v1_0._Loader, cwl_v1_1._Loader, cwl_v1_2._Loader]

Expand Down
229 changes: 212 additions & 17 deletions cwl_utils/parser/cwl_v1_0_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
import copy
import hashlib
from typing import IO, Any, List, MutableSequence, Optional, Tuple, Union, cast
import logging
from collections import namedtuple
from typing import Any, Dict, IO, List, MutableSequence, Optional, Tuple, Union, cast

from ruamel import yaml
from schema_salad.exceptions import ValidationException
from schema_salad.utils import json_dumps
from schema_salad.sourceline import SourceLine
from schema_salad.utils import aslist, json_dumps

import cwl_utils.parser
import cwl_utils.parser.cwl_v1_0 as cwl
Expand All @@ -13,6 +17,47 @@

# Size cap in bytes (64 * 1024 = 64 KiB) for content read from files.
CONTENT_LIMIT: int = 64 * 1024

# Logger registered under the "cwl_utils" name; used for diagnostics in this module.
_logger = logging.getLogger("cwl_utils")

# Record describing one source -> sink link: the source object, the sink
# object, the linkMerge mode applied to the link, and an optional message.
SrcSink = namedtuple("SrcSink", ["src", "sink", "linkMerge", "message"])


def _compare_records(
    src: cwl.RecordSchema, sink: cwl.RecordSchema, strict: bool = False
) -> bool:
    """
    Check that every field required by the sink record is satisfied by the source.

    Field names are reduced to their short names before comparison, because the
    full names are relative to the workflow step and would never match directly.

    :param src: record schema produced by the source
    :param sink: record schema expected by the sink
    :param strict: forwarded to :func:`can_assign_src_to_sink` for each field
    :return: ``False`` as soon as one sink field cannot accept the matching
        source field (the mismatch is logged at INFO level), ``True`` otherwise
    """

    def _types_by_shortname(record: cwl.RecordSchema) -> Dict[str, Any]:
        # A record without fields is treated as having no fields at all.
        return {
            cwl.shortname(entry.name): entry.type for entry in (record.fields or {})
        }

    src_types = _types_by_shortname(src)
    sink_types = _types_by_shortname(sink)
    for name, sink_type in sink_types.items():
        # A field missing on the source side is compared as the "null" type.
        if can_assign_src_to_sink(src_types.get(name, "null"), sink_type, strict):
            continue
        # A sink field without a declared type never causes a failure.
        if sink_type is None:
            continue
        _logger.info(
            "Record comparison failure for %s and %s\n"
            "Did not match fields for %s: %s and %s",
            cast(
                Union[cwl.InputRecordSchema, cwl.CommandOutputRecordSchema], src
            ).name,
            cast(
                Union[cwl.InputRecordSchema, cwl.CommandOutputRecordSchema], sink
            ).name,
            name,
            src_types.get(name),
            sink_type,
        )
        return False
    return True


def _compare_type(type1: Any, type2: Any) -> bool:
if isinstance(type1, cwl.ArraySchema) and isinstance(type2, cwl.ArraySchema):
Expand All @@ -38,6 +83,115 @@ def _compare_type(type1: Any, type2: Any) -> bool:
return bool(type1 == type2)


def can_assign_src_to_sink(src: Any, sink: Any, strict: bool = False) -> bool:
    """
    Decide whether a value of type ``src`` may be fed into a slot of type ``sink``.

    Only the type specifications are compared; extra keys such as inputBinding
    play no role.

    :param src: admissible source type(s); a list is treated as a union
    :param sink: admissible sink type(s); a list is treated as a union
    :param strict: when ``src`` is a union, require *every* member to be
        accepted by the sink; otherwise one non-``"null"`` member is enough
    :return: ``True`` when the assignment is acceptable

    Note: the recursive checks on individual union members are performed with
    the default (non-strict) setting; ``strict`` is only forwarded when
    descending into array items and record fields.
    """
    # "Any" on either side is compatible with everything.
    if src == "Any" or sink == "Any":
        return True

    src_is_array = isinstance(src, cwl.ArraySchema)
    if src_is_array and isinstance(sink, cwl.ArraySchema):
        return can_assign_src_to_sink(src.items, sink.items, strict)

    src_is_record = isinstance(src, cwl.RecordSchema)
    if src_is_record and isinstance(sink, cwl.RecordSchema):
        return _compare_records(src, sink, strict)

    if isinstance(src, MutableSequence):
        if strict:
            return all(can_assign_src_to_sink(member, sink) for member in src)
        # Non-strict: a lone "null" alternative must not make the link valid.
        return any(
            member != "null" and can_assign_src_to_sink(member, sink)
            for member in src
        )

    if isinstance(sink, MutableSequence):
        return any(can_assign_src_to_sink(src, option) for option in sink)

    return bool(src == sink)


def check_all_types(
    src_dict: Dict[str, Any],
    sinks: MutableSequence[Union[cwl.WorkflowStepInput, cwl.WorkflowOutputParameter]],
    type_dict: Dict[str, Any],
) -> Dict[str, List[SrcSink]]:
    """
    Given a list of sinks, check if their types match with the types of their sources.

    :param src_dict: mapping from source identifier to the source parameter object
    :param sinks: workflow step inputs and/or workflow outputs to validate;
        objects of any other class are skipped
    :param type_dict: mapping from parameter identifier to its type, holding an
        entry for every source and every sink involved
    :return: a dict with the keys ``"warning"`` and ``"exception"``, each listing
        the :data:`SrcSink` links whose check produced that verdict
    :raises ValidationException: when a sink references a source identifier that
        is absent from ``src_dict`` (for scalar and list-valued sources alike),
        or when the sink carries an invalid ``linkMerge`` value
    """
    validation: Dict[str, List[SrcSink]] = {"warning": [], "exception": []}
    for sink in sinks:
        if isinstance(sink, cwl.WorkflowOutputParameter):
            sourceName = "outputSource"
            sourceField = sink.outputSource
        elif isinstance(sink, cwl.WorkflowStepInput):
            sourceName = "source"
            sourceField = sink.source
        else:
            continue
        if sourceField is None:
            continue

        if isinstance(sourceField, MutableSequence):
            # Several sources default to merge_nested when nothing is declared.
            linkMerge = sink.linkMerge or (
                "merge_nested" if len(sourceField) > 1 else None
            )
            parm_ids = list(sourceField)
        else:
            linkMerge = None
            parm_ids = [cast(str, sourceField)]

        srcs_of_sink = []
        for parm_id in parm_ids:
            # Report a dangling reference with its source location instead of
            # letting a bare KeyError escape, whatever the shape of the field.
            if parm_id not in src_dict:
                raise SourceLine(sink, sourceName, ValidationException).makeError(
                    f"{sourceName} not found: {parm_id}"
                )
            srcs_of_sink.append(src_dict[parm_id])

        for src in srcs_of_sink:
            check_result = check_types(
                type_dict[cast(str, src.id)],
                type_dict[cast(str, sink.id)],
                linkMerge,
                getattr(sink, "valueFrom", None),
            )
            # "pass" has no bucket; only warnings and exceptions are recorded.
            if check_result in validation:
                validation[check_result].append(SrcSink(src, sink, linkMerge, None))
    return validation


def check_types(
    srctype: Any,
    sinktype: Any,
    linkMerge: Optional[str],
    valueFrom: Optional[str] = None,
) -> str:
    """
    Classify the compatibility of a source type with a sink type.

    :param srctype: type of the source parameter
    :param sinktype: type of the sink parameter
    :param linkMerge: ``None``, ``"merge_nested"`` or ``"merge_flattened"``
    :param valueFrom: when set, the link is always accepted
    :return: ``"pass"`` when the strict comparison succeeds, ``"warning"`` when
        only the non-strict comparison succeeds, ``"exception"`` otherwise
    :raises ValidationException: for any other ``linkMerge`` value
    """
    # With a valueFrom expression the sink type no longer depends on the source.
    if valueFrom is not None:
        return "pass"

    if linkMerge is None:
        # Try the strict comparison first, then fall back to the lenient one.
        for strict, verdict in ((True, "pass"), (False, "warning")):
            if can_assign_src_to_sink(srctype, sinktype, strict=strict):
                return verdict
        return "exception"

    # Otherwise rewrite the source type as the merge would, then compare plainly.
    if linkMerge == "merge_nested":
        merged_type = cwl.ArraySchema(items=srctype, type="array")
    elif linkMerge == "merge_flattened":
        merged_type = merge_flatten_type(srctype)
    else:
        raise ValidationException(f"Invalid value {linkMerge} for linkMerge field.")
    return check_types(merged_type, sinktype, None, None)


def content_limit_respected_read_bytes(f: IO[bytes]) -> bytes:
"""
Read file content up to 64 kB as a byte array.
Expand Down Expand Up @@ -96,6 +250,59 @@ def merge_flatten_type(src: Any) -> Any:
return cwl.ArraySchema(type="array", items=src)


def type_for_step_input(
    step: cwl.WorkflowStep,
    in_: cwl.WorkflowStepInput,
) -> Any:
    """
    Determine the type for the given step input.

    :param step: workflow step that owns the input
    :param in_: the step input whose type is requested
    :return: ``"Any"`` when the input has a ``valueFrom`` or when no input with
        the same name exists on the process run by the step; otherwise the type
        declared by that process, wrapped in an array when the input is scattered
    """
    if in_.valueFrom is not None:
        return "Any"

    step_run = cwl_utils.parser.utils.load_step(step)
    cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)
    if not (step_run and step_run.inputs):
        return "Any"

    def _fragment(identifier: Any) -> str:
        # Keep only what follows the last "#" of an identifier.
        return cast(str, identifier).split("#")[-1]

    for candidate in step_run.inputs:
        if _fragment(candidate.id) != _fragment(in_.id):
            continue
        input_type = candidate.type
        # A scattered input receives an array of the declared type.
        if step.scatter is not None and in_.id in aslist(step.scatter):
            input_type = cwl.ArraySchema(items=input_type, type="array")
        return input_type
    return "Any"


def type_for_step_output(
    step: cwl.WorkflowStep,
    sourcename: str,
) -> Any:
    """
    Determine the type for the given step output.

    :param step: workflow step that produces the output
    :param sourcename: identifier of the requested output; only the part after
        the last ``#`` and the last ``/`` is used for matching
    :return: the type declared by the process run by the step, wrapped in one
        array level when the step is scattered, or in one level per scattered
        input when the scatter method is ``nested_crossproduct``
    :raises ValidationException: when the process has no output with that name
    """
    step_run = cwl_utils.parser.utils.load_step(step)
    cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)

    def _leaf(identifier: str) -> str:
        # Reduce "file#step/name" to "name".
        return identifier.split("#")[-1].split("/")[-1]

    if step_run and step_run.outputs:
        for candidate in step_run.outputs:
            if _leaf(candidate.id) != _leaf(sourcename):
                continue
            output_type = candidate.type
            if step.scatter is not None:
                # nested_crossproduct nests once per scattered input; every
                # other scatter method adds a single array level.
                if step.scatterMethod == "nested_crossproduct":
                    nesting = len(aslist(step.scatter))
                else:
                    nesting = 1
                for _ in range(nesting):
                    output_type = cwl.ArraySchema(items=output_type, type="array")
            return output_type

    dumped_run = yaml.main.round_trip_dump(cwl.save(step_run))
    raise ValidationException(
        "param {} not found in {}.".format(sourcename, dumped_run)
    )


def type_for_source(
process: Union[cwl.CommandLineTool, cwl.Workflow, cwl.ExpressionTool],
sourcenames: Union[str, List[str]],
Expand Down Expand Up @@ -142,7 +349,7 @@ def type_for_source(
return cwl.ArraySchema(items=new_type, type="array")
elif linkMerge == "merge_flattened":
return merge_flatten_type(new_type)
elif isinstance(sourcenames, List):
elif isinstance(sourcenames, List) and len(sourcenames) > 1:
return cwl.ArraySchema(items=new_type, type="array")
else:
return new_type
Expand Down Expand Up @@ -181,26 +388,14 @@ def param_for_source_id(
== step.id.split("#")[-1]
and step.out
):
step_run = cwl_utils.parser.utils.load_step(step)
cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)
for outp in step.out:
outp_id = outp if isinstance(outp, str) else outp.id
if (
outp_id.split("#")[-1].split("/")[-1]
== sourcename.split("#")[-1].split("/")[-1]
):
step_run = step.run
if isinstance(step.run, str):
step_run = cwl_utils.parser.load_document_by_uri(
path=target.loadingOptions.fetcher.urljoin(
base_url=cast(
str, target.loadingOptions.fileuri
),
url=step.run,
),
loadingOptions=target.loadingOptions,
)
cwl_utils.parser.utils.convert_stdstreams_to_files(
step_run
)
if step_run and step_run.outputs:
for output in step_run.outputs:
if (
Expand Down
Loading

0 comments on commit 1ecaedc

Please sign in to comment.