Add static_checker to parser utils (#193)

common-workflow-language · Jan 31, 2023 · 1ecaedc · 1ecaedc
1 parent e7c5fa0
commit 1ecaedc
Show file tree

Hide file tree

Showing 13 changed files with 1,502 additions and 100 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -13,6 +13,7 @@ include testdata/*.yaml
 include testdata/*.input
 include testdata/*.ttl
 include testdata/*.owl
+include testdata/checker_wf/*.cwl
 include cwl_utils/py.typed
 include docs/conf.py docs/Makefile docs/_static/favicon.ico docs/requirements.txt
 include docs/*.rst

diff --git a/cwl_utils/parser/__init__.py b/cwl_utils/parser/__init__.py
@@ -11,38 +11,97 @@
 from ..errors import GraphTargetMissingException
 from . import cwl_v1_0, cwl_v1_1, cwl_v1_2
 
+
 LoadingOptions = Union[
     cwl_v1_0.LoadingOptions, cwl_v1_1.LoadingOptions, cwl_v1_2.LoadingOptions
 ]
 """Type union for a CWL v1.x LoadingOptions object."""
+# NOTE(review): the docstring below spells the name "Savable" while the alias
+# is "Saveable" -- presumably a typo; confirm before changing the string.
 Saveable = Union[cwl_v1_0.Saveable, cwl_v1_1.Saveable, cwl_v1_2.Saveable]
 """Type union for a CWL v1.x Savable object."""
+InputParameter = Union[
+    cwl_v1_0.InputParameter, cwl_v1_1.InputParameter, cwl_v1_2.InputParameter
+]
+"""Type union for a CWL v1.x InputParameter object."""
+OutputParameter = Union[
+    cwl_v1_0.OutputParameter, cwl_v1_1.OutputParameter, cwl_v1_2.OutputParameter
+]
+"""Type union for a CWL v1.x OutputParameter object."""
 Workflow = Union[cwl_v1_0.Workflow, cwl_v1_1.Workflow, cwl_v1_2.Workflow]
-"""Type union for a CWL v1.x Workflow object."""
+# WorkflowTypes is a plain tuple of the three per-version Workflow classes
+# (a runtime value), whereas Workflow above is a typing Union.
+# NOTE(review): the "Workflow" docstring now follows this tuple instead of the
+# Union alias -- confirm that the new placement is intended.
 WorkflowTypes = (cwl_v1_0.Workflow, cwl_v1_1.Workflow, cwl_v1_2.Workflow)
+"""Type union for a CWL v1.x Workflow object."""
+# The v1.0 member of this union is cwl_v1_0.InputParameter; the v1.1 and v1.2
+# members are the WorkflowInputParameter classes of their modules.
+WorkflowInputParameter = Union[
+    cwl_v1_0.InputParameter,
+    cwl_v1_1.WorkflowInputParameter,
+    cwl_v1_2.WorkflowInputParameter,
+]
+"""Type union for a CWL v1.x WorkflowInputParameter object."""
+WorkflowOutputParameter = Union[
+    cwl_v1_0.WorkflowOutputParameter,
+    cwl_v1_1.WorkflowOutputParameter,
+    cwl_v1_2.WorkflowOutputParameter,
+]
+"""Type union for a CWL v1.x WorkflowOutputParameter object."""
 WorkflowStep = Union[
     cwl_v1_0.WorkflowStep, cwl_v1_1.WorkflowStep, cwl_v1_2.WorkflowStep
 ]
 """Type union for a CWL v1.x WorkflowStep object."""
+WorkflowStepInput = Union[
+    cwl_v1_0.WorkflowStepInput, cwl_v1_1.WorkflowStepInput, cwl_v1_2.WorkflowStepInput
+]
+"""Type union for a CWL v1.x WorkflowStepInput object."""
+WorkflowStepOutput = Union[
+    cwl_v1_0.WorkflowStepOutput,
+    cwl_v1_1.WorkflowStepOutput,
+    cwl_v1_2.WorkflowStepOutput,
+]
+"""Type union for a CWL v1.x WorkflowStepOutput object."""
 CommandLineTool = Union[
     cwl_v1_0.CommandLineTool, cwl_v1_1.CommandLineTool, cwl_v1_2.CommandLineTool
 ]
 """Type union for a CWL v1.x CommandLineTool object."""
+CommandLineBinding = Union[
+    cwl_v1_0.CommandLineBinding,
+    cwl_v1_1.CommandLineBinding,
+    cwl_v1_2.CommandLineBinding,
+]
+"""Type union for a CWL v1.x CommandLineBinding object."""
+CommandOutputParameter = Union[
+    cwl_v1_0.CommandOutputParameter,
+    cwl_v1_1.CommandOutputParameter,
+    cwl_v1_2.CommandOutputParameter,
+]
+"""Type union for a CWL v1.x CommandOutputParameter object."""
 ExpressionTool = Union[
     cwl_v1_0.ExpressionTool, cwl_v1_1.ExpressionTool, cwl_v1_2.ExpressionTool
 ]
 """Type union for a CWL v1.x ExpressionTool object."""
 DockerRequirement = Union[
     cwl_v1_0.DockerRequirement, cwl_v1_1.DockerRequirement, cwl_v1_2.DockerRequirement
 ]
-"""Type union for a CWL v1.x DockerRequirement object."""
+# DockerRequirementTypes is a plain tuple of the three per-version classes.
+# NOTE(review): as with WorkflowTypes, the "DockerRequirement" docstring now
+# follows the tuple rather than the Union alias -- confirm this is intended.
 DockerRequirementTypes = (
     cwl_v1_0.DockerRequirement,
     cwl_v1_1.DockerRequirement,
     cwl_v1_2.DockerRequirement,
 )
+"""Type union for a CWL v1.x DockerRequirement object."""
+# Operation is referenced from the cwl_v1_2 module only.
 Process = Union[Workflow, CommandLineTool, ExpressionTool, cwl_v1_2.Operation]
 """Type Union for a CWL v1.x Process object."""
+ArraySchema = Union[cwl_v1_0.ArraySchema, cwl_v1_1.ArraySchema, cwl_v1_2.ArraySchema]
+"""Type Union for a CWL v1.x ArraySchema object."""
+EnumSchema = Union[cwl_v1_0.EnumSchema, cwl_v1_1.EnumSchema, cwl_v1_2.EnumSchema]
+"""Type Union for a CWL v1.x EnumSchema object."""
+RecordSchema = Union[
+    cwl_v1_0.RecordSchema, cwl_v1_1.RecordSchema, cwl_v1_2.RecordSchema
+]
+"""Type Union for a CWL v1.x RecordSchema object."""
+File = Union[cwl_v1_0.File, cwl_v1_1.File, cwl_v1_2.File]
+"""Type Union for a CWL v1.x File object."""
+# Only the v1.1 and v1.2 modules contribute to this union; no cwl_v1_0 member
+# is listed.
+SecondaryFileSchema = Union[cwl_v1_1.SecondaryFileSchema, cwl_v1_2.SecondaryFileSchema]
+"""Type Union for a CWL v1.x SecondaryFileSchema object."""
+Directory = Union[cwl_v1_0.Directory, cwl_v1_1.Directory, cwl_v1_2.Directory]
+"""Type Union for a CWL v1.x Directory object."""
+Dirent = Union[cwl_v1_0.Dirent, cwl_v1_1.Dirent, cwl_v1_2.Dirent]
+"""Type Union for a CWL v1.x Dirent object."""
 
 _Loader = Union[cwl_v1_0._Loader, cwl_v1_1._Loader, cwl_v1_2._Loader]
 

diff --git a/cwl_utils/parser/cwl_v1_0_utils.py b/cwl_utils/parser/cwl_v1_0_utils.py
@@ -1,10 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
+import copy
 import hashlib
-from typing import IO, Any, List, MutableSequence, Optional, Tuple, Union, cast
+import logging
+from collections import namedtuple
+from typing import Any, Dict, IO, List, MutableSequence, Optional, Tuple, Union, cast
 
 from ruamel import yaml
 from schema_salad.exceptions import ValidationException
-from schema_salad.utils import json_dumps
+from schema_salad.sourceline import SourceLine
+from schema_salad.utils import aslist, json_dumps
 
 import cwl_utils.parser
 import cwl_utils.parser.cwl_v1_0 as cwl
@@ -13,6 +17,47 @@
 
 CONTENT_LIMIT: int = 64 * 1024
 
+_logger = logging.getLogger("cwl_utils")
+
+SrcSink = namedtuple("SrcSink", ["src", "sink", "linkMerge", "message"])
+
+
+def _compare_records(
+    src: cwl.RecordSchema, sink: cwl.RecordSchema, strict: bool = False
+) -> bool:
+    """
+    Tell whether the source record can be assigned to the sink record.
+
+    Field names are reduced with ``cwl.shortname`` before matching, because
+    record names are relative to the workflow step and could not be compared
+    otherwise.
+
+    Every field of the sink is checked with ``can_assign_src_to_sink``
+    (``strict`` is forwarded). A field missing from the source is compared as
+    the type ``"null"``. Sink fields whose type is ``None`` never cause a
+    failure. The first incompatible field is logged at INFO level and makes
+    the function return ``False``; otherwise ``True`` is returned.
+    """
+    src_types: Dict[str, Any] = {}
+    for src_field in src.fields or {}:
+        src_types[cwl.shortname(src_field.name)] = src_field.type
+    sink_types: Dict[str, Any] = {}
+    for sink_field in sink.fields or {}:
+        sink_types[cwl.shortname(sink_field.name)] = sink_field.type
+
+    for name, expected in sink_types.items():
+        assignable = can_assign_src_to_sink(
+            src_types.get(name, "null"), expected, strict
+        )
+        if assignable or expected is None:
+            continue
+        _logger.info(
+            "Record comparison failure for %s and %s\n"
+            "Did not match fields for %s: %s and %s",
+            cast(
+                Union[cwl.InputRecordSchema, cwl.CommandOutputRecordSchema], src
+            ).name,
+            cast(
+                Union[cwl.InputRecordSchema, cwl.CommandOutputRecordSchema], sink
+            ).name,
+            name,
+            src_types.get(name),
+            expected,
+        )
+        return False
+    return True
+
 
 def _compare_type(type1: Any, type2: Any) -> bool:
     if isinstance(type1, cwl.ArraySchema) and isinstance(type2, cwl.ArraySchema):
@@ -38,6 +83,115 @@ def _compare_type(type1: Any, type2: Any) -> bool:
         return bool(type1 == type2)
 
 
+def can_assign_src_to_sink(src: Any, sink: Any, strict: bool = False) -> bool:
+    """
+    Decide whether a value of type ``src`` may be passed to a ``sink`` type.
+
+    Only the type specifications are compared; extra keys such as
+    inputBinding play no role.
+
+    src: admissible source types
+    sink: admissible sink types
+
+    Rules, applied in this order:
+
+    * ``"Any"`` on either side always matches.
+    * Two array schemas are compared through their ``items``.
+    * Two record schemas are compared field by field.
+    * A list of source types: in strict mode every member must be
+      assignable; otherwise one member other than ``"null"`` is enough.
+    * A list of sink types: one matching member is enough.
+    * Anything else must compare equal.
+
+    ``strict`` is forwarded for array items and records only; the per-member
+    checks of the two list cases run with the default ``strict=False``.
+    """
+    if src == "Any" or sink == "Any":
+        return True
+    if isinstance(src, cwl.ArraySchema) and isinstance(sink, cwl.ArraySchema):
+        return can_assign_src_to_sink(src.items, sink.items, strict)
+    if isinstance(src, cwl.RecordSchema) and isinstance(sink, cwl.RecordSchema):
+        return _compare_records(src, sink, strict)
+    if isinstance(src, MutableSequence):
+        if strict:
+            return all(can_assign_src_to_sink(member, sink) for member in src)
+        return any(
+            member != "null" and can_assign_src_to_sink(member, sink)
+            for member in src
+        )
+    if isinstance(sink, MutableSequence):
+        return any(can_assign_src_to_sink(src, option) for option in sink)
+    return bool(src == sink)
+
+
+def check_all_types(
+    src_dict: Dict[str, Any],
+    sinks: MutableSequence[Union[cwl.WorkflowStepInput, cwl.WorkflowOutputParameter]],
+    type_dict: Dict[str, Any],
+) -> Dict[str, List[SrcSink]]:
+    """
+    Given a list of sinks, check if their types match with the types of their sources.
+
+    src_dict: maps a source identifier to the parameter object it names
+    sinks: workflow step inputs and/or workflow outputs to verify; objects of
+        any other class are skipped
+    type_dict: maps parameter ids (sources and sinks) to their types
+
+    Returns a dictionary with the keys "warning" and "exception", each holding
+    the SrcSink tuples for which check_types reported that outcome.
+
+    Raises a ValidationException, located at the sink's source field, when a
+    referenced source id is missing from src_dict. This covers both the
+    single-string and the list form of the source field.
+    """
+    validation: Dict[str, List[SrcSink]] = {"warning": [], "exception": []}
+    for sink in sinks:
+        if isinstance(sink, cwl.WorkflowOutputParameter):
+            sourceName = "outputSource"
+            sourceField = sink.outputSource
+        elif isinstance(sink, cwl.WorkflowStepInput):
+            sourceName = "source"
+            sourceField = sink.source
+        else:
+            continue
+        if sourceField is None:
+            continue
+        if isinstance(sourceField, MutableSequence):
+            # Several sources without an explicit linkMerge are merged as
+            # "merge_nested"; a one-element list needs no merge.
+            linkMerge = sink.linkMerge or (
+                "merge_nested" if len(sourceField) > 1 else None
+            )
+            parm_ids = list(sourceField)
+        else:
+            linkMerge = None
+            parm_ids = [cast(str, sourceField)]
+        srcs_of_sink = []
+        for parm_id in parm_ids:
+            # Report unknown sources uniformly, instead of letting the list
+            # form escape as a bare KeyError.
+            if parm_id not in src_dict:
+                raise SourceLine(sink, sourceName, ValidationException).makeError(
+                    f"{sourceName} not found: {parm_id}"
+                )
+            srcs_of_sink.append(src_dict[parm_id])
+        for src in srcs_of_sink:
+            check_result = check_types(
+                type_dict[cast(str, src.id)],
+                type_dict[cast(str, sink.id)],
+                linkMerge,
+                getattr(sink, "valueFrom", None),
+            )
+            if check_result in ("warning", "exception"):
+                validation[check_result].append(SrcSink(src, sink, linkMerge, None))
+    return validation
+
+
+def check_types(
+    srctype: Any,
+    sinktype: Any,
+    linkMerge: Optional[str],
+    valueFrom: Optional[str] = None,
+) -> str:
+    """
+    Classify the compatibility of a source type with a sink type.
+
+    The result is one of "pass", "warning", or "exception":
+
+    * "pass" when ``valueFrom`` is given, or when the strict assignment
+      check succeeds;
+    * "warning" when only the non-strict assignment check succeeds;
+    * "exception" when neither check succeeds.
+
+    With ``linkMerge`` set, the source type is first wrapped in an array
+    ("merge_nested") or flattened ("merge_flattened") and then checked as if
+    no merge had been requested. Any other ``linkMerge`` value raises a
+    ValidationException.
+    """
+    if valueFrom is not None:
+        return "pass"
+    if linkMerge == "merge_nested":
+        nested = cwl.ArraySchema(items=srctype, type="array")
+        return check_types(nested, sinktype, None, None)
+    if linkMerge == "merge_flattened":
+        flattened = merge_flatten_type(srctype)
+        return check_types(flattened, sinktype, None, None)
+    if linkMerge is not None:
+        raise ValidationException(f"Invalid value {linkMerge} for linkMerge field.")
+    # No merge requested: try the strict check first, then the lenient one.
+    if can_assign_src_to_sink(srctype, sinktype, strict=True):
+        return "pass"
+    if can_assign_src_to_sink(srctype, sinktype, strict=False):
+        return "warning"
+    return "exception"
+
+
 def content_limit_respected_read_bytes(f: IO[bytes]) -> bytes:
     """
     Read file content up to 64 kB as a byte array.
@@ -96,6 +250,59 @@ def merge_flatten_type(src: Any) -> Any:
     return cwl.ArraySchema(type="array", items=src)
 
 
+def type_for_step_input(
+    step: cwl.WorkflowStep,
+    in_: cwl.WorkflowStepInput,
+) -> Any:
+    """
+    Determine the type for the given step input.
+
+    "Any" is returned when the input has a ``valueFrom``, and also when the
+    process loaded for the step declares no input whose id (the part after
+    the last "#") equals that of ``in_``. Otherwise the declared type of the
+    matching input is returned, wrapped in an array schema if ``in_.id`` is
+    listed in ``step.scatter``.
+    """
+    if in_.valueFrom is not None:
+        return "Any"
+    step_run = cwl_utils.parser.utils.load_step(step)
+    cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)
+    if not (step_run and step_run.inputs):
+        return "Any"
+    for candidate in step_run.inputs:
+        candidate_name = cast(str, candidate.id).rsplit("#", 1)[-1]
+        if candidate_name != cast(str, in_.id).rsplit("#", 1)[-1]:
+            continue
+        scattered = step.scatter is not None and in_.id in aslist(step.scatter)
+        if scattered:
+            return cwl.ArraySchema(items=candidate.type, type="array")
+        return candidate.type
+    return "Any"
+
+
+def type_for_step_output(
+    step: cwl.WorkflowStep,
+    sourcename: str,
+) -> Any:
+    """
+    Determine the type for the given step output.
+
+    The process loaded for the step is searched for an output whose short
+    name (the part after the last "#" and the last "/") equals the short name
+    of ``sourcename``. For a scattered step the declared type is wrapped in
+    array schemas: one per scatter entry with the "nested_crossproduct"
+    method, a single one otherwise.
+
+    Raises a ValidationException if no output matches.
+    """
+    step_run = cwl_utils.parser.utils.load_step(step)
+    cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)
+    if step_run and step_run.outputs:
+        for candidate in step_run.outputs:
+            candidate_name = candidate.id.split("#")[-1].split("/")[-1]
+            if candidate_name != sourcename.split("#")[-1].split("/")[-1]:
+                continue
+            if step.scatter is None:
+                depth = 0
+            elif step.scatterMethod == "nested_crossproduct":
+                depth = len(aslist(step.scatter))
+            else:
+                depth = 1
+            result = candidate.type
+            for _ in range(depth):
+                result = cwl.ArraySchema(items=result, type="array")
+            return result
+    raise ValidationException(
+        "param {} not found in {}.".format(
+            sourcename,
+            yaml.main.round_trip_dump(cwl.save(step_run)),
+        )
+    )
+
+
 def type_for_source(
     process: Union[cwl.CommandLineTool, cwl.Workflow, cwl.ExpressionTool],
     sourcenames: Union[str, List[str]],
@@ -142,7 +349,7 @@ def type_for_source(
         return cwl.ArraySchema(items=new_type, type="array")
     elif linkMerge == "merge_flattened":
         return merge_flatten_type(new_type)
-    elif isinstance(sourcenames, List):
+    elif isinstance(sourcenames, List) and len(sourcenames) > 1:
         return cwl.ArraySchema(items=new_type, type="array")
     else:
         return new_type
@@ -181,26 +388,14 @@ def param_for_source_id(
                         == step.id.split("#")[-1]
                         and step.out
                     ):
+                        step_run = cwl_utils.parser.utils.load_step(step)
+                        cwl_utils.parser.utils.convert_stdstreams_to_files(step_run)
                         for outp in step.out:
                             outp_id = outp if isinstance(outp, str) else outp.id
                             if (
                                 outp_id.split("#")[-1].split("/")[-1]
                                 == sourcename.split("#")[-1].split("/")[-1]
                             ):
-                                step_run = step.run
-                                if isinstance(step.run, str):
-                                    step_run = cwl_utils.parser.load_document_by_uri(
-                                        path=target.loadingOptions.fetcher.urljoin(
-                                            base_url=cast(
-                                                str, target.loadingOptions.fileuri
-                                            ),
-                                            url=step.run,
-                                        ),
-                                        loadingOptions=target.loadingOptions,
-                                    )
-                                    cwl_utils.parser.utils.convert_stdstreams_to_files(
-                                        step_run
-                                    )
                                 if step_run and step_run.outputs:
                                     for output in step_run.outputs:
                                         if (