Skip to content

Proposal for a Function to Generate JSON Schema for Workflow Parameter Files #273

Open
@suecharo

Description

@suecharo

Hi CWL community,

I'm reaching out to share some thoughts that emerged from a recent discussion within the Japan community (@inutano, @tom-tan) regarding the development of JSON schemas for workflow parameter files.
Specifically, we're looking at creating JSON schemas that correspond to the YAML templates generated by cwltool --make-template.

While the templates created by cwltool --make-template are incredibly useful, I believe that a JSON schema would be more suitable for generating forms for expected workflow inputs and representing workflow parameters in Workflow Execution Services (WES). (Ref.: nf-core - rnaseq - schema_input.json)

To address this, I have drafted a preliminary Python function snippet:

from json import dumps
from typing import Any

from cwl_utils.parser import load_document_by_uri, save


def parse_inputs(cwl_url: str) -> Any:
    """
    Loads a CWL document and returns its ``inputs`` section.

    Args:
        cwl_url: URI of the CWL document, handed to ``load_document_by_uri``.

    Returns:
        The value stored under the ``"inputs"`` key of the saved document.

    Raises:
        ValueError: If the saved document does not contain ``"inputs"``.
    """
    document = save(load_document_by_uri(cwl_url))
    if "inputs" in document:
        return document["inputs"]
    raise ValueError("Inputs are missing in the provided object.")


def inputs_to_jsonschema(inputs: Any) -> Any:
    """
    Builds a draft-07 JSON Schema describing a CWL ``inputs`` list.

    Every input becomes one entry under ``properties``. An input is listed
    under ``required`` unless it declares a ``default`` or its type
    contains ``"null"``.

    Args:
        inputs: Iterable of CWL input parameter mappings, each carrying
            at least ``id`` and ``type``.

    Returns:
        A jsonschema object (a plain dict) for the workflow parameter file.

    Raises:
        ValueError: If an input lacks ``id`` or ``type``.
    """
    properties: Any = {}
    required: Any = []

    # Refer to https://www.commonwl.org/v1.2/Workflow.html#WorkflowInputParameter for more details
    for param in inputs:
        param_id = param.get("id")
        param_type = param.get("type")
        if param_id is None or param_type is None:
            raise ValueError(
                "Each item in the 'inputs' object must include 'id' and 'type' fields.")

        param_schema = _input_type_to_property_schema(param_type)

        # TODO: 'secondaryFiles' is intentionally ignored for now.
        # It does not seem to affect the output of --make-template, e.g.
        # $ cwltool --make-template https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/stage-array.cwl

        has_default = "default" in param
        if has_default:
            param_schema["default"] = param["default"]

        properties[param_id] = param_schema
        if not has_default and "null" not in param_type:
            required.append(param_id)

    # Key order matches the serialized layout: header, properties, required.
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": False,
    }


def _input_type_to_property_schema(input_type: Any) -> Any:
    """
    Converts a single CWL type expression into a draft-07 JSON Schema fragment.

    Supported forms:

    * a mapping with ``type`` of ``enum``, ``record`` or ``array``;
    * a list (CWL union), translated to ``anyOf`` over its members, so an
      optional type such as ``["null", "string"]`` really accepts ``null``;
    * a type name string: ``File``, ``Directory``, ``Any``, ``null``,
      ``int``/``long`` (integer), ``float``/``double`` (number). Any other
      string is passed through unchanged as the JSON Schema ``type``.

    Args:
        input_type: The CWL type expression (str, list or dict).

    Returns:
        A jsonschema fragment (a new dict) for that type.

    Raises:
        ValueError: If a mapping lacks ``type``, ``symbols``, ``fields`` or
            ``items`` where required, names an unknown nested type, if a
            record field lacks ``name`` or ``type``, or if a union is empty.
    """
    if isinstance(input_type, dict):
        nested_type = input_type.get("type")
        if nested_type is None:
            raise ValueError("The 'inputs.[].type' nested type object must contain a 'type' field.")

        if nested_type == "enum":
            symbols = input_type.get("symbols")
            if symbols is None:
                raise ValueError("The 'inputs.[].type' nested type object must contain a 'symbols' field.")
            return {
                "type": "string",
                "enum": symbols,
            }

        if nested_type == "record":
            fields = input_type.get("fields")
            if fields is None:
                raise ValueError("The 'inputs.[].type' nested type object must contain a 'fields' field.")

            properties: Any = {}
            required: Any = []
            for field in fields:
                field_name = field.get("name")
                field_type = field.get("type")
                if field_name is None or field_type is None:
                    raise ValueError("The 'inputs.[].type.[].fields' object must contain 'name' and 'type' fields.")
                # Keep only the short name: drop any "...#" prefix and parent path.
                field_id = field_name.split("#")[-1].split("/")[-1]
                properties[field_id] = _input_type_to_property_schema(field_type)
                # A field whose type admits null may be omitted, mirroring
                # how optional top-level inputs are treated.
                if "default" not in field and not _type_accepts_null(field_type):
                    required.append(field_id)
            return {
                "type": "object",
                "properties": properties,
                "required": required,
                "additionalProperties": False,
            }

        if nested_type == "array":
            item_type = input_type.get("items")
            if item_type is None:
                raise ValueError("If 'inputs.[].type.type' is 'array', 'inputs.[].type' must contain an 'items' field.")
            return {
                "type": "array",
                "items": _input_type_to_property_schema(item_type),
                "additionalItems": False,
            }

        raise ValueError(f"Unexpected type encountered: {input_type}.")

    if isinstance(input_type, list):
        if not input_type:
            raise ValueError(f"Unexpected type encountered: {input_type}.")
        member_schemas = [_input_type_to_property_schema(member) for member in input_type]
        if len(member_schemas) == 1:
            return member_schemas[0]
        # "nullable" is an OpenAPI keyword that draft-07 validators ignore;
        # anyOf (with {"type": "null"} for optional inputs) is what actually
        # lets a null value validate.
        return {"anyOf": member_schemas}

    if input_type in ("File", "Directory"):
        return _file_or_directory_schema(input_type)

    if input_type == "Any":
        return {
            "anyOf": [
                {"type": "boolean"},
                {"type": "integer"},
                {"type": "number"},
                {"type": "string"},
                {"type": "array"},
                {"type": "object"},
            ]
        }

    if input_type == "null":
        return {"type": "null"}

    # CWL numeric names -> JSON Schema names. 'long' is a 64-bit integer in
    # CWL, so it maps to "integer" just like 'int'. Names not listed here
    # (e.g. "string", "boolean") are used as the JSON Schema type verbatim.
    numeric_types = {
        "int": "integer",
        "long": "integer",
        "float": "number",
        "double": "number",
    }
    return {"type": numeric_types.get(input_type, input_type)}


def _type_accepts_null(cwl_type: Any) -> bool:
    """Returns True if the CWL type is ``"null"`` or a union containing it."""
    return cwl_type == "null" or (isinstance(cwl_type, list) and "null" in cwl_type)


def _file_or_directory_schema(class_name: str) -> Any:
    """
    Returns the object schema shared by CWL ``File`` and ``Directory`` values.

    The object must carry ``class`` equal to ``class_name`` and exactly one
    of ``path`` or ``location``; no other properties are accepted.
    """
    return {
        "type": "object",
        "properties": {
            "class": {"type": "string", "const": class_name},
            "path": {"type": "string"},
            "location": {"type": "string"},
        },
        "required": ["class"],
        "oneOf": [
            {"required": ["path"]},
            {"required": ["location"]},
        ],
        "additionalProperties": False,
    }


def validate_jsonschema_itself(jsonschema: Any) -> None:
    """
    Checks that the given object is itself a well-formed JSON Schema.

    The validator class is selected from the schema via ``validator_for``
    and its ``check_schema`` is applied to the schema.

    Args:
        jsonschema: The schema object to check.
    """
    # Imported locally so the jsonschema package is only needed when validating.
    from jsonschema.validators import validator_for

    validator_for(jsonschema).check_schema(jsonschema)


def main() -> None:
    """
    Runs the converter over a set of sample CWL documents.

    For each URL the parsed inputs and the generated JSON Schema are
    printed. A failure on one URL is reported with its traceback and the
    loop continues with the next URL.
    """
    import traceback

    sample_urls = [
        # Sapporo example workflow.
        "https://raw.githubusercontent.com/sapporo-wes/sapporo-service/main/tests/resources/cwltool/trimming_and_qc.cwl",
        # When the definition itself is a nasty case.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/echo-tool-packed.cwl",
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/revsort-packed.cwl",
        # When the type is nasty.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/anon_enum_inside_array.cwl",
        # The number of parameters is a little large, and the definition itself is a straightforward case.
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/bwa-mem-tool.cwl",
        # The case where CommandInputParameter is shortened (e.g., param: string)
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/env-tool1.cwl",
        # No input parameters
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/envvar3.cwl",
        # Any
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/params.cwl",
        # Dir
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/dir.cwl",
        # SecondaryFiles
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/secondaryfiles/rename-inputs.cwl",
        "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/stage-array.cwl",
    ]

    for sample_url in sample_urls:
        try:
            print(f"{'-' * 3} Test URL: {sample_url} {'-' * 10}")
            print("\n")

            cwl_inputs = parse_inputs(sample_url)
            print("Inputs object: \n")
            print(dumps(cwl_inputs, indent=2))
            print("\n")

            print("JSON Schema: \n")
            generated_schema = inputs_to_jsonschema(cwl_inputs)
            # Validate before printing so an invalid schema is reported as a failure.
            validate_jsonschema_itself(generated_schema)
            print(dumps(generated_schema, indent=2))
            print("\n")

        except Exception as err:
            # Demo driver: report the failure and move on to the next URL.
            print(f"Failed to parse: {sample_url}")
            print(err)
            traceback.print_exc()


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

This function is capable of generating a JSON schema like the following example (https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests/bwa-mem-tool.cwl):

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "reference": {
      "type": "object",
      "properties": {
        "class": {
          "type": "string",
          "const": "File"
        },
        "path": {
          "type": "string"
        },
        "location": {
          "type": "string"
        }
      },
      "required": [
        "class"
      ],
      "oneOf": [
        {
          "required": [
            "path"
          ]
        },
        {
          "required": [
            "location"
          ]
        }
      ],
      "additionalProperties": false
    },
    "reads": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "class": {
            "type": "string",
            "const": "File"
          },
          "path": {
            "type": "string"
          },
          "location": {
            "type": "string"
          }
        },
        "required": [
          "class"
        ],
        "oneOf": [
          {
            "required": [
              "path"
            ]
          },
          {
            "required": [
              "location"
            ]
          }
        ],
        "additionalProperties": false
      },
      "additionalItems": false
    },
    "minimum_seed_length": {
      "type": "integer"
    },
    "min_std_max_min": {
      "type": "array",
      "items": {
        "type": "integer"
      },
      "additionalItems": false
    },
    "args.py": {
      "type": "object",
      "properties": {
        "class": {
          "type": "string",
          "const": "File"
        },
        "path": {
          "type": "string"
        },
        "location": {
          "type": "string"
        }
      },
      "required": [
        "class"
      ],
      "oneOf": [
        {
          "required": [
            "path"
          ]
        },
        {
          "required": [
            "location"
          ]
        }
      ],
      "additionalProperties": false,
      "default": {
        "class": "File",
        "location": "args.py"
      }
    }
  },
  "required": [
    "reference",
    "reads",
    "minimum_seed_length",
    "min_std_max_min"
  ],
  "additionalProperties": false
}

I am aware that there may be deficiencies, such as a lack of comprehensive test cases. Therefore, I am eager to receive feedback on this implementation approach and any other suggestions you may have.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions