diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..d3ee6a7ff --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-toml + - id: flake8 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..4cdd0c857 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include maestrowf/datastructures/schemas.json diff --git a/Pipfile b/Pipfile index 6de850fe1..1c689656a 100644 --- a/Pipfile +++ b/Pipfile @@ -4,23 +4,28 @@ verify_ssl = true name = "pypi" [packages] -"enum34" = "*" -filelock = "*" +PyYAML = ">=4.2b1" six = "*" +filelock = "*" tabulate = "*" -Fabric = "*" -PyYAML = ">= 4.2b1" +dill = "*" maestrowf = {path = "."} [dev-packages] -"flake8" = "*" +flake8 = "*" pydocstyle = "*" pylint = "*" tox = "*" coverage = "*" -sphinx_rtd_theme = "*" -Sphinx = "*" pytest = "*" +fabric = "*" +Sphinx = "*" +pytest-cov = "*" +pre-commit = "*" +sphinx-rtd-theme = "*" + +[pipenv] allow_prereleases = true [requires] python_version = "3.6" diff --git a/maestrowf/datastructures/schemas.json b/maestrowf/datastructures/schemas.json new file mode 100644 index 000000000..7875af444 --- /dev/null +++ b/maestrowf/datastructures/schemas.json @@ -0,0 +1,115 @@ +{ + "DESCRIPTION": { + "type": "object", + "properties": { + "name": {"type": "string", "minLength": 1}, + "description": {"type": "string", "minLength": 1} + }, + "required": [ + "name", + "description" + ] + }, + "PARAM": { + "type": "object", + "properties": { + "values": { + "type": "array" + }, + "label": {"type": "string", "minLength": 1} + }, + "additionalProperties": false, + "required": [ + "values", + "label" + ] + }, + "STUDY_STEP": { + "type": "object", + "properties": { + "name": {"type": "string", "minLength": 1}, + "description": {"type": "string", "minLength": 1}, + "run": { + "type": "object", + "properties": { + "cmd": {"type": "string", "minLength": 1}, + "depends": {"type": "array", "uniqueItems": true}, + "pre": {"type": "string", "minLength": 1}, + "post": {"type": "string", "minLength": 1}, + "restart": {"type": "string", "minLength": 1}, + "nodes": {"type": "integer"}, + "procs": {"type": "integer"}, + "gpus": {"type": "integer"}, + "cores per task": {"type": "integer"}, + "walltime": {"type": "string", "minLength": 1}, + "reservation": {"type": "string", "minLength": 1} + }, + "additionalProperties": false, + "required": [ + "cmd" + ] + } + }, + "additionalProperties": false, + "required": [ + "name", + "description", + "run" + ] + }, + "ENV": { + "type": "object", + "properties": { + "variables": {"type": "object"}, + "labels": {"type": "object"}, + "sources": {"type": "object"}, + "dependencies": { + "type": "object", + "properties": { + "paths": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string", "minLength": 1}, + "path": {"type": "string", "minLength": 1} + }, + "required": [ + "name", + "path" + ], + "additionalProperties": false + } + }, + "git": { + "type": "array", + "items": { + "properties": { + "name": {"type": "string", "minLength": 1}, + "path": {"type": "string", "minLength": 1}, + "url": {"type": "string", "minLength": 1} + }, + "required": [ + "name", + "path", + "url" + ] + } + }, + "spack": { +
"type": "object", + "properties": { + "name": {"type": "string", "minLength": 1}, + "package_name": {"type": "string", "minLength": 1} + }, + "required": [ + "type", + "package_name" + ] + } + } + } + }, + "additionalProperties": false + } +} diff --git a/maestrowf/datastructures/yamlspecification.py b/maestrowf/datastructures/yamlspecification.py index 2528d0ff1..987d7f242 100644 --- a/maestrowf/datastructures/yamlspecification.py +++ b/maestrowf/datastructures/yamlspecification.py @@ -30,18 +30,32 @@ """Module containing all things needed for a YAML Study Specification.""" from copy import deepcopy +import json import logging +import os +import re import yaml +import jsonschema + from maestrowf.abstracts import Specification -from maestrowf.datastructures.core import ParameterGenerator, \ - StudyEnvironment, \ - StudyStep +from maestrowf.datastructures.core import ( + ParameterGenerator, + StudyEnvironment, + StudyStep, +) from maestrowf.datastructures import environment logger = logging.getLogger(__name__) +# load the schemas.json file +dirpath = os.path.dirname(os.path.abspath(__file__)) +schemas_file = os.path.join(dirpath, "schemas.json") +with open(schemas_file, "r") as json_file: + schemas = json.load(json_file) + + class YAMLSpecification(Specification): """ Class for loading and verifying a Study Specification. @@ -94,7 +108,7 @@ def load_specification(cls, path): logger.info("Loading specification -- path = %s", path) try: # Load the YAML spec from the file. - with open(path, 'r') as data: + with open(path, "r") as data: specification = cls.load_specification_from_stream(data) except Exception as e: @@ -121,18 +135,18 @@ def load_specification_from_stream(cls, stream): logger.warning( "*** PyYAML is using an unsafe version with a known " "load vulnerability. Please upgrade your installation " - "to a more recent version! ***") + "to a more recent version! ***" + ) spec = yaml.load(stream) logger.debug("Loaded specification -- \n%s", spec["description"]) specification = cls() specification.path = None specification.description = spec.pop("description", {}) - specification.environment = spec.pop("env", - {'variables': {}, - 'sources': [], - 'labels': {}, - 'dependencies': {}}) + specification.environment = spec.pop( + "env", + {"variables": {}, "sources": [], "labels": {}, "dependencies": {}}, + ) specification.batch = spec.pop("batch", {}) specification.study = spec.pop("study", []) specification.globals = spec.pop("global.parameters", {}) @@ -145,11 +159,13 @@ def load_specification_from_stream(cls, stream): def verify(self): """Verify the whole specification.""" self.verify_description() + self.verify_environment() self.verify_study() self.verify_parameters() - logger.info("Specification %s - Verified. No apparent issues.", - self.name) + logger.debug( + "Specification %s - Verified. No apparent issues.", self.name + ) def verify_description(self): """ @@ -162,21 +178,13 @@ def verify_description(self): # and study. # We're REQUIRING that user specify a name and description for the # study. - try: - if not self.description: - raise ValueError("The 'description' key is required in the " - "YAML study for user sanity. 
Provide a " - "description.") - else: - if not (self.description["name"] and - self.description["description"]): - raise ValueError("Both 'name' and 'description' must be " - "provided for a valid study description.") - except Exception as e: - logger.exception(e.args) - raise - logger.info("Study description verified -- \n%s", self.description) + # validate description against json schema + YAMLSpecification.validate_schema( + "description", self.description, schemas["DESCRIPTION"] + ) + + logger.debug("Study description verified -- \n%s", self.description) def _verify_variables(self): """ @@ -189,23 +197,29 @@ def _verify_variables(self): :returns: A set of keys encountered in the variables section. """ keys_seen = set() - for key, value in self.environment['variables'].items(): + for key, value in self.environment["variables"].items(): logger.debug("Verifying %s...", key) if not key: - msg = "All variables must have a valid name. Empty strings " \ - "are not allowed." + msg = ( + "All variables must have a valid name. Empty strings " + "are not allowed." + ) logger.error(msg) raise ValueError(msg) if not value: - msg = "All variables must have a valid value. Empty strings " \ - "are not allowed." + msg = ( + "All variables must have a valid value. Empty strings " + "are not allowed." + ) logger.error(msg) raise ValueError(msg) - if key in self.keys_seen: - msg = "Variable name '{}' is already taken. All variable " \ - "names must be unique.".format(key) + if key in keys_seen: + msg = ( + "Variable name '{}' is already taken. All variable " + "names must be unique.".format(key) + ) logger.error(msg) raise ValueError(msg) @@ -230,45 +244,25 @@ def _verify_dependencies(self, keys_seen): :returns: A set of variable names seen. """ dep_types = ["path", "git", "spack"] - # Required keys - req_keys = {} - # For each PathDependency, we require two things: - # 1. A unique name (which will be its variable name for substitution) - # 2. A path. - req_keys["path"] = set(["name", "path"]) - - # For each GitDependency, required items are: - # 1. A name for the dependency (variable name to be substituted) - # 2. A git URL (ssh or http) - # 3. A path to store the repository to. - req_keys["git"] = set(["name", "path", "url"]) - - # For each SpackDependency, required items are: - # 1. A name for the dependency (variable name to be substituted) - # 2. Spack package name - req_keys["spack"] = set(["name", "package_name"]) + + if "dependencies" not in self.environment: + return keys_seen # For each dependency type, run through the required keys and name. for dep_type in dep_types: if dep_type in self.environment["dependencies"]: for item in self.environment["dependencies"][dep_type]: - # Check that the name and path attributes are populated. - missing_keys = req_keys[dep_type] - set(item.keys()) - if missing_keys: - msg = "Incomplete %s dependency detected -- missing" \ - " %s required keys. Value: %s" \ - .format(dep_type, missing_keys, item) - logger.error(msg) - raise ValueError(msg) - # Make sure that the "name" attribute is not taken. # Because every dependency should be responsible for # substituting itself into data, they are required to have # a name field. if item["name"] in keys_seen: - msg = "Variable name '{}' is already taken. All " \ - "variable names must be unique." \ - .format(item["name"]) + msg = ( + "Variable name '{}' is already taken. 
All " + "variable names must be unique.".format( + item["name"] + ) + ) logger.error(msg) raise ValueError(msg) @@ -278,6 +272,10 @@ def _verify_dependencies(self, keys_seen): def verify_environment(self): """Verify that the environment in a specification is valid.""" + # validate environment against json schema + YAMLSpecification.validate_schema( + "env", self.environment, schemas["ENV"] + ) # Verify the variables section of the specification. keys_seen = self._verify_variables() # Verify the sources section of the specification. @@ -291,11 +289,14 @@ def verify_study(self): # not a workflow... try: if not self.study: - raise ValueError("A study specification MUST contain at least " - "one step in its workflow.") - - logger.debug("Verified that a study block exists. -- verifying " - "steps.") + raise ValueError( + "A study specification MUST contain at least " + "one step in its workflow." + ) + + logger.debug( + "Verified that a study block exists. -- verifying " "steps." + ) self._verify_steps() except Exception as e: @@ -309,35 +310,20 @@ def _verify_steps(self): A study step is required to have a name, description, and a command. If any are missing, the specification is considered invalid. """ - # Verify that each step has the minimum required information. - # Each step in the 'study' section must at least specify three things. - # 1. name - # 2. description - # 3. run try: - req_study = set(["name", "description", "run"]) - req_run = set(["cmd"]) for step in self.study: - logger.debug("Verifying -- \n%s" % step) - # Missing attributes in a study step. - missing_attrs = req_study - set(step.keys()) - if missing_attrs: - raise ValueError("Missing required keys {} from study step" - " containing following: {}" - .format(missing_attrs, step)) - - # Each step's 'run' requires a command and dependency. - # Missing keys in the 'run' attribute of a step. - missing_attrs = req_run - set(step["run"].keys()) - if missing_attrs: - raise ValueError("Missing {} keys from the run " - "configuration for step named '{}'." - .format(missing_attrs, step["name"])) + # validate step against json schema + YAMLSpecification.validate_schema( + "study.{}".format(step["name"]), + step, + schemas["STUDY_STEP"], + ) + except Exception as e: logger.exception(e.args) raise - logger.debug("Verified") + logger.debug("Verified steps") def verify_parameters(self): """ @@ -357,36 +343,39 @@ def verify_parameters(self): """ try: if self.globals: - req_global = set(["values", "label"]) global_names = set() values_len = -1 for name, value in self.globals.items(): # Check if the name is in the set if name in global_names: - raise ValueError("Parameter '{}' is not unique in the " - "set of global parameters." - .format(name)) - - # Check to make sure the required info is in the parameter. - missing_attrs = req_global - set(value.keys()) - if missing_attrs: - raise ValueError("Missing {} keys in the global " - "parameter named {}" - .format(missing_attrs, name)) + raise ValueError( + "Parameter '{}' is not unique in the " + "set of global parameters.".format(name) + ) + + # validate parameters against json schema + YAMLSpecification.validate_schema( + "global.params.{}".format(name), + value, + schemas["PARAM"], + ) + # If label is a list, check its length against values. values = value["values"] label = value["label"] if isinstance(label, list): if len(values) != len(label): - raise ValueError("Global parameter '{}' the " - "values length does not " - "match the label list length." 
- .format(name)) + raise ValueError( + "Global parameter '{}': the values " + "list length does not match " + "the label list length.".format(name) + ) if len(label) != len(set(label)): - raise ValueError("Global parameter '{}' the " - "label does not contain " - "unique labels." - .format(name)) + raise ValueError( + "Global parameter '{}': the label " + "list does not contain " + "unique labels.".format(name) + ) # Add the name to global parameters encountered, check if # length of values is the same as previously encountered. global_names.add(name) @@ -397,14 +386,76 @@ # Check length. Exception if doesn't match. if len(values) != values_len: - raise ValueError("Global parameter '{}' is not the " - "same length as other parameters." - .format(name)) + raise ValueError( + "Global parameter '{}' is not the " + "same length as other parameters.".format(name) + ) except Exception as e: logger.exception(e.args) raise + @staticmethod + def validate_schema(parent_key, instance, schema): + """ + Given a parent key, an instance of a spec section, and a json schema + for that section, validate the instance against the schema. + """ + validator = jsonschema.Draft7Validator(schema) + errors = validator.iter_errors(instance) + for error in errors: + logger.debug(error.message) + if error.validator == "additionalProperties": + unrecognized = ( + re.search(r"'.+'", error.message).group(0).strip("'") + ) + raise jsonschema.ValidationError( + "Unrecognized key '{0}' found in spec section '{1}'." + .format(unrecognized, parent_key) + ) + + elif error.validator == "type": + bad_val = ( + re.search(r"(.+) is not of type", error.message) + .group(1) + .strip() + ) + expected_type = ( + re.search(r"is not of type '(.+)'", error.message) + .group(1) + .strip() + .strip("'") + ) + raise jsonschema.ValidationError( + "Value {0} in spec section '{1}' must be of type '{2}'." + .format(bad_val, parent_key, expected_type) + ) + + elif error.validator == "required": + missing = re.search(r"'.+'", error.message) + missing = missing.group(0) + missing = missing.strip("'") + raise jsonschema.ValidationError( + "Key '{0}' is missing from spec section '{1}'.".format( + missing, parent_key + ) + ) + + elif error.validator == "uniqueItems": + raise jsonschema.ValidationError( + "Non-unique step names in spec section '{0}.run.depends'." + .format(parent_key) + ) + + elif error.validator == "minLength": + raise jsonschema.ValidationError( + "Empty string found in value in spec section '{0}'."
+ .format(parent_key) + ) + + else: + raise ValueError("Unknown validation error: " + error.message) + @property def output_path(self): """ @@ -414,8 +465,9 @@ """ if "variables" in self.environment: if "OUTPUT_PATH" in self.environment["variables"]: - logger.debug("OUTPUT_PATH found in %s.", - self.description["name"]) + logger.debug( + "OUTPUT_PATH found in %s.", self.description["name"] + ) return self.environment["variables"]["OUTPUT_PATH"] else: return "" @@ -484,7 +536,7 @@ def get_study_environment(self): if "dependencies" in self.environment: if "paths" in self.environment["dependencies"]: for path in self.environment["dependencies"]["paths"]: - _ = environment.PathDependency(path['name'], path['path']) + _ = environment.PathDependency(path["name"], path["path"]) env.add(_) if "git" in self.environment["dependencies"]: for repo in self.environment["dependencies"]["git"]: optionals.pop("name") optionals.pop("url") optionals.pop("path") - _ = environment.GitDependency(repo["name"], repo["url"], - repo["path"], **optionals) + _ = environment.GitDependency( + repo["name"], repo["url"], repo["path"], **optionals + ) env.add(_) return env @@ -510,8 +563,9 @@ def get_parameters(self): if "name" not in value: params.add_parameter(key, value["values"], value["label"]) else: - params.add_parameter(key, value["values"], value["label"], - value["name"]) + params.add_parameter( + key, value["values"], value["label"], value["name"] + ) return params @@ -524,9 +578,9 @@ def get_study_steps(self): steps = [] for step in self.study: _ = StudyStep() - _.name = step['name'] - _.description = step['description'] - for key, value in step['run'].items(): + _.name = step["name"] + _.description = step["description"] + for key, value in step["run"].items(): _.run[key] = value steps.append(_) diff --git a/maestrowf/maestro.py b/maestrowf/maestro.py index 901ef450c..42bf0024f 100644 --- a/maestrowf/maestro.py +++ b/maestrowf/maestro.py @@ -29,7 +29,7 @@ """A script for launching a YAML study specification.""" from argparse import ArgumentParser, ArgumentError, RawTextHelpFormatter -import inspect +import jsonschema import logging import os import shutil @@ -44,13 +44,13 @@ from maestrowf.datastructures.core import Study from maestrowf.datastructures.environment import Variable from maestrowf.utils import \ - create_parentdir, create_dictionary, make_safe_path, \ + create_parentdir, create_dictionary, LoggerUtility, make_safe_path, \ start_process # Program Globals -ROOTLOGGER = logging.getLogger(inspect.getmodule(__name__)) LOGGER = logging.getLogger(__name__) +LOG_UTIL = LoggerUtility(LOGGER) # Configuration globals LFORMAT = "%(asctime)s - %(name)s:%(funcName)s:%(lineno)s - " \ "%(levelname)s - %(message)s" @@ -125,7 +125,11 @@ def load_parameter_generator(path, env, kwargs): def run_study(args): """Run a Maestro study.""" # Load the Specification - spec = YAMLSpecification.load_specification(args.specification) + try: + spec = YAMLSpecification.load_specification(args.specification) + except jsonschema.ValidationError as e: + LOGGER.error(e.message) + sys.exit(1) environment = spec.get_study_environment() steps = spec.get_study_steps() @@ -168,8 +172,10 @@ def run_study(args): output_path = make_safe_path(out_dir, *[out_name]) environment.add(Variable("OUTPUT_PATH", output_path)) - # Now that we know outpath, set up logging.
- setup_logging(args, output_path, spec.name.replace(" ", "_").lower()) + # Set up file logging + create_parentdir(os.path.join(output_path, "logs")) + log_path = os.path.join(output_path, "logs", "{}.log".format(spec.name)) + LOG_UTIL.add_file_handler(log_path, LFORMAT, args.debug_lvl) # Check for pargs without the matching pgen if args.pargs and not args.pgen: @@ -389,49 +395,6 @@ def setup_argparser(): return parser -def setup_logging(args, path, name): - """ - Set up logging based on the ArgumentParser. - - :param args: A Namespace object created by a parsed ArgumentParser. - :param path: A default path to be used if a log path is not specified by - user command line arguments. - :param name: The name of the log file. - """ - # If the user has specified a path, use that. - if args.logpath: - logpath = args.logpath - # Otherwise, we should just output to the OUTPUT_PATH. - else: - logpath = make_safe_path(path, *["logs"]) - - loglevel = args.debug_lvl * 10 - - # Create the FileHandler and add it to the logger. - create_parentdir(logpath) - formatter = logging.Formatter(LFORMAT) - ROOTLOGGER.setLevel(loglevel) - - log_path = make_safe_path(logpath, *["{}.log".format(name)]) - fh = logging.FileHandler(log_path) - fh.setLevel(loglevel) - fh.setFormatter(formatter) - ROOTLOGGER.addHandler(fh) - - if args.logstdout: - # Add the StreamHandler - sh = logging.StreamHandler() - sh.setLevel(loglevel) - sh.setFormatter(formatter) - ROOTLOGGER.addHandler(sh) - - # Print the level of logging. - LOGGER.info("INFO Logging Level -- Enabled") - LOGGER.warning("WARNING Logging Level -- Enabled") - LOGGER.critical("CRITICAL Logging Level -- Enabled") - LOGGER.debug("DEBUG Logging Level -- Enabled") - - def main(): """ Execute the main program's functionality. @@ -444,6 +407,15 @@ def main(): parser = setup_argparser() args = parser.parse_args() + # If we have requested to log stdout, set it up to be logged. + if args.logstdout: + LOG_UTIL.configure(LFORMAT, args.debug_lvl) + + LOGGER.info("INFO Logging Level -- Enabled") + LOGGER.warning("WARNING Logging Level -- Enabled") + LOGGER.critical("CRITICAL Logging Level -- Enabled") + LOGGER.debug("DEBUG Logging Level -- Enabled") + rc = args.func(args) sys.exit(rc) diff --git a/maestrowf/utils.py b/maestrowf/utils.py index d7404faec..bafdccaeb 100644 --- a/maestrowf/utils.py +++ b/maestrowf/utils.py @@ -258,3 +258,71 @@ def create_dictionary(list_keyvalues, token=":"): raise ValueError(msg) return _dict + + +class LoggerUtility: + """Utility class for setting up logging consistently.""" + + def __init__(self, logger): + """ + Initialize a new LoggerUtility class instance. + + :param logger: An instance of a logger to configure. + """ + self._logger = logger + + def configure(self, log_format, log_lvl=2): + """ + Configure the general logging facility. + + :param log_format: String containing the desired logging format. + :param log_lvl: Integer level (1-5) to set the logger to. + """ + logging.basicConfig(level=self.map_level(log_lvl), format=log_format) + + def add_stream_handler(self, log_format, log_lvl=2): + """ + Add a stream handler to logging. + + :param log_format: String containing the desired logging format. + :param log_lvl: Integer level (1-5) to set the logger to. + """ + # Create the StreamHandler and add it to the logger.
+ sh = logging.StreamHandler() + sh.setLevel(self.map_level(log_lvl)) + sh.setFormatter(logging.Formatter(log_format)) + self._logger.addHandler(sh) + + def add_file_handler(self, log_path, log_format, log_lvl=2): + """ + Add a file handler to logging. + + :param log_path: String containing the file path to store logging. + :param log_format: String containing the desired logging format. + :param log_lvl: Integer level (1-5) to set the logger to. + """ + # Create the FileHandler and add it to the logger. + formatter = logging.Formatter(log_format) + + fh = logging.FileHandler(log_path) + fh.setLevel(self.map_level(log_lvl)) + fh.setFormatter(formatter) + self._logger.addHandler(fh) + + @staticmethod + def map_level(log_lvl): + """ + Map level 1-5 to their respective logging enumerations. + + :param log_lvl: Integer level (1-5) representing logging verbosity. + """ + if log_lvl == 1: + return logging.DEBUG + elif log_lvl == 2: + return logging.INFO + elif log_lvl == 3: + return logging.WARNING + elif log_lvl == 4: + return logging.ERROR + else: + return logging.CRITICAL diff --git a/requirements.txt b/requirements.txt index a6c9b459a..4057a258a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,20 @@ -fabric -coverage -filelock PyYAML>=4.2b1 six +filelock +tabulate +enum34; python_version<'3.4' +dill +jsonschema>=3.2.0 +-e . + +fabric +coverage sphinx_rtd_theme sphinx flake8 pydocstyle pylint tox -tabulate pytest -enum34; python_version<'3.4' -dill +pytest-cov +pre-commit diff --git a/setup.py b/setup.py index 920816e07..5034deede 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ "tabulate", "enum34 ; python_version<'3.4'", "dill", + "jsonschema>=3.2.0", ], extras_require={}, long_description_content_type='text/markdown', @@ -44,4 +45,8 @@ 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], + package_data={ + 'maestrowf': ['datastructures/schemas.json'], + }, + include_package_data=True, )
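Reviewer note (not part of the diff): a minimal sketch of how the new schema validation surfaces errors, assuming maestrowf is installed from this branch. The inline schema mirrors the DESCRIPTION entry in schemas.json so the snippet is self-contained; the "lulesh" value is only an illustrative sample.

    import jsonschema

    from maestrowf.datastructures.yamlspecification import YAMLSpecification

    # Mirrors the DESCRIPTION entry in schemas.json.
    description_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "minLength": 1},
            "description": {"type": "string", "minLength": 1},
        },
        "required": ["name", "description"],
    }

    try:
        # The 'description' key is missing, so validation should fail.
        YAMLSpecification.validate_schema(
            "description", {"name": "lulesh"}, description_schema
        )
    except jsonschema.ValidationError as e:
        # Expected: Key 'description' is missing from spec section 'description'.
        print(e.message)

The 'required' branch of validate_schema rewrites jsonschema's raw "'description' is a required property" into the friendlier message above, which run_study() then logs before exiting.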
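Similarly, a sketch of how the new LoggerUtility is meant to be wired up, mirroring what maestro.py now does in main() and run_study(); the "demo" logger name and "demo.log" path are illustrative only.

    import logging

    from maestrowf.utils import LoggerUtility

    LFORMAT = "%(asctime)s - %(name)s:%(funcName)s:%(lineno)s - " \
              "%(levelname)s - %(message)s"

    logger = logging.getLogger("demo")
    log_util = LoggerUtility(logger)

    # Level 1 maps to DEBUG and 5 to CRITICAL (see map_level above).
    log_util.configure(LFORMAT, log_lvl=2)

    # Once the output path is known, a file handler is attached the same way
    # run_study() attaches one under OUTPUT_PATH/logs.
    log_util.add_file_handler("demo.log", LFORMAT, log_lvl=2)
    logger.info("Stream and file logging configured.")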