From 7d8b30300a3cc8b3af9046691ab28bb6c973961b Mon Sep 17 00:00:00 2001
From: Remi Gau
Date: Tue, 9 Jul 2024 09:40:42 +0200
Subject: [PATCH 1/4] skip .git and refactor

---
 reproschema/validate.py | 140 ++++++++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 50 deletions(-)

diff --git a/reproschema/validate.py b/reproschema/validate.py
index ae281b6..e2df11b 100644
--- a/reproschema/validate.py
+++ b/reproschema/validate.py
@@ -5,20 +5,41 @@
 from .jsonldutils import load_file, validate_data
 from .utils import lgr, start_server, stop_server
 
+DIR_TO_SKIP = [".git", "__pycache__", "env", "venv"]
+FILES_TO_SKIP = [".DS_Store", ".gitignore", ".flake8", ".autorc", "LICENSE"]
+SUPPORTED_EXTENSIONS = [
+    ".jsonld",
+    "json",
+    "js",
+    "",
+]
 
-def validate_dir(directory, started=False, http_kwargs={}):
+
+def validate_dir(
+    directory: str,
+    started: bool = False,
+    http_kwargs: None | dict[str, int] = None,
+    stop=None,
+):
     """Validate a directory containing JSONLD documents against the ReproSchema pydantic model.
 
+    Recursively goes through the directory tree and validates files with the allowed extensions.
+
     Parameters
     ----------
     directory: str
         Path to directory to walk for validation
+
     started : bool
         Whether an http server exists or not
-    http_kwargs : dict
+
+    http_kwargs : dict or None
         Keyword arguments for the http server. Valid keywords are: port, path
         and tmpdir
 
+    stop: None or function
+        Function to use to stop the HTTP server
+
     Returns
     -------
     conforms: bool
@@ -26,54 +47,60 @@ def validate_dir(directory, started=False, http_kwargs={}):
         if any document is non-conformant.
 
     """
+    if http_kwargs is None:
+        http_kwargs = {}
+
     if not os.path.isdir(directory):
+        if stop is not None:
+            stop_server(stop)
         raise Exception(f"{directory} is not a directory")
-    print(f"Validating directory {directory}")
-    stop = None
-    if not started:
-        stop, port = start_server(**http_kwargs)
-        http_kwargs["port"] = port
-    else:
-        if "port" not in http_kwargs:
-            raise KeyError("HTTP server started, but port key is missing")
-
-    for root, _, files in os.walk(directory):
-        for name in files:
-            full_file_name = os.path.join(root, name)
-
-            if Path(full_file_name).suffix not in [
-                ".jsonld",
-                "json",
-                "js",
-                "",
-            ]:
-                lgr.info(f"Skipping file {full_file_name}")
-                continue
-
-            lgr.debug(f"Validating file {full_file_name}")
-            try:
-                data = load_file(
-                    full_file_name, started=True, http_kwargs=http_kwargs
-                )
-                if len(data) == 0:
-                    raise ValueError("Empty data graph")
-                print(f"Validating {full_file_name}")
-                conforms, vtext = validate_data(data)
-            except (ValueError, json.JSONDecodeError):
+
+    if Path(directory).name in DIR_TO_SKIP:
+        lgr.info(f"Skipping directory {directory}")
+        return True
+
+    lgr.debug(f"Validating directory {directory}")
+
+    files_to_validate = [
+        str(x)
+        for x in Path(directory).iterdir()
+        if x.is_file()
+        and x.name not in FILES_TO_SKIP
+        and x.suffix in SUPPORTED_EXTENSIONS
+    ]
+
+    for name in files_to_validate:
+        lgr.debug(f"Validating file {name}")
+
+        try:
+            data = load_file(name, started=started, http_kwargs=http_kwargs)
+            if len(data) == 0:
                 if stop is not None:
                     stop_server(stop)
-                raise
-            else:
-                if not conforms:
-                    lgr.critical(
-                        f"File {full_file_name} has validation errors."
-                    )
-                    if stop is not None:
-                        stop_server(stop)
-                    raise ValueError(vtext)
-    if not started:
-        stop_server(stop)
-    return True
+                raise ValueError(f"Empty data graph in file {name}")
+            conforms, vtext = validate_data(data)
+        except (ValueError, json.JSONDecodeError):
+            if stop is not None:
+                stop_server(stop)
+            raise
+        else:
+            if not conforms:
+                lgr.critical(f"File {name} has validation errors.")
+                stop_server(stop)
+                raise ValueError(vtext)
+
+    dirs_to_validate = [
+        str(x)
+        for x in Path(directory).iterdir()
+        if x.is_dir() and x.name not in DIR_TO_SKIP
+    ]
+
+    for dir in dirs_to_validate:
+        conforms, stop = validate_dir(
+            dir, started=started, http_kwargs=http_kwargs, stop=stop
+        )
+
+    return True, stop
 
 
 def validate(path):
@@ -92,16 +119,29 @@ def validate(path):
 
     """
     if os.path.isdir(path):
-        conforms = validate_dir(path)
+
+        stop, port = start_server()
+        http_kwargs = {"port": port}
+        started = True
+
+        conforms, _ = validate_dir(
+            path, started=started, http_kwargs=http_kwargs, stop=stop
+        )
+
+        stop_server(stop)
+
     else:
-        # Skip validation for .DS_Store files
-        if Path(path).name == ".DS_Store":
-            lgr.info(f"{path} is a .DS_Store file and is skipped.")
+
+        if Path(path).name in FILES_TO_SKIP:
+            lgr.info(f"Skipping file {path}")
             return True
+
         data = load_file(path, started=False)
         conforms, vtext = validate_data(data)
         if not conforms:
             lgr.critical(f"File {path} has validation errors.")
             raise ValueError(vtext)
+
     lgr.info(f"{path} conforms.")
+
     return conforms

From 8ad724a23af30211d0652617c76b5f3837a18362 Mon Sep 17 00:00:00 2001
From: Remi Gau
Date: Tue, 9 Jul 2024 09:58:42 +0200
Subject: [PATCH 2/4] make it slightly more verbose

---
 reproschema/validate.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/reproschema/validate.py b/reproschema/validate.py
index e2df11b..b07db7e 100644
--- a/reproschema/validate.py
+++ b/reproschema/validate.py
@@ -5,8 +5,21 @@
 from .jsonldutils import load_file, validate_data
 from .utils import lgr, start_server, stop_server
 
-DIR_TO_SKIP = [".git", "__pycache__", "env", "venv"]
-FILES_TO_SKIP = [".DS_Store", ".gitignore", ".flake8", ".autorc", "LICENSE"]
+DIR_TO_SKIP = [
+    ".git",
+    ".github",
+    "__pycache__",
+    "env",
+    "venv",
+]
+FILES_TO_SKIP = [
+    ".DS_Store",
+    ".gitignore",
+    ".flake8",
+    ".autorc",
+    "LICENSE",
+    "Makefile",
+]
 SUPPORTED_EXTENSIONS = [
     ".jsonld",
     "json",
@@ -59,7 +72,7 @@ def validate_dir(
         lgr.info(f"Skipping directory {directory}")
         return True
 
-    lgr.debug(f"Validating directory {directory}")
+    lgr.info(f"Validating directory {directory}")
 
     files_to_validate = [
         str(x)
@@ -120,6 +133,8 @@ def validate(path):
     """
     if os.path.isdir(path):
 
+        lgr.info(f"Validating directory {path}")
+
         stop, port = start_server()
         http_kwargs = {"port": port}
         started = True

From b5e595f150ce05b9b3226c3fc0c8df9eec78031d Mon Sep 17 00:00:00 2001
From: Remi Gau
Date: Tue, 9 Jul 2024 10:05:32 +0200
Subject: [PATCH 3/4] drop os

---
 reproschema/validate.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/reproschema/validate.py b/reproschema/validate.py
index b07db7e..d865c87 100644
--- a/reproschema/validate.py
+++ b/reproschema/validate.py
@@ -1,5 +1,4 @@
 import json
-import os
 from pathlib import Path
 
 from .jsonldutils import load_file, validate_data
@@ -63,12 +62,14 @@ def validate_dir(
     if http_kwargs is None:
         http_kwargs = {}
 
-    if not os.path.isdir(directory):
+    directory = Path(directory)
+
+    if not directory.is_dir():
         if stop is not None:
             stop_server(stop)
-        raise Exception(f"{directory} is not a directory")
+        raise Exception(f"{str(directory)} is not a directory")
 
-    if Path(directory).name in DIR_TO_SKIP:
+    if directory.name in DIR_TO_SKIP:
         lgr.info(f"Skipping directory {directory}")
         return True
 
@@ -76,7 +77,7 @@ def validate_dir(
 
     files_to_validate = [
         str(x)
-        for x in Path(directory).iterdir()
+        for x in directory.iterdir()
         if x.is_file()
         and x.name not in FILES_TO_SKIP
         and x.suffix in SUPPORTED_EXTENSIONS
@@ -104,7 +105,7 @@ def validate_dir(
 
     dirs_to_validate = [
         str(x)
-        for x in Path(directory).iterdir()
+        for x in directory.iterdir()
         if x.is_dir() and x.name not in DIR_TO_SKIP
     ]
 
@@ -131,7 +132,7 @@ def validate(path):
         exception.
 
     """
-    if os.path.isdir(path):
+    if Path(path).is_dir():
 
         lgr.info(f"Validating directory {path}")
 

From a8a63b67f9f98edbd4fb52d0236e13334910415e Mon Sep 17 00:00:00 2001
From: Dorota Jarecka
Date: Tue, 9 Jul 2024 18:34:42 -0400
Subject: [PATCH 4/4] reading schemaVersion from the context file URL; adding
 read_contextfile function

---
 reproschema/jsonldutils.py        | 49 +++++++++++++++++++++++--------
 reproschema/redcap2reproschema.py |  3 +-
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/reproschema/jsonldutils.py b/reproschema/jsonldutils.py
index 9ac65b3..1bda6db 100644
--- a/reproschema/jsonldutils.py
+++ b/reproschema/jsonldutils.py
@@ -88,15 +88,7 @@ def load_file(
         data = fixing_old_schema(data, copy_data=True)
     if compact:
         if compact_context:
-            if _is_file(compact_context):
-                with open(compact_context) as fp:
-                    context = json.load(fp)
-            elif _is_url(compact_context):
-                context = _fetch_jsonld_context(compact_context)
-            else:
-                raise Exception(
-                    f"compact_context has tobe a file or url, but {compact_context} provided"
-                )
+            context = read_contextfile(compact_context)
         if _is_file(path_or_url):
             data = jsonld.compact(
                 data, ctx=context, options={"base": base_url}
@@ -128,7 +120,7 @@ def validate_data(data):
     # normalized = jsonld.normalize(data, kwargs)
     obj_type = identify_model_class(data["@type"][0])
     data_fixed = [fixing_old_schema(data, copy_data=True)]
-    context = _fetch_jsonld_context(CONTEXTFILE_URL)
+    context = read_contextfile(CONTEXTFILE_URL)
     data_fixed_comp = jsonld.compact(data_fixed, context)
     del data_fixed_comp["@context"]
     conforms = False
@@ -141,6 +133,40 @@ def validate_data(data):
     return conforms, v_text
 
 
+def read_contextfile(contextfile):
+    """Read a context file and return the context."""
+    if _is_file(contextfile):
+        with open(contextfile) as fp:
+            context = json.load(fp)
+    elif _is_url(contextfile):
+        context = _fetch_jsonld_context(contextfile)
+    else:
+        raise Exception(
+            f"compact_context has to be a file or URL, but {contextfile} provided"
+        )
+    return context
+
+
+def get_context_version(contextfile):
+    """Get the version from the context file path"""
+    from packaging.version import InvalidVersion, Version
+
+    if contextfile.split("/")[-3] != "releases":
+        raise ValueError(
+            f"Can't get the version from {contextfile}, expected to have releases in the path"
+        )
+    else:
+        try:
+            Version(contextfile.split("/")[-2])
+            return contextfile.split("/")[-2]
+        except InvalidVersion:
+            raise ValueError(
+                f"Can't get the version from {contextfile}, "
+                f"expected to have a valid version in the path, "
+                f"but got {contextfile.split('/')[-2]}"
+            )
+
+
 def to_newformat(path, format, prefixfile=None, contextfile=None):
     """Convert a JSONLD document to n-triples format
 
@@ -171,8 +197,7 @@ def to_newformat(path, format, prefixfile=None, contextfile=None):
     data = load_file(path)
     if format == "jsonld":
         if contextfile is not None:
-            with open(contextfile) as fp:
-                context = json.load(fp)
+            context = read_contextfile(contextfile)
             data = jsonld.compact(data, context)
         return json.dumps(data, indent=2)
     kwargs = {"algorithm": "URDNA2015", "format": "application/n-quads"}
diff --git a/reproschema/redcap2reproschema.py b/reproschema/redcap2reproschema.py
index 5f35cc6..007d1a8 100644
--- a/reproschema/redcap2reproschema.py
+++ b/reproschema/redcap2reproschema.py
@@ -7,6 +7,7 @@
 from bs4 import BeautifulSoup
 
 from .context_url import CONTEXTFILE_URL
+from .jsonldutils import get_context_version
 from .models import Activity, Item, Protocol, write_obj_jsonld
 
 matrix_group_count = {}
@@ -378,7 +379,7 @@ def create_form_schema(
         "id": f"{form_name}_schema",
         "prefLabel": {"en": activity_display_name},
         # "description": {"en": activity_description},
-        "schemaVersion": "1.0.0-rc4",
+        "schemaVersion": get_context_version(schema_context_url),
         "version": redcap_version,
         "ui": {
             "order": unique_order,
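
Usage sketch: a minimal example of how the pieces introduced by this series fit together, assuming the package layout implied by the diff paths (reproschema/validate.py and reproschema/jsonldutils.py). The refactored validate() starts one HTTP server and recurses through the tree via validate_dir(), skipping DIR_TO_SKIP and FILES_TO_SKIP entries, while get_context_version() extracts the version segment from a ".../releases/<version>/..." context URL. The directory name and URL below are illustrative placeholders, not values taken from the repository.

    # Sketch only: assumes this patch series is applied and the package is importable.
    from reproschema.validate import validate
    from reproschema.jsonldutils import get_context_version

    # validate() starts a single HTTP server for a directory tree; validate_dir()
    # then recurses into sub-directories and raises on the first non-conformant file.
    conforms = validate("my_protocol")  # hypothetical directory of JSON-LD files
    print("conforms:", conforms)

    # get_context_version() expects ".../releases/<version>/..." in the path
    # and returns the "<version>" component.
    url = "https://example.org/reproschema/releases/1.0.0/context"  # illustrative URL
    print(get_context_version(url))  # -> "1.0.0"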