diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4dcc1c5..85c1d1b 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,9 +13,3 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" - - # Maintain dependencies for Python scripts - - package-ecosystem: "pip" - directory: "/.github/scripts" - schedule: - interval: "daily" diff --git a/.github/scripts/requirements.txt b/.github/scripts/requirements.txt deleted file mode 100644 index 1e6548c..0000000 --- a/.github/scripts/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -check-jsonschema>=0.28.2 diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 6307561..f846ef3 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -38,7 +38,6 @@ jobs: uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: fetch-depth: 0 - submodules: true - name: "Download actionlint" run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0a91e48..9af23d2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -40,7 +40,6 @@ jobs: uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: fetch-depth: 0 - submodules: true - name: "Check Markdown documents" uses: DavidAnson/markdownlint-cli2-action@b4c9feab76d8025d1e83c653fa3990936df0e6c8 # v16.0.0 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b996948..c156a3a 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,24 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 -name: Lint Schema +name: Lint on: - workflow_dispatch: push: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow pull_request: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow env: LC_ALL: en_US.UTF-8 @@ -33,6 +38,25 @@ permissions: jobs: lint: runs-on: ubuntu-latest + name: "${{ matrix.lint.name }}" + strategy: + fail-fast: false + matrix: + lint: + - name: "jsonschema" + commands: | + tox -e jsonschema + - name: "ruff" + commands: | + tox -e ruff -- check + - name: "pylint" + commands: | + echo "::add-matcher::.github/workflows/matchers/pylint.json" + tox -e pylint + - name: "mypy" + commands: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tox -e mypy steps: - name: "Harden Runner" uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 @@ -49,24 +73,13 @@ jobs: with: python-version: "3.11" - - name: "Install Python Packages" + - name: "Install tox" run: | - pip install -r .github/scripts/requirements.txt + python -m pip install --upgrade pip + python -m pip install tox tox-gh - - name: "Find changed schema files" - id: changed-files - uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2 - with: - files: | - v*/**/*.json - - - name: "Check changed schema file contents" - if: steps.changed-files.outputs.any_changed == 'true' - run: | - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema ${{ steps.changed-files.outputs.all_changed_files }} - - - name: "Check all schema file contents" - if: steps.changed-files.outputs.any_changed != 'true' + - name: "${{ matrix.lint.name }}" run: | - # shellcheck disable=SC2046 - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema $(find v* -name "*.json") + ${{ matrix.lint.commands }} + env: + RUFF_OUTPUT_FORMAT: github diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000..f048fce --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/pylint.json b/.github/workflows/matchers/pylint.json new file mode 100644 index 0000000..5624ca6 --- /dev/null +++ b/.github/workflows/matchers/pylint.json @@ -0,0 +1,32 @@ +{ + "problemMatcher": [ + { + "owner": "pylint-error", + "severity": "error", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + }, + { + "owner": "pylint-warning", + "severity": "warning", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..8cc77da --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Build, test, and upload PyPI package + +on: + push: + branches: + - "main" + tags: + - "v*" + pull_request: + branches: + - "main" + release: + types: + - published + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + # Create and verify release artifacts + # - build source dist (tar ball) and wheel + # - validate artifacts with various tools + # - upload artifacts to GHA + build-package: + name: Build and check packages + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + + - name: "Checkout" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + # for setuptools-scm + fetch-depth: 0 + + - name: "Build and Inspect" + uses: hynek/build-and-inspect-python-package@b4fc3f6ba2b3da04f09659be99e2a29fb6146a61 # v2.6.0 + + # push to Test PyPI on + # - a new GitHub release is published + # - a PR is merged into main branch + publish-test-pypi: + name: Publish packages to test.pypi.org + # environment: publish-test-pypi + if: ${{ (github.repository_owner == 'instructlab') && ((github.event.action == 'published') || ((github.event_name == 'push') && (github.ref == 'refs/heads/main'))) }} + permissions: + contents: read + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Upload to Test PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 + with: + repository-url: https://test.pypi.org/legacy/ + + # push to Production PyPI on + # - a new GitHub release is published + publish-pypi: + name: Publish release to pypi.org + # environment: publish-pypi + if: ${{ (github.repository_owner == 'instructlab') && (github.event.action == 'published') }} + permissions: + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + # allow gh release upload + contents: write + + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Sigstore sign package" + uses: sigstore/gh-action-sigstore-python@61f6a500bbfdd9a2a339cf033e5421951fbc1cd2 # v2.1.1 + with: + inputs: | + ./dist/*.tar.gz + ./dist/*.whl + + - name: "Upload artifacts and signatures to GitHub release" + run: | + gh release upload '${{ github.ref_name }}' dist/* --repo '${{ github.repository }}' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # PyPI does not accept .sigstore artifacts and + # gh-action-pypi-publish has no option to ignore them. + - name: "Remove sigstore signatures before uploading to PyPI" + run: | + rm ./dist/*.sigstore + + - name: "Upload to PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..c398350 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Test + +on: + push: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + pull_request: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + test: + name: "${{ matrix.python }} on ${{ matrix.platform }}" + runs-on: "${{ matrix.platform }}" + strategy: + matrix: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + platform: + - "ubuntu-latest" + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Checkout" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + + - name: "Setup Python ${{ matrix.python }}" + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ matrix.python }} + + - name: "Install tox" + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh + + - name: "Unit tests" + run: | + tox diff --git a/.gitignore b/.gitignore index 701ff28..7fd12a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# generated by setuptools_scm +/src/instructlab/schema/_version.py + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + # Environments .env .venv @@ -11,5 +40,9 @@ venv.bak/ .vscode/ .idea/ +# Caches +.tox/ +.*_cache/ + # Mac personalization files .DS_Store diff --git a/README.md b/README.md index 7756104..d5d5953 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ # Taxonomy Schema -This repository defines the JSON schema for the [Taxonomy](https://github.com/instructlab/taxonomy) YAML. +This Python package defines the JSON schema for the InstructLab [Taxonomy](https://github.com/instructlab/taxonomy) YAML. + +Consumers of this schema can `pip install instructlab-schema`, and access the schema files using `importlib.resources` on the `instructlab.schema` package. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..05e69b5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 + +[build-system] +requires = ["setuptools>=64", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" + +[project] +name = "instructlab-schema" +authors = [{ name = "InstructLab", email = "dev@instructlab.ai" }] +description = "InstructLab Taxonomy Schema" +readme = "README.md" +license = { text = "Apache-2.0" } +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dynamic = ["dependencies", "optional-dependencies", "version"] + +[project.urls] +homepage = "https://instructlab.ai" +source = "https://github.com/instructlab/schema" +issues = "https://github.com/instructlab/schema/issues" + +[tool.setuptools_scm] +version_file = "src/instructlab/schema/_version.py" +local_scheme = "no-local-version" # do not include +gREV local version, required for Test PyPI upload + +[tool.mypy] +python_version = "3.9" +exclude = ["^src/instructlab/schema/_version\\.py$"] + +[tool.ruff] +target-version = "py39" +src = ["src", "tests"] +extend-exclude = ["src/instructlab/schema/_version.py"] + +[tool.ruff.lint] +select = [ + "B", # flake8-bugbear + "E", # pycodestyle + "F", # Pyflakes + "Q", # flake8-quotes + "I", # isort + "UP", # pyupgrade + "SIM", # flake8-simplify + "TID", # flake8-tidy-imports +] + +[tool.pylint.main] +py-version = "3.9" +source-roots = ["src", "tests"] +ignore = ["_version.py"] + +[tool.pylint."messages control"] +disable = [ + "missing-class-docstring", + "missing-module-docstring", + "missing-function-docstring", +] + +[tool.pylint.reports] +reports = true +score = true diff --git a/scripts/ruff.sh b/scripts/ruff.sh new file mode 100755 index 0000000..6bf131f --- /dev/null +++ b/scripts/ruff.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +set -e + +# wrapper to combine ruff check and ruff format +# +# "ruff.sh fix" runs fixes and reformats the code +# "ruff.sh check" checks style, format, and imports +# "ruff.sh " passes abitrary args to ruff + +if [ -z "$1" ]; then + echo "USAGE: $0 [check|fix|]" >&2 + exit 2 +fi + +run() { + declare -i err + + echo "RUN: '$*'" + "$@" + err=$? + echo + return $err +} + +case $1 in + "check") + declare -i exitcode=0 + + set +e + run ruff check --diff + exitcode=$(( exitcode + $? )) + + run ruff format --check + exitcode=$(( exitcode + $? )) + set -e + + if [ $exitcode -ne 0 ]; then + echo "ERROR: one or more checks have failed." >&2 + echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 + exit 3 + fi + ;; + "fix") + run ruff check --fix + run ruff format + ;; + *) + ruff "$@" +esac diff --git a/src/instructlab/schema/__init__.py b/src/instructlab/schema/__init__.py new file mode 100644 index 0000000..5ac6fdc --- /dev/null +++ b/src/instructlab/schema/__init__.py @@ -0,0 +1,25 @@ +"""InstructLab Taxonomy Schema""" + +# Standard +from importlib import resources + +try: + from importlib.resources.abc import Traversable # type: ignore[import-not-found] +except ImportError: # python>=3.9,<3.11 + from importlib.abc import Traversable + +__all__ = ["schema_versions"] + + +def schema_versions() -> list[Traversable]: + """Return the sorted list of schema versions. + + Returns: + list[Traversable]: A sorted list of schema versions. + """ + schema_base = resources.files(__package__) + versions = sorted( + (v for v in schema_base.iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), + key=lambda k: int(k.name[1:]), + ) + return versions diff --git a/src/instructlab/schema/py.typed b/src/instructlab/schema/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/instructlab/schema/v1/__init__.py b/src/instructlab/schema/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v1/compositional_skills.json b/src/instructlab/schema/v1/compositional_skills.json similarity index 100% rename from v1/compositional_skills.json rename to src/instructlab/schema/v1/compositional_skills.json diff --git a/v1/knowledge.json b/src/instructlab/schema/v1/knowledge.json similarity index 100% rename from v1/knowledge.json rename to src/instructlab/schema/v1/knowledge.json diff --git a/v1/version.json b/src/instructlab/schema/v1/version.json similarity index 100% rename from v1/version.json rename to src/instructlab/schema/v1/version.json diff --git a/src/instructlab/schema/v2/__init__.py b/src/instructlab/schema/v2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v2/compositional_skills.json b/src/instructlab/schema/v2/compositional_skills.json similarity index 100% rename from v2/compositional_skills.json rename to src/instructlab/schema/v2/compositional_skills.json diff --git a/v2/knowledge.json b/src/instructlab/schema/v2/knowledge.json similarity index 100% rename from v2/knowledge.json rename to src/instructlab/schema/v2/knowledge.json diff --git a/v2/version.json b/src/instructlab/schema/v2/version.json similarity index 100% rename from v2/version.json rename to src/instructlab/schema/v2/version.json diff --git a/tests/test_versions.py b/tests/test_versions.py new file mode 100644 index 0000000..ed08865 --- /dev/null +++ b/tests/test_versions.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import json +from importlib import resources + +# Third Party +from referencing import Resource +from referencing.jsonschema import DRAFT202012 + +from instructlab.schema import schema_versions + + +class TestVersions: + def test_versions(self): + versions = schema_versions() + assert versions is not None + assert len(versions) > 1 + for i, v in enumerate(versions): + assert v.name == f"v{i+1}" + + def _load_schema(self, path): + text = path.read_text(encoding="utf-8") + assert text + assert len(text) > 1 + contents = json.loads(text) + assert contents + assert len(contents) > 1 + resource = Resource.from_contents( + contents=contents, default_specification=DRAFT202012 + ) + assert resource + assert resource.contents == contents + + def test_import_schema_base(self): + schema_base = resources.files("instructlab.schema") + for i in range(len(schema_versions())): + schema_version = schema_base.joinpath(f"v{i+1}") + for schema_name in ("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) + + def test_import_schema_versions(self): + for i in range(len(schema_versions())): + schema_version = resources.files(f"instructlab.schema.v{i+1}") + for schema_name in ("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..2847d8a --- /dev/null +++ b/tox.ini @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 + +[tox] +# py3-unit runs unit tests with 'python3' +# py311-unit runs the same tests with 'python3.11' +envlist = ruff, pylint, mypy, jsonschema, py3-unit +minversion = 4.4 + +[testenv] +description = Run tests (unit) +package = wheel +wheel_build_env = pkg +deps = + pytest + jsonschema +commands = + unit: {envpython} -m pytest {posargs:tests} + +[testenv:pylint] +description = Lint with pylint +deps = + pylint + jsonschema +commands = + {envpython} -m pylint {posargs:src tests} + +[testenv:ruff] +description = Reformat and fix code with Ruff +skip_install = True +skipsdist = true +deps = + ruff + jsonschema +commands = + ./scripts/ruff.sh {posargs:fix} +allowlist_externals = ./scripts/ruff.sh + +[testenv:mypy] +description = Python type checking with mypy +namespace_packages = True +explicit_package_bases = True +deps = + mypy + jsonschema +commands = + {envpython} -m mypy {posargs:src tests} + +[testenv:jsonschema] +description = JSON schema file validation with check-jsonschema +skip_install = True +skipsdist = true +deps = + check-jsonschema +commands = + bash -c "{envpython} -m check_jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema {posargs:$(find src/instructlab/schema/v* -name \"*.json\" -print)}" +allowlist_externals = bash + +[gh] +python = + 3.12 = py312-unit + 3.11 = py311-unit + 3.10 = py310-unit + 3.9 = py39-unit