diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 357ab543b..7c273806c 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -21,7 +21,7 @@ build: # the readthedocs environment. - pip install -r devtools/requirements-poetry.in post_install: - - poetry export --only=main --only=docs --extras=html -o requirements.txt + - poetry export --only=main --only=docs -o requirements.txt - pip install --no-cache-dir -r requirements.txt - pip install . - python -c "from rdflib import Graph; print(Graph)" diff --git a/docker/latest/requirements.txt b/docker/latest/requirements.txt index aeaa35cea..80c3106b0 100644 --- a/docker/latest/requirements.txt +++ b/docker/latest/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --config=pyproject.toml docker/latest/requirements.in # -html5lib==1.1 +html5lib-modern==1.2 # via -r docker/latest/requirements.in isodate==0.6.1 # via rdflib @@ -14,7 +14,4 @@ rdflib==7.0.0 # via -r docker/latest/requirements.in six==1.16.0 # via - # html5lib # isodate -webencodings==0.5.1 - # via html5lib diff --git a/poetry.lock b/poetry.lock index 239177a11..c6a0cde8c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -314,25 +314,21 @@ files = [ test = ["pytest (>=6)"] [[package]] -name = "html5lib" -version = "1.1" +name = "html5lib-modern" +version = "1.2" description = "HTML parser based on the WHATWG HTML specification" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +optional = false +python-versions = ">=3.8" files = [ - {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, - {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, + {file = "html5lib_modern-1.2-py2.py3-none-any.whl", hash = "sha256:3458b6e31525ede4fcaac0ff42d9eeb5efaf755473768103cb56e0275caa8d99"}, + {file = "html5lib_modern-1.2.tar.gz", hash = 
"sha256:1fadbfc27ea955431270e4e79a4a4c290ba11c3a3098a95cc22dc73e312a1768"}, ] -[package.dependencies] -six = ">=1.9" -webencodings = "*" - [package.extras] -all = ["chardet (>=2.2)", "genshi", "lxml"] -chardet = ["chardet (>=2.2)"] -genshi = ["genshi"] -lxml = ["lxml"] +all = ["chardet (>=2.2.1)", "genshi (>=0.7.1)", "lxml (>=3.4.0)"] +chardet = ["chardet (>=2.2.1)"] +genshi = ["genshi (>=0.7.1)"] +lxml = ["lxml (>=3.4.0)"] [[package]] name = "idna" @@ -1043,7 +1039,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -1372,17 +1367,6 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "webencodings" -version = "0.5.1" -description = "Character encoding aliases for legacy web content" -optional = true -python-versions = "*" -files = [ - {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, - {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, -] - [[package]] name = "wheel" version = "0.44.0" @@ -1414,7 +1398,6 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] berkeleydb = ["berkeleydb"] -html = ["html5lib"] lxml = ["lxml"] networkx = ["networkx"] orjson = ["orjson"] @@ -1422,4 +1405,4 @@ orjson = ["orjson"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "af70ace5117249eb0000f2e5e5b9fb1b705b995a286e3791a5601dcf0cb45dc8" +content-hash = "8ad16d001c8cbd7ecd6516ee5997432868618f4dc31e89d646a54a065919269f" diff --git a/pyproject.toml b/pyproject.toml index ab1d950bb..62a0085b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ isodate = "^0.6.0" pyparsing = ">=2.1.0,<4" berkeleydb = {version = "^18.1.0", optional = true} networkx = {version = ">=2,<4", optional = true} -html5lib = {version = "^1.0", optional = true} +html5lib-modern = "^1.2" lxml = {version = ">=4.3,<6.0", optional = true} orjson = {version = ">=3.9.14,<4", optional = true} @@ -73,7 +73,6 @@ ruff = ">=0.0.286,<0.7.0" [tool.poetry.extras] berkeleydb = ["berkeleydb"] networkx = ["networkx"] -html = ["html5lib"] lxml = ["lxml"] orjson = ["orjson"] diff --git a/rdflib/term.py b/rdflib/term.py index 4f9cd3951..0a5f7c52c 100644 --- a/rdflib/term.py +++ b/rdflib/term.py @@ -66,6 +66,7 @@ from urllib.parse import urldefrag, urljoin, urlparse from uuid import uuid4 +import html5lib from isodate import ( Duration, duration_isoformat, @@ -83,14 +84,6 @@ from .namespace 
import NamespaceManager from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath -_HAS_HTML5LIB = False - -try: - import html5lib - - _HAS_HTML5LIB = True -except ImportError: - html5lib = None _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io" @@ -1677,7 +1670,11 @@ def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment: parser = html5lib.HTMLParser( tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True ) - result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form) + try: + result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form) + except html5lib.html5parser.ParseError as e: + logger.info("Failed to parse HTML: %s", e) + raise result.normalize() return result @@ -2007,20 +2004,13 @@ def _castPythonToLiteral( # noqa: N802 (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)), (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)), (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)), - (Fraction, (None, _OWL_RATIONAL)), -] - -if html5lib is not None: # This is a bit dirty, by accident the html5lib parser produces # DocumentFragments, and the xml parser Documents, letting this # decide what datatype to use makes roundtripping easier, but it a # bit random. - # - # This must happen before _GenericPythonToXSDRules is assigned to - # _OriginalGenericPythonToXSDRules. 
- _GenericPythonToXSDRules.append( - (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)) - ) + (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)), + (Fraction, (None, _OWL_RATIONAL)), +] _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules) @@ -2071,14 +2061,10 @@ def _castPythonToLiteral( # noqa: N802 URIRef(_XSD_PFX + "double"): float, URIRef(_XSD_PFX + "base64Binary"): b64decode, URIRef(_XSD_PFX + "anyURI"): None, + _RDF_HTMLLITERAL: _parse_html, _RDF_XMLLITERAL: _parseXML, } -if html5lib is not None: - # It is probably best to keep this close to the definition of - # _GenericPythonToXSDRules so nobody misses it. - XSDToPython[_RDF_HTMLLITERAL] = _parse_html - _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = { URIRef(_XSD_PFX + "boolean"): _well_formed_boolean, URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer, diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py index ce22568db..b77feb94b 100644 --- a/test/test_literal/test_literal_html5lib.py +++ b/test/test_literal/test_literal_html5lib.py @@ -1,6 +1,7 @@ import xml.dom.minidom from typing import Callable +import html5lib # noqa: F401 import pytest import rdflib.term @@ -9,14 +10,8 @@ from test.utils.literal import LiteralChecker from test.utils.outcome import OutcomeChecker, OutcomePrimitives -try: - import html5lib as _ # noqa: F401 -except ImportError: - pytest.skip("html5lib not installed", allow_module_level=True) - def test_has_html5lib() -> None: - assert rdflib.term._HAS_HTML5LIB is True assert RDF.HTML in rdflib.term.XSDToPython rule = next( ( diff --git a/tox.ini b/tox.ini index 9ec80d516..ef02ff4d2 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}} MYPY_CACHE_DIR = {envdir}/.mypy_cache docs: POETRY_ARGS_docs = --only=docs - extensive: POETRY_ARGS_extensive = 
--extras=berkeleydb --extras=networkx --extras=html --extras=orjson + extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson lxml: POETRY_ARGS_lxml = --extras=lxml commands_pre = py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))' @@ -59,7 +59,7 @@ setenv = PYTHONHASHSEED = 0 commands_pre = poetry lock --check - poetry install --only=main --only=docs --extras=html + poetry install --only=main --only=docs poetry env info commands = poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html