Skip to content

Commit

Permalink
Replace html5lib with html5lib-modern (#2911)
Browse files Browse the repository at this point in the history
* Replace html5lib with html5lib-modern; this removes another source of the `six` dependency.

* Fix import sorting

* Remove "html" extras installation from tests.
  • Loading branch information
ashleysommer authored Oct 1, 2024
1 parent a21b96d commit 0b69f4f
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ build:
# the readthedocs environment.
- pip install -r devtools/requirements-poetry.in
post_install:
- poetry export --only=main --only=docs --extras=html -o requirements.txt
- poetry export --only=main --only=docs -o requirements.txt
- pip install --no-cache-dir -r requirements.txt
- pip install .
- python -c "from rdflib import Graph; print(Graph)"
Expand Down
5 changes: 1 addition & 4 deletions docker/latest/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --config=pyproject.toml docker/latest/requirements.in
#
html5lib==1.1
html5lib-modern==1.2
# via -r docker/latest/requirements.in
isodate==0.6.1
# via rdflib
Expand All @@ -14,7 +14,4 @@ rdflib==7.0.0
# via -r docker/latest/requirements.in
six==1.16.0
# via
# html5lib
# isodate
webencodings==0.5.1
# via html5lib
39 changes: 11 additions & 28 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ isodate = "^0.6.0"
pyparsing = ">=2.1.0,<4"
berkeleydb = {version = "^18.1.0", optional = true}
networkx = {version = ">=2,<4", optional = true}
html5lib = {version = "^1.0", optional = true}
html5lib-modern = "^1.2"
lxml = {version = ">=4.3,<6.0", optional = true}
orjson = {version = ">=3.9.14,<4", optional = true}

Expand Down Expand Up @@ -73,7 +73,6 @@ ruff = ">=0.0.286,<0.7.0"
[tool.poetry.extras]
berkeleydb = ["berkeleydb"]
networkx = ["networkx"]
html = ["html5lib"]
lxml = ["lxml"]
orjson = ["orjson"]

Expand Down
34 changes: 10 additions & 24 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
from urllib.parse import urldefrag, urljoin, urlparse
from uuid import uuid4

import html5lib
from isodate import (
Duration,
duration_isoformat,
Expand All @@ -83,14 +84,6 @@
from .namespace import NamespaceManager
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath

_HAS_HTML5LIB = False

try:
import html5lib

_HAS_HTML5LIB = True
except ImportError:
html5lib = None

_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"

Expand Down Expand Up @@ -1677,7 +1670,11 @@ def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
)
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
try:
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
except html5lib.html5parser.ParseError as e:
logger.info(f"Failed to parse HTML: {e}")
raise e
result.normalize()
return result

Expand Down Expand Up @@ -2007,20 +2004,13 @@ def _castPythonToLiteral( # noqa: N802
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

if html5lib is not None:
# This is a bit dirty: by accident the html5lib parser produces
# DocumentFragments and the xml parser produces Documents. Letting this
# decide what datatype to use makes roundtripping easier, but it is a
# bit random.
#
# This must happen before _GenericPythonToXSDRules is assigned to
# _OriginalGenericPythonToXSDRules.
_GenericPythonToXSDRules.append(
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
)
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)

Expand Down Expand Up @@ -2071,14 +2061,10 @@ def _castPythonToLiteral( # noqa: N802
URIRef(_XSD_PFX + "double"): float,
URIRef(_XSD_PFX + "base64Binary"): b64decode,
URIRef(_XSD_PFX + "anyURI"): None,
_RDF_HTMLLITERAL: _parse_html,
_RDF_XMLLITERAL: _parseXML,
}

if html5lib is not None:
# It is probably best to keep this close to the definition of
# _GenericPythonToXSDRules so nobody misses it.
XSDToPython[_RDF_HTMLLITERAL] = _parse_html

_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
Expand Down
7 changes: 1 addition & 6 deletions test/test_literal/test_literal_html5lib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import xml.dom.minidom
from typing import Callable

import html5lib # noqa: F401
import pytest

import rdflib.term
Expand All @@ -9,14 +10,8 @@
from test.utils.literal import LiteralChecker
from test.utils.outcome import OutcomeChecker, OutcomePrimitives

try:
import html5lib as _ # noqa: F401
except ImportError:
pytest.skip("html5lib not installed", allow_module_level=True)


def test_has_html5lib() -> None:
assert rdflib.term._HAS_HTML5LIB is True
assert RDF.HTML in rdflib.term.XSDToPython
rule = next(
(
Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ setenv =
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
MYPY_CACHE_DIR = {envdir}/.mypy_cache
docs: POETRY_ARGS_docs = --only=docs
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
lxml: POETRY_ARGS_lxml = --extras=lxml
commands_pre =
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
Expand Down Expand Up @@ -59,7 +59,7 @@ setenv =
PYTHONHASHSEED = 0
commands_pre =
poetry lock --check
poetry install --only=main --only=docs --extras=html
poetry install --only=main --only=docs
poetry env info
commands =
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html
Expand Down

0 comments on commit 0b69f4f

Please sign in to comment.