Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace html5lib with html5rdf, make it an optional dependency #2951

Merged
merged 8 commits into from
Oct 28, 2024
2 changes: 1 addition & 1 deletion devtools/constraints.min
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ pyparsing==2.1.0
importlib-metadata==4.0.0
berkeleydb==18.1.2
networkx==2.0
html5lib-modern==1.2.0
html5rdf==1.2.0
lxml==4.3.0
orjson==3.9.14
4 changes: 2 additions & 2 deletions docker/latest/requirements.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This file is used for building a docker image of the latest rdflib release. It
# will be updated by dependabot when new releases are made.
rdflib==7.1.0
html5rdf==1.2.0
# html5lib-modern is required to allow the Dockerfile to build with pre-RDFLib-7.1.1 releases.
html5lib-modern==1.2.0
# isodate is required to allow the Dockerfile to build with pre-RDFLib-7.1 releases.
isodate==0.7.2
4 changes: 2 additions & 2 deletions docker/latest/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
#
# pip-compile docker/latest/requirements.in
#
html5lib-modern==1.2
html5rdf==1.2
# via
# -r docker/latest/requirements.in
# rdflib
isodate==0.7.2
html5lib-modern==1.2
# via -r docker/latest/requirements.in
pyparsing==3.0.9
# via rdflib
Expand Down
12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ isodate = {version=">=0.7.2,<1.0.0", python = "<3.11"}
pyparsing = ">=2.1.0,<4"
berkeleydb = {version = "^18.1.0", optional = true}
networkx = {version = ">=2,<4", optional = true}
html5lib-modern = "^1.2"
html5rdf = {version = ">=1.2,<2", optional = true}
lxml = {version = ">=4.3,<6.0", optional = true}
orjson = {version = ">=3.9.14,<4", optional = true}

Expand Down Expand Up @@ -74,6 +74,9 @@ ruff = ">=0.0.286,<0.8.0"
[tool.poetry.extras]
berkeleydb = ["berkeleydb"]
networkx = ["networkx"]
# html support is optional, it is used only in tokenizing `rdf:HTML` type Literals
html = ["html5rdf"]
# lxml support is optional, it is used only for parsing XML-formatted SPARQL results
lxml = ["lxml"]
orjson = ["orjson"]

Expand Down
73 changes: 53 additions & 20 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@
from urllib.parse import urldefrag, urljoin, urlparse
from uuid import uuid4

import html5lib

import rdflib
import rdflib.util
from rdflib.compat import long_type
Expand All @@ -86,6 +84,14 @@
from .namespace import NamespaceManager
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath

_HAS_HTML5RDF = False

try:
import html5rdf

_HAS_HTML5RDF = True
except ImportError:
html5rdf = None

_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"

Expand Down Expand Up @@ -1107,7 +1113,7 @@ def __gt__(self, other: Any) -> bool:
if other is None:
return True # Everything is greater than None
if isinstance(other, Literal):
# Fast path for comapring numeric literals
# Fast path for comparing numeric literals
# that are not ill-typed and don't have a None value
if (
(
Expand Down Expand Up @@ -1350,9 +1356,15 @@ def eq(self, other: Any) -> bool:

"""
if isinstance(other, Literal):
# Fast path for comparing numeric literals
# that are not ill-typed and don't have a None value
if (
self.datatype in _NUMERIC_LITERAL_TYPES
and other.datatype in _NUMERIC_LITERAL_TYPES
(
self.datatype in _NUMERIC_LITERAL_TYPES
and other.datatype in _NUMERIC_LITERAL_TYPES
)
and ((not self.ill_typed) and (not other.ill_typed))
and (self.value is not None and other.value is not None)
):
if self.value is not None and other.value is not None:
return self.value == other.value
Expand All @@ -1374,6 +1386,16 @@ def eq(self, other: Any) -> bool:
# string/plain literals, compare on lexical form
return str.__eq__(self, other)

# XML can be compared to HTML, only if html5rdf is enabled
if (
(dtself in _XML_COMPARABLE and dtother in _XML_COMPARABLE)
and
# Ill-typed can be None if unknown, but we don't want it to be True.
((self.ill_typed is not True) and (other.ill_typed is not True))
and (self.value is not None and other.value is not None)
):
return _isEqualXMLNode(self.value, other.value)

if dtself != dtother:
if rdflib.DAWG_LITERAL_COLLATION:
raise TypeError(
Expand All @@ -1387,9 +1409,6 @@ def eq(self, other: Any) -> bool:
# maybe there are counter examples

if self.value is not None and other.value is not None:
if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
return _isEqualXMLNode(self.value, other.value)

return self.value == other.value
else:
if str.__eq__(self, other):
Expand Down Expand Up @@ -1668,19 +1687,19 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
"""
Parse the lexical form of an HTML literal into a document fragment
using the ``dom`` from html5lib tree builder.
using the ``dom`` from html5rdf tree builder.

:param lexical_form: The lexical form of the HTML literal.
:return: A document fragment representing the HTML literal.
:raises: `html5lib.html5parser.ParseError` if the lexical form is
:raises: `html5rdf.html5parser.ParseError` if the lexical form is
not valid HTML.
"""
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
parser = html5rdf.HTMLParser(
tree=html5rdf.treebuilders.getTreeBuilder("dom"), strict=True
)
try:
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
except html5lib.html5parser.ParseError as e:
except html5rdf.html5parser.ParseError as e:
logger.info(f"Failed to parse HTML: {e}")
raise e
result.normalize()
Expand All @@ -1695,7 +1714,7 @@ def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
:param value: A document fragment representing an HTML literal.
:return: The lexical form of the HTML literal.
"""
result = html5lib.serialize(value, tree="dom")
result = html5rdf.serialize(value, tree="dom")
return result


Expand Down Expand Up @@ -2012,14 +2031,21 @@ def _castPythonToLiteral( # noqa: N802
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
# This is a bit dirty, by accident the html5lib parser produces
# DocumentFragments, and the xml parser Documents, letting this
# decide what datatype to use makes roundtripping easier, but it a
# bit random.
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
(Fraction, (None, _OWL_RATIONAL)),
]

if html5rdf is not None:
# This is a bit dirty, by accident the html5rdf parser produces
# DocumentFragments, and the xml parser Documents, letting this
# decide what datatype to use makes roundtripping easier, but it's a
# bit random.

# This must happen before _GenericPythonToXSDRules is assigned to
# _OriginalGenericPythonToXSDRules.
_GenericPythonToXSDRules.append(
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
)

_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)

_SpecificPythonToXSDRules: List[
Expand Down Expand Up @@ -2069,10 +2095,17 @@ def _castPythonToLiteral( # noqa: N802
URIRef(_XSD_PFX + "double"): float,
URIRef(_XSD_PFX + "base64Binary"): b64decode,
URIRef(_XSD_PFX + "anyURI"): None,
_RDF_HTMLLITERAL: _parse_html,
_RDF_XMLLITERAL: _parseXML,
}

if html5rdf is not None:
# It is probably best to keep this close to the definition of
# _GenericPythonToXSDRules so nobody misses it.
XSDToPython[_RDF_HTMLLITERAL] = _parse_html
_XML_COMPARABLE: Tuple[URIRef, ...] = (_RDF_XMLLITERAL, _RDF_HTMLLITERAL)
else:
_XML_COMPARABLE = (_RDF_XMLLITERAL,)

_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
Expand Down
10 changes: 5 additions & 5 deletions test/test_literal/test_literal.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@


try:
import html5lib as _ # noqa: F401
import html5rdf as _ # noqa: F401

_HAVE_HTML5LIB = True
_HAVE_HTML5RDF = True
except ImportError:
_HAVE_HTML5LIB = False
_HAVE_HTML5RDF = False

import pytest

Expand Down Expand Up @@ -981,7 +981,7 @@ def __eq__(self, __value: object) -> bool:
(
lambda: Literal("<body>", datatype=RDF.HTML),
LiteralChecker(
..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
..., None, RDF.HTML, True if _HAVE_HTML5RDF else None, "<body>"
),
),
(
Expand All @@ -990,7 +990,7 @@ def __eq__(self, __value: object) -> bool:
...,
None,
RDF.HTML,
False if _HAVE_HTML5LIB else None,
False if _HAVE_HTML5RDF else None,
"<table></table>",
),
),
Expand Down
13 changes: 9 additions & 4 deletions test/test_literal/test_literal_html5lib.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import xml.dom.minidom
from typing import Callable

import html5lib # noqa: F401
import pytest

import rdflib.term
Expand All @@ -10,8 +9,14 @@
from test.utils.literal import LiteralChecker
from test.utils.outcome import OutcomeChecker, OutcomePrimitives

try:
import html5rdf as _ # noqa: F401
except ImportError:
pytest.skip("html5rdf not installed", allow_module_level=True)

def test_has_html5lib() -> None:

def test_has_html5rdf() -> None:
assert rdflib.term._HAS_HTML5RDF is True
assert RDF.HTML in rdflib.term.XSDToPython
rule = next(
(
Expand All @@ -29,7 +34,7 @@ def test_has_html5lib() -> None:
["factory", "outcome"],
[
# Ill-typed literals, these have lexical forms that result in
# errors when parsed as HTML by html5lib.
# errors when parsed as HTML by html5rdf.
(
lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
LiteralChecker(
Expand All @@ -47,7 +52,7 @@ def test_has_html5lib() -> None:
),
),
# Well-typed literals, these have lexical forms that parse
# without errors with html5lib.
# without errors with html5rdf.
(
lambda: Literal("<table></table>", datatype=RDF.HTML),
LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
Expand Down
12 changes: 6 additions & 6 deletions test/test_literal/test_xmlliterals.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from rdflib import RDF, Literal

try:
import html5lib # noqa: F401
import html5rdf # noqa: F401

have_html5lib = True
have_html5rdf = True
except ImportError:
have_html5lib = False
have_html5rdf = False


def testPythonRoundtrip(): # noqa: N802
Expand Down Expand Up @@ -90,7 +90,7 @@ def testRoundtrip(): # noqa: N802
roundtrip("nt")


@pytest.mark.skipif(not have_html5lib, reason="requires html5lib")
@pytest.mark.skipif(not have_html5rdf, reason="requires html5rdf")
def testHTML(): # noqa: N802
l1 = Literal("<msg>hello</msg>", datatype=RDF.XMLLiteral)
assert l1.value is not None, "xml must have been parsed"
Expand Down Expand Up @@ -126,7 +126,7 @@ def testHTML(): # noqa: N802
textwrap.dedent(
"""\
<!DOCTYPE example>
<something/>
<something2/>
"""
)
),
Expand All @@ -137,7 +137,7 @@ def testHTML(): # noqa: N802
textwrap.dedent(
"""\
<!DOCTYPE example>
<something />
<something2 />
"""
)
),
Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ setenv =
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
MYPY_CACHE_DIR = {envdir}/.mypy_cache
docs: POETRY_ARGS_docs = --only=docs
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
lxml: POETRY_ARGS_lxml = --extras=lxml
commands_pre =
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
Expand Down Expand Up @@ -59,7 +59,7 @@ setenv =
PYTHONHASHSEED = 0
commands_pre =
poetry lock --check
poetry install --only=main --only=docs
poetry install --only=main --only=docs --extras=html
poetry env info
commands =
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html
Expand Down
Loading