Skip to content

Commit

Permalink
html fix for lxml v5+
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 15, 2024
1 parent 2c03f06 commit 8ed8b2c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 9 deletions.
19 changes: 15 additions & 4 deletions htmldate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,23 @@ def is_dubious_html(beginning: str) -> bool:
return "html" not in beginning


def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
"Repair faulty doctype strings to make then palatable for libxml2."
def repair_faulty_html(htmlstring: str, beginning: str) -> str:
"Repair faulty HTML strings to make then palatable for libxml2."
# libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
if "doctype" in beginning:
firstline, _, rest = htmlstring.partition("\n")
return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
# other issue with malformed documents
i = 0
replace = False
for line in iter(htmlstring.splitlines()):
if line.startswith("<html") and line.endswith("/>"):
replace = True
i += 1
if i > 3:
break
if replace:
htmlstring = re.sub(r"(<html.+?)\s*/>", r"\1>", htmlstring, count=1)
return htmlstring


Expand Down Expand Up @@ -215,7 +226,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
beginning = htmlobject[:50].lower()
check_flag = is_dubious_html(beginning)
# repair first
htmlobject = strip_faulty_doctypes(htmlobject, beginning)
htmlobject = repair_faulty_html(htmlobject, beginning)
# first pass: use Unicode string
fallback_parse = False
try:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ def get_version(package):
"charset_normalizer >= 3.3.2; python_version >= '3.7'",
"dateparser >= 1.1.2", # 1.1.3+ slower
# see tests on Github Actions
"lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
"lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml >= 5.1.0, < 6; platform_system != 'Darwin' or python_version > '3.8'",
"python-dateutil >= 2.8.2",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
"urllib3 >= 1.26, < 3; python_version >= '3.7'",
Expand Down
13 changes: 10 additions & 3 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
fetch_url,
is_dubious_html,
load_html,
strip_faulty_doctypes,
repair_faulty_html,
)
from htmldate.validators import (
convert_date,
Expand All @@ -83,12 +83,19 @@
def test_input():
"""test if loaded strings/trees are handled properly"""
assert is_dubious_html("This is a string.") is True

htmlstring = "<!DOCTYPE html PUBLIC />\n<html/>"
beginning = htmlstring[:50].lower()
assert strip_faulty_doctypes(htmlstring, beginning) == "\n<html/>"
assert repair_faulty_html(htmlstring, beginning) == "\n<html/>"

htmlstring = "<html>\n</html>"
beginning = htmlstring[:50].lower()
assert strip_faulty_doctypes(htmlstring, beginning) == htmlstring
assert repair_faulty_html(htmlstring, beginning) == htmlstring

#htmlstring = '<!DOCTYPE html>\n<html lang="en-US"/>\n<head/>\n<body/>\n</html>'
#beginning = htmlstring[:50].lower()
#assert repair_faulty_html("", beginning) == "\n<html/>"

with pytest.raises(TypeError) as err:
assert load_html(123) is None
assert "incompatible" in str(err.value)
Expand Down

0 comments on commit 8ed8b2c

Please sign in to comment.