Skip to content

Commit

Permalink
Fixes eager doctype regex matching when doctype is not followed by a newline.
Browse files Browse the repository at this point in the history
  • Loading branch information
UVMvmfee committed Dec 7, 2024
1 parent 7067937 commit 7671f3d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
7 changes: 7 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@ def test_input():
== '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
)

htmlstring = '<!DOCTYPE html><html><head></head><body>Foo <br/> Bar</body></html>'
beginning = htmlstring[:50].lower()
assert (
repair_faulty_html(htmlstring, beginning)
== '<!DOCTYPE html><html><head></head><body>Foo <br/> Bar</body></html>\n'
)

with pytest.raises(TypeError) as err:
assert load_html(123) is None
assert 'incompatible' in str(err.value)
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@

UNICODE_ALIASES = {'utf-8', 'utf_8'}

- DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
+ DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE[^>]*/[^<]*>", re.I)
FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)
HTML_STRIP_TAGS = re.compile(r'(<!--.*?-->|<[^>]*>)')

Expand Down

0 comments on commit 7671f3d

Please sign in to comment.