Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependency beautifulsoup4 to v4.12.3 #11660

Merged
merged 2 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ext/bs4/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.12.2"
__copyright__ = "Copyright (c) 2004-2023 Leonard Richardson"
__version__ = "4.12.3"
__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

Expand Down
13 changes: 9 additions & 4 deletions ext/bs4/builder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,15 +514,19 @@ class DetectsXMLParsedAsHTML(object):
XML_PREFIX_B = b'<?xml'

@classmethod
def warn_if_markup_looks_like_xml(cls, markup):
def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
"""Perform a check on some markup to see if it looks like XML
that's not XHTML. If so, issue a warning.

This is much less reliable than doing the check while parsing,
but some of the tree builders can't do that.

:param stacklevel: The stacklevel of the code calling this
function.

:return: True if the markup looks like non-XHTML XML, False
otherwise.

"""
if isinstance(markup, bytes):
prefix = cls.XML_PREFIX_B
Expand All @@ -535,15 +539,16 @@ def warn_if_markup_looks_like_xml(cls, markup):
and markup.startswith(prefix)
and not looks_like_html.search(markup[:500])
):
cls._warn()
cls._warn(stacklevel=stacklevel+2)
return True
return False

@classmethod
def _warn(cls):
def _warn(cls, stacklevel=5):
"""Issue a warning about XML being parsed as HTML."""
warnings.warn(
XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
stacklevel=stacklevel
)

def _initialize_xml_detector(self):
Expand Down
4 changes: 3 additions & 1 deletion ext/bs4/builder/_html5lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ def prepare_markup(self, markup, user_specified_encoding,

# html5lib only parses HTML, so if it's given XML that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
markup, stacklevel=3
)

yield (markup, None, None, False)

Expand Down
2 changes: 1 addition & 1 deletion ext/bs4/builder/_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,10 +378,10 @@ def feed(self, markup):
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except AssertionError as e:
# html.parser raises AssertionError in rare cases to
# indicate a fatal problem with the markup, especially
# when there's an error in the doctype declaration.
raise ParserRejectedMarkup(e)
parser.close()
parser.already_closed_empty_element = []
4 changes: 3 additions & 1 deletion ext/bs4/builder/_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,9 @@ def prepare_markup(self, markup, user_specified_encoding=None,
self.processing_instruction_class = ProcessingInstruction
# We're in HTML mode, so if we're given XML, that's worth
# noting.
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
markup, stacklevel=3
)
else:
self.processing_instruction_class = XMLProcessingInstruction

Expand Down
7 changes: 6 additions & 1 deletion ext/bs4/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -1356,7 +1356,7 @@ def _clone(self):
This is the first step in the deepcopy process.
"""
clone = type(self)(
None, self.builder, self.name, self.namespace,
None, None, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
Expand Down Expand Up @@ -1845,6 +1845,11 @@ def _indent_string(self, s, indent_level, formatter,
return space_before + s + space_after

def _format_tag(self, eventual_encoding, formatter, opening):
if self.hidden:
# A hidden tag is invisible, although its contents
# are visible.
return ''

# A tag starts with the < character (see below).

# Then the / character, if this is a closing tag.
Expand Down
4 changes: 2 additions & 2 deletions ext/bs4/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(
void_element_close_prefix='/', cdata_containing_tags=None,
empty_attributes_are_booleans=False, indent=1,
):
"""Constructor.
r"""Constructor.

:param language: This should be Formatter.XML if you are formatting
XML markup and Formatter.HTML if you are formatting HTML markup.
Expand All @@ -76,7 +76,7 @@ def __init__(
negative, or "" will only insert newlines. Using a
positive integer indent indents that many spaces per
level. If indent is a string (such as "\t"), that string
is used to indent each level. The default behavior to
is used to indent each level. The default behavior is to
indent one space per level.
"""
self.language = language
Expand Down
2 changes: 1 addition & 1 deletion ext/bs4/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,7 @@ def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
xmlns:ns1="http://example.com/ns1"
xmlns:ns2="http://example.com/ns2"
xmlns:ns2="http://example.com/ns2">
<ns1:tag>foo</ns1:tag>
<ns1:tag>bar</ns1:tag>
<ns2:tag key="value">baz</ns2:tag>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ΓΏΓΏ ΓΏ <css
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ΓΏ ><applet></applet><applet></applet><apple|><applet><applet><appl›„><applet><applet></applet></applet></applet></applet><applet></applet><apple>t<applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet>et><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><azplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><a
pplet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><plet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><applet><app
let><applet><applet><applet><applet><applet><applet><applet><applet><applet></applet></applet></applet></applet></appt></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet></applet><<meta charset=utf-8>
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- ΓΏΓΏ <math><select><mi><select><select>t
Binary file not shown.
95 changes: 90 additions & 5 deletions ext/bs4/tests/test_fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,75 @@
BeautifulSoup,
ParserRejectedMarkup,
)
try:
from soupsieve.util import SelectorSyntaxError
import lxml
import html5lib
fully_fuzzable = True
except ImportError:
fully_fuzzable = False


@pytest.mark.skipif(not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed.")
class TestFuzz(object):

# Test case markup files from fuzzers are given this extension so
# they can be included in builds.
TESTCASE_SUFFIX = ".testcase"

# Copied 20230512 from
# https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
#
# Copying the code lets us precisely duplicate the behavior of
# oss-fuzz. The downside is that this code changes over time, so
# multiple copies of the code must be kept around to run against
# older tests. I'm not sure what to do about this, but I may
# retire old tests after a time.
def fuzz_test_with_css(self, filename):
data = self.__markup(filename)
parsers = ['lxml-xml', 'html5lib', 'html.parser', 'lxml']
try:
idx = int(data[0]) % len(parsers)
except ValueError:
return

css_selector, data = data[1:10], data[10:]

try:
soup = BeautifulSoup(data[1:], features=parsers[idx])
except ParserRejectedMarkup:
return
except ValueError:
return

list(soup.find_all(True))
try:
soup.css.select(css_selector.decode('utf-8', 'replace'))
except SelectorSyntaxError:
return
soup.prettify()

# This class of error has been fixed by catching a less helpful
# exception from html.parser and raising ParserRejectedMarkup
# instead.
@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
"crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a",
]
)
def test_rejected_markup(self, filename):
markup = self.__markup(filename)
with pytest.raises(ParserRejectedMarkup):
BeautifulSoup(markup, 'html.parser')

# This class of error has to do with very deeply nested documents
# which overflow the Python call stack when the tree is converted
# to a string. This is an issue with Beautiful Soup which was fixed
# as part of [bug=1471755].
#
# These test cases are in the older format that doesn't specify
# which parser to use or give a CSS selector.
@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
Expand All @@ -46,18 +91,44 @@ def test_rejected_markup(self, filename):
"clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
]
)
def test_deeply_nested_document(self, filename):
def test_deeply_nested_document_without_css(self, filename):
# Parsing the document and encoding it back to a string is
# sufficient to demonstrate that the overflow problem has
# been fixed.
markup = self.__markup(filename)
BeautifulSoup(markup, 'html.parser').encode()

# This class of error has to do with very deeply nested documents
# which overflow the Python call stack when the tree is converted
# to a string. This is an issue with Beautiful Soup which was fixed
# as part of [bug=1471755].
@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
"clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
"clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
]
)
def test_deeply_nested_document(self, filename):
self.fuzz_test_with_css(filename)

@pytest.mark.parametrize(
"filename", [
"clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
"clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
]
)
def test_soupsieve_errors(self, filename):
self.fuzz_test_with_css(filename)

# This class of error represents problems with html5lib's parser,
# not Beautiful Soup. I use
# https://github.com/html5lib/html5lib-python/issues/568 to notify
# the html5lib developers of these issues.
@pytest.mark.skip("html5lib problems")
#
# These test cases are in the older format that doesn't specify
# which parser to use or give a CSS selector.
@pytest.mark.skip(reason="html5lib-specific problems")
@pytest.mark.parametrize(
"filename", [
# b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
Expand All @@ -68,7 +139,7 @@ def test_deeply_nested_document(self, filename):

# b'-<math><sElect><mi><sElect><sElect>'
"clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",

# b'ñ<table><svg><html>'
"clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",

Expand All @@ -79,10 +150,24 @@ def test_deeply_nested_document(self, filename):
"crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
]
)
def test_html5lib_parse_errors(self, filename):
def test_html5lib_parse_errors_without_css(self, filename):
markup = self.__markup(filename)
print(BeautifulSoup(markup, 'html5lib').encode())

# This class of error represents problems with html5lib's parser,
# not Beautiful Soup. I use
# https://github.com/html5lib/html5lib-python/issues/568 to notify
# the html5lib developers of these issues.
@pytest.mark.skip(reason="html5lib-specific problems")
@pytest.mark.parametrize(
"filename", [
# b'- \xff\xff <math>\x10<select><mi><select><select>t'
"clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
]
)
def test_html5lib_parse_errors(self, filename):
self.fuzz_test_with_css(filename)

def __markup(self, filename):
if not filename.endswith(self.TESTCASE_SUFFIX):
filename += self.TESTCASE_SUFFIX
Expand Down
13 changes: 13 additions & 0 deletions ext/bs4/tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,16 @@ def test_customization(self):
)
assert soup.a['class'] == 'foo'
assert soup.a['id'] == ['bar']

def test_hidden_tag_is_invisible(self):
# Setting .hidden on a tag makes it invisible in output, but
# leaves its contents visible.
#
# This is not a documented or supported feature of Beautiful
# Soup (e.g. NavigableString doesn't support .hidden even
# though it could), but some people use it and it's not
# hurting anything to verify that it keeps working.
#
soup = self.soup('<div id="1"><span id="2">a string</span></div>')
soup.span.hidden = True
assert '<div id="1">a string</div>' == str(soup.div)
2 changes: 1 addition & 1 deletion ext/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ext | **`adba`** | pymedusa/[37b0c74](https://github.com/pymedusa/adba/tree/37b0
ext | `appdirs` | [1.4.3](https://pypi.org/project/appdirs/1.4.3/) | `simpleanidb`, `subliminal` (cli only) | File: `appdirs.py`
ext | `attrs` | [18.2.0](https://pypi.org/project/attrs/18.2.0/) | `imdbpie` | Module: `attr`
ext | **`babelfish`** | [0.6.0](https://pypi.org/project/babelfish/0.6.0/) | **`medusa`**, `guessit`, `knowit`, `subliminal` | -
ext | `beautifulsoup4` | [4.12.2](https://pypi.org/project/beautifulsoup4/4.12.2/) | **`medusa`**, `subliminal` | Module: `bs4`
ext | `beautifulsoup4` | [4.12.3](https://pypi.org/project/beautifulsoup4/4.12.3/) | **`medusa`**, `subliminal` | Module: `bs4`
ext | `bencode.py` | [4.0.0](https://pypi.org/project/bencode.py/4.0.0/) | **`medusa`** | Modules: `bencodepy`, `bencode`<br>Monkey-patched, see `medusa/init/__init__.py`
ext | **`boto`** | [2.48.0](https://pypi.org/project/boto/2.48.0/) | `imdbpie` | -
ext | `CacheControl` | [0.13.1](https://pypi.org/project/CacheControl/0.13.1/) | **`medusa`** | Module: `cachecontrol`
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
adba @ https://codeload.github.com/pymedusa/adba/tar.gz/37b0c74e76b40b3dbde29e71da75a1808eb121de
babelfish==0.6.0
beautifulsoup4==4.12.2
beautifulsoup4==4.12.3
bencode.py==4.0.0
CacheControl==0.13.1
certifi==2023.7.22
Expand Down
Loading