From 663d749c5865dff3d8a8d674414beb50583bad42 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 25 Apr 2022 22:10:00 +0800 Subject: [PATCH 01/22] update HttpClient to error out by default when receiving 400-5xx responses --- docs/advanced/additional-requests.rst | 9 +++-- tests/test_requests.py | 21 ++++++++++- web_poet/requests.py | 54 ++++++++++++++++++++++++--- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 85fc6cf8..8b55f96a 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -750,9 +750,12 @@ doc section) explicitly raises when implementing it for **web-poet** should be For frameworks that implement and use **web-poet**, exceptions that ocurred when handling the additional requests like `connection errors`, `time outs`, `TLS errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` -by raising it explicitly. For responses that are successful but don't have a ``200`` -**status code**, this exception shouldn't be raised at all. Instead, the -:class:`~.HttpResponse` should simply reflect the response contents as is. +by raising it explicitly. + +For responses that are not really errors like in the 100-3xx status code range, +this exception shouldn't be raised at all. However, for responses with status +codes in the 400-5xx range, the implementing framework should properly raise +the exception. This is to ensure that Page Objects having additional requests using the :class:`~.HttpClient` is able to work in any type of HTTP downloader implementation. diff --git a/tests/test_requests.py b/tests/test_requests.py index 13ca72de..f01233d5 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,7 +1,7 @@ from unittest import mock import pytest -from web_poet.exceptions import RequestBackendError +from web_poet.exceptions import RequestBackendError, HttpRequestError from web_poet.page_inputs import ( HttpRequest, HttpResponse, @@ -81,6 +81,25 @@ async def test_http_client_single_requests(async_mock): ] +@pytest.mark.asyncio +@pytest.mark.parametrize("method_name", ["request", "get", "post"]) +async def test_http_client_request_status_err(method_name): + client = HttpClient(async_mock) + + # Simulate 500 Internal Server Error responses + async def stub_request_downloader(*args, **kwargs): + async def stub(req): + return HttpResponse(req.url, body=b"", status=500) + return await stub(*args, **kwargs) + client._request_downloader = stub_request_downloader + + method = getattr(client, method_name) + + await method("url", allow_status=[500]) + with pytest.raises(HttpRequestError): + await method("url") + + @pytest.mark.asyncio async def test_http_client_keyword_enforcing(async_mock): """Only keyword args are allowed after the url param.""" diff --git a/web_poet/requests.py b/web_poet/requests.py index 2030680c..2b0e167c 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -21,7 +21,8 @@ HttpRequestHeaders, HttpRequestBody, ) -from web_poet.exceptions import RequestBackendError +from web_poet.exceptions import RequestBackendError, HttpRequestError +from web_poet.utils import as_list logger = logging.getLogger(__name__) @@ -82,6 +83,23 @@ class HttpClient: def __init__(self, request_downloader: Callable = None): self._request_downloader = request_downloader or _perform_request + @staticmethod + def _handle_status(response: HttpResponse, allow_status: List[int]) -> None: + if ( + response.status is None + or response.status < 400 + or response.status in allow_status + ): + return + + if 400 <= response.status < 500: + kind = "Client" + elif 500 <= response.status < 600: + kind = "Server" + + msg = f"{response.status} {kind} Error for {response.url}" + raise HttpRequestError(msg) + async def request( self, url: str, @@ -89,11 +107,16 @@ async def request( method: str = "GET", headers: Optional[_Headers] = None, body: Optional[_Body] = None, + allow_status: List[int] = None, ) -> HttpResponse: """This is a shortcut for creating a :class:`~.HttpRequest` instance and executing that request. - A :class:`~.HttpResponse` instance should then be returned. + A :class:`~.HttpResponse` instance should then be returned for successful + responses in the 100-3xx status code range. Otherwise, an exception of + type :class:`web_poet.exceptions.http.HttpRequestError` will be raised. + This behavior can be changed by suppressing the exceptions on select + status codes using the ``allow_status`` param. .. warning:: By convention, the request implementation supplied optionally to @@ -104,13 +127,25 @@ async def request( headers = headers or {} body = body or b"" req = HttpRequest(url=url, method=method, headers=headers, body=body) - return await self.execute(req) - async def get(self, url: str, *, headers: Optional[_Headers] = None) -> HttpResponse: + response = await self.execute(req) + self._handle_status(response, allow_status=as_list(allow_status)) + + return response + + async def get( + self, + url: str, + *, + headers: Optional[_Headers] = None, + allow_status: List[int] = None, + ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but peforming a ``GET`` request. """ - return await self.request(url=url, method="GET", headers=headers) + return await self.request( + url=url, method="GET", headers=headers, allow_status=allow_status + ) async def post( self, @@ -118,11 +153,18 @@ async def post( *, headers: Optional[_Headers] = None, body: Optional[_Body] = None, + allow_status: List[int] = None, ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but performing a ``POST`` request. """ - return await self.request(url=url, method="POST", headers=headers, body=body) + return await self.request( + url=url, + method="POST", + headers=headers, + body=body, + allow_status=allow_status, + ) async def execute(self, request: HttpRequest) -> HttpResponse: """Accepts a single instance of :class:`~.HttpRequest` and executes it From 0e898378133ddc1cd41110e4f760e17c9fe6e5b4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 26 Apr 2022 13:45:40 +0800 Subject: [PATCH 02/22] enable '*' wildcards in allow_status param --- tests/test_requests.py | 43 +++++++++++++++++++++++++++++++++++------- web_poet/requests.py | 41 +++++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index f01233d5..f920d457 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -81,24 +81,53 @@ async def test_http_client_single_requests(async_mock): ] +@pytest.fixture +def client_with_status(): + def _param_wrapper(status_code: int): + async def stub_request_downloader(*args, **kwargs): + async def stub(req): + return HttpResponse(req.url, body=b"", status=status_code) + return await stub(*args, **kwargs) + return stub_request_downloader + return _param_wrapper + + @pytest.mark.asyncio @pytest.mark.parametrize("method_name", ["request", "get", "post"]) -async def test_http_client_request_status_err(method_name): +async def test_http_client_allow_status(client_with_status, method_name): client = HttpClient(async_mock) # Simulate 500 Internal Server Error responses - async def stub_request_downloader(*args, **kwargs): - async def stub(req): - return HttpResponse(req.url, body=b"", status=500) - return await stub(*args, **kwargs) - client._request_downloader = stub_request_downloader + client._request_downloader = client_with_status(500) method = getattr(client, method_name) - await method("url", allow_status=[500]) + # Should handle single and multiple values + await method("url", allow_status=500) + await method("url", allow_status=[500, 503]) + + # As well as strings + await method("url", allow_status="500") + await method("url", allow_status=["500", "503"]) + with pytest.raises(HttpRequestError): await method("url") + with pytest.raises(HttpRequestError): + await method("url", allow_status=408) + + # Simulate 408 Request Timeout responses + client._request_downloader = client_with_status(408) + + # As long as "*" is present, then no errors would be raised + await method("url", allow_status="*") + await method("url", allow_status=[400, "*"]) + await method("url", allow_status=[400, 408, "*"]) + + # Globbing isn't supported + with pytest.raises(HttpRequestError): + await method("url", allow_status="4*") + @pytest.mark.asyncio async def test_http_client_keyword_enforcing(async_mock): diff --git a/web_poet/requests.py b/web_poet/requests.py index 2b0e167c..979a6b86 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -29,6 +29,7 @@ _StrMapping = Dict[str, str] _Headers = Union[_StrMapping, HttpRequestHeaders] _Body = Union[bytes, HttpRequestBody] +_Status = Union[str, int] # Frameworks that wants to support additional requests in ``web-poet`` should @@ -84,11 +85,21 @@ def __init__(self, request_downloader: Callable = None): self._request_downloader = request_downloader or _perform_request @staticmethod - def _handle_status(response: HttpResponse, allow_status: List[int]) -> None: + def _handle_status( + response: HttpResponse, + *, + allow_status: List[_Status] = None, + ) -> None: + allow_status_normalized = list(map(str, as_list(allow_status))) + allow_all_status = any( + [True for s in allow_status_normalized if "*" == s.strip()] + ) + if ( - response.status is None + allow_all_status + or response.status is None # allows serialized responses from tests or response.status < 400 - or response.status in allow_status + or str(response.status) in allow_status_normalized ): return @@ -107,16 +118,21 @@ async def request( method: str = "GET", headers: Optional[_Headers] = None, body: Optional[_Body] = None, - allow_status: List[int] = None, + allow_status: List[_Status] = None, ) -> HttpResponse: - """This is a shortcut for creating a :class:`~.HttpRequest` instance and executing - that request. + """This is a shortcut for creating a :class:`~.HttpRequest` instance and + executing that request. A :class:`~.HttpResponse` instance should then be returned for successful responses in the 100-3xx status code range. Otherwise, an exception of type :class:`web_poet.exceptions.http.HttpRequestError` will be raised. + This behavior can be changed by suppressing the exceptions on select - status codes using the ``allow_status`` param. + status codes using the ``allow_status`` param: + + * Passing status code values would not raise the exception when it + occurs. This would return the response as-is. + * Passing a "*" value would basically allow any status codes. .. warning:: By convention, the request implementation supplied optionally to @@ -129,7 +145,7 @@ async def request( req = HttpRequest(url=url, method=method, headers=headers, body=body) response = await self.execute(req) - self._handle_status(response, allow_status=as_list(allow_status)) + self._handle_status(response, allow_status=allow_status) return response @@ -138,13 +154,16 @@ async def get( url: str, *, headers: Optional[_Headers] = None, - allow_status: List[int] = None, + allow_status: List[_Status] = None, ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but peforming a ``GET`` request. """ return await self.request( - url=url, method="GET", headers=headers, allow_status=allow_status + url=url, + method="GET", + headers=headers, + allow_status=allow_status, ) async def post( @@ -153,7 +172,7 @@ async def post( *, headers: Optional[_Headers] = None, body: Optional[_Body] = None, - allow_status: List[int] = None, + allow_status: List[_Status] = None, ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but performing a ``POST`` request. From 263b28bec9414316b455d4a7c57d06278ff04912 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 26 Apr 2022 14:15:17 +0800 Subject: [PATCH 03/22] include allow_status in execute() --- tests/test_requests.py | 28 ++++++++++++++++------------ web_poet/requests.py | 24 +++++++++++++++++------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index f920d457..504aba19 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -93,8 +93,8 @@ async def stub(req): @pytest.mark.asyncio -@pytest.mark.parametrize("method_name", ["request", "get", "post"]) -async def test_http_client_allow_status(client_with_status, method_name): +@pytest.mark.parametrize("method_name", ["request", "get", "post", "execute"]) +async def test_http_client_allow_status(async_mock, client_with_status, method_name): client = HttpClient(async_mock) # Simulate 500 Internal Server Error responses @@ -102,31 +102,35 @@ async def test_http_client_allow_status(client_with_status, method_name): method = getattr(client, method_name) + url_or_request = "url" + if method_name == "execute": + url_or_request = HttpRequest(url_or_request) + # Should handle single and multiple values - await method("url", allow_status=500) - await method("url", allow_status=[500, 503]) + await method(url_or_request, allow_status=500) + await method(url_or_request, allow_status=[500, 503]) # As well as strings - await method("url", allow_status="500") - await method("url", allow_status=["500", "503"]) + await method(url_or_request, allow_status="500") + await method(url_or_request, allow_status=["500", "503"]) with pytest.raises(HttpRequestError): - await method("url") + await method(url_or_request) with pytest.raises(HttpRequestError): - await method("url", allow_status=408) + await method(url_or_request, allow_status=408) # Simulate 408 Request Timeout responses client._request_downloader = client_with_status(408) # As long as "*" is present, then no errors would be raised - await method("url", allow_status="*") - await method("url", allow_status=[400, "*"]) - await method("url", allow_status=[400, 408, "*"]) + await method(url_or_request, allow_status="*") + await method(url_or_request, allow_status=[400, "*"]) + await method(url_or_request, allow_status=[400, 408, "*"]) # Globbing isn't supported with pytest.raises(HttpRequestError): - await method("url", allow_status="4*") + await method(url_or_request, allow_status="4*") @pytest.mark.asyncio diff --git a/web_poet/requests.py b/web_poet/requests.py index 979a6b86..8c0b5fdb 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -143,10 +143,7 @@ async def request( headers = headers or {} body = body or b"" req = HttpRequest(url=url, method=method, headers=headers, body=body) - - response = await self.execute(req) - self._handle_status(response, allow_status=allow_status) - + response = await self.execute(req, allow_status=allow_status) return response async def get( @@ -185,14 +182,27 @@ async def post( allow_status=allow_status, ) - async def execute(self, request: HttpRequest) -> HttpResponse: + async def execute( + self, request: HttpRequest, *, allow_status: List[_Status] = None + ) -> HttpResponse: """Accepts a single instance of :class:`~.HttpRequest` and executes it using the request implementation configured in the :class:`~.HttpClient` instance. - This returns a single :class:`~.HttpResponse`. + A :class:`~.HttpResponse` instance should then be returned for successful + responses in the 100-3xx status code range. Otherwise, an exception of + type :class:`web_poet.exceptions.http.HttpRequestError` will be raised. + + This behavior can be changed by suppressing the exceptions on select + status codes using the ``allow_status`` param: + + * Passing status code values would not raise the exception when it + occurs. This would return the response as-is. + * Passing a "*" value would basically allow any status codes. """ - return await self._request_downloader(request) + response = await self._request_downloader(request) + self._handle_status(response, allow_status=allow_status) + return response async def batch_execute( self, *requests: HttpRequest, return_exceptions: bool = False From b29c81545e50e7762858fb14454f019175d141d4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 26 Apr 2022 14:47:35 +0800 Subject: [PATCH 04/22] include allow_status in batch_execute() --- tests/test_requests.py | 47 +++++++++++++++++++++++++++++++++++------- web_poet/requests.py | 16 ++++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index 504aba19..59d2cb60 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -108,7 +108,9 @@ async def test_http_client_allow_status(async_mock, client_with_status, method_n # Should handle single and multiple values await method(url_or_request, allow_status=500) - await method(url_or_request, allow_status=[500, 503]) + response = await method(url_or_request, allow_status=[500, 503]) + assert isinstance(response, HttpResponse) + assert response.status == 500 # As well as strings await method(url_or_request, allow_status="500") @@ -120,17 +122,13 @@ async def test_http_client_allow_status(async_mock, client_with_status, method_n with pytest.raises(HttpRequestError): await method(url_or_request, allow_status=408) - # Simulate 408 Request Timeout responses - client._request_downloader = client_with_status(408) - # As long as "*" is present, then no errors would be raised await method(url_or_request, allow_status="*") - await method(url_or_request, allow_status=[400, "*"]) - await method(url_or_request, allow_status=[400, 408, "*"]) + await method(url_or_request, allow_status=[500, "*"]) # Globbing isn't supported with pytest.raises(HttpRequestError): - await method(url_or_request, allow_status="4*") + await method(url_or_request, allow_status="5*") @pytest.mark.asyncio @@ -211,3 +209,38 @@ async def test_http_client_batch_execute_with_exception_raised(client_that_errs) ] with pytest.raises(ValueError): await client_that_errs.batch_execute(*requests) + + +@pytest.mark.asyncio +async def test_http_client_batch_execute_allow_status(async_mock, client_with_status): + client = HttpClient(async_mock) + + # Simulate 500 Internal Server Error responses + client._request_downloader = client_with_status(400) + + requests = [HttpRequest("url-1"), HttpRequest("url-2"), HttpRequest("url-3")] + + await client.batch_execute(*requests, allow_status=400) + await client.batch_execute(*requests, allow_status=[400, 403]) + await client.batch_execute(*requests, allow_status="400") + responses = await client.batch_execute(*requests, allow_status=["400", "403"]) + + assert all([isinstance(r, HttpResponse) for r in responses]) + assert all([r.status == 400 for r in responses]) + + with pytest.raises(HttpRequestError): + await client.batch_execute(*requests) + + with pytest.raises(HttpRequestError): + await client.batch_execute(*requests, allow_status=408) + + await client.batch_execute(*requests, return_exceptions=True, allow_status=400) + await client.batch_execute(*requests, return_exceptions=True, allow_status=[400, 403]) + await client.batch_execute(*requests, return_exceptions=True, allow_status="400") + await client.batch_execute(*requests, return_exceptions=True, allow_status=["400", "403"]) + + responses = await client.batch_execute(*requests, return_exceptions=True) + assert all([isinstance(r, HttpRequestError) for r in responses]) + + responses = await client.batch_execute(*requests, return_exceptions=True, allow_status=408) + assert all([isinstance(r, HttpRequestError) for r in responses]) diff --git a/web_poet/requests.py b/web_poet/requests.py index 8c0b5fdb..e09590e3 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -205,7 +205,10 @@ async def execute( return response async def batch_execute( - self, *requests: HttpRequest, return_exceptions: bool = False + self, + *requests: HttpRequest, + return_exceptions: bool = False, + allow_status: List[_Status] = None, ) -> List[Union[HttpResponse, Exception]]: """Similar to :meth:`~.HttpClient.execute` but accepts a collection of :class:`~.HttpRequest` instances that would be batch executed. @@ -220,9 +223,18 @@ async def batch_execute( successful :class:`~.HttpResponse`. This enables salvaging any usable responses despite any possible failures. This can be done by setting ``True`` to the ``return_exceptions`` parameter. + + Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpRequestError` + will be raised for responses with status codes in the 400-5xx range. + The ``allow_status`` parameter could be used the same way here to prevent + these exceptions from being raised. + + You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. + However, it would be returning :class:`web_poet.exceptions.http.HttpRequestError` + instead of :class:`~.HttpResponse`. """ - coroutines = [self._request_downloader(r) for r in requests] + coroutines = [self.execute(r, allow_status=allow_status) for r in requests] responses = await asyncio.gather( *coroutines, return_exceptions=return_exceptions ) From 3ebb2973aaaea79bff7b9f4f033628233b9d8b87 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 12:15:40 +0800 Subject: [PATCH 05/22] reorganize the docs for additional requests --- docs/advanced/additional-requests.rst | 191 ++++++++++++++------------ 1 file changed, 106 insertions(+), 85 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 8b55f96a..263996b7 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -163,8 +163,8 @@ Let's check out an example to see its internals: print(response.json()) # {'data': 'value 👍'} Despite what the example above showcases, you won't be typically defining -:class:`~.HttpResponse` yourself as it's the implementing framework that's -responsible for it (see :ref:`advanced-downloader-impl`). Nonetheless, it's +:class:`~.HttpResponse` yourself as it's the implementing framework (see +:ref:`framework-expectations`) that's responsible for it. Nonetheless, it's important to understand its underlying structure in order to better access its methods. @@ -557,15 +557,13 @@ The key takeaways for this example are: first response from a group of requests as early as possible. However, the order could be shuffled. +.. _`exception-handling`: -Exception Handling -================== - -Overview --------- +Handling Exceptions in PO +========================= Let's have a look at how we could handle exceptions when performing additional -requests inside a Page Objects. For this example, let's improve the code snippet +requests inside Page Objects. For this example, let's improve the code snippet from the previous subsection named: :ref:`httpclient-get-example`. .. code-block:: python @@ -740,88 +738,26 @@ exceptions are included in it. If so, we're simply logging it down and ignoring it. In this way, perfectly good responses can still be processed through. -Behind the curtains -------------------- - -All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` -doc section) explicitly raises when implementing it for **web-poet** should be -:class:`web_poet.exceptions.http.HttpRequestError` *(or a subclass from it)*. - -For frameworks that implement and use **web-poet**, exceptions that ocurred when -handling the additional requests like `connection errors`, `time outs`, `TLS -errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` -by raising it explicitly. - -For responses that are not really errors like in the 100-3xx status code range, -this exception shouldn't be raised at all. However, for responses with status -codes in the 400-5xx range, the implementing framework should properly raise -the exception. - -This is to ensure that Page Objects having additional requests using the -:class:`~.HttpClient` is able to work in any type of HTTP downloader implementation. - -For example, in Python, the common HTTP libraries have different types of base -exceptions when something has ocurred: - - * `aiohttp.ClientError `_ - * `requests.RequestException `_ - * `urllib.error.HTTPError `_ - -Imagine if Page Objects are **expected** to work in `different` backend implementations -like the ones above, then it would cause the code to look like: - -.. code-block:: python - - import attrs - import web_poet - - import aiohttp - import requests - import urllib - - - @attrs.define - class SomePage(web_poet.ItemWebPage): - http_client: web_poet.HttpClient +.. _framework-expectations: - async def to_item(self): - try: - response = await self.http_client.get("...") - except (aiohttp.ClientError, requests.RequestException, urllib.error.HTTPError): - # handle the error here +Framework Expectations +====================== -Such code could turn messy in no time especially when the number of HTTP backends -that Page Objects **should support** are steadily increasing. This means that Page -Objects aren't truly portable in different types of frameworks or environments. -Rather, they're only limited to work in the specific framework they're supported. +In the earlier sections, the tutorial was primarily focused on how Page Object +developers could use additional requests using the **web-poet**'s built-in +functionalities. However, as the docs have repeatedly mentioned, **web-poet** +doesn't know how to execute any of these HTTP requests at all. It would be +up to the framework that's handling **web-poet** to do so. -In order for Page Objects to easily work in different Downloader Implementations, -the framework that implements the HTTP Downloader backend should be able to raise -exceptions from the :mod:`web_poet.exceptions.http` module in lieu of the backend -specific ones `(e.g. aiohttp, requests, urllib, etc.)`. - -This makes the code much simpler: - -.. code-block:: python - - import attrs - import web_poet - - - @attrs.define - class SomePage(web_poet.ItemWebPage): - http_client: web_poet.HttpClient - - async def to_item(self): - try: - response = await self.http_client.get("...") - except web_poet.exceptions.HttpRequestError: - # handle the error here +In this section, we'll explore the guidelines for how frameworks should use +**web-poet**. If you're a Page Object developer, you can skip this part as it +mostly discusses the internals. However, reading through this section could +render a better understanding of **web-poet** as a whole. .. _advanced-downloader-impl: Downloader Implementation -========================= +------------------------- Please note that on its own, :class:`~.HttpClient` doesn't do anything. It doesn't know how to execute the request on its own. Thus, for frameworks or projects @@ -836,7 +772,7 @@ HTTP downloader implementation in two ways: .. _setup-contextvars: 1. Context Variable -------------------- +******************* :mod:`contextvars` is natively supported in :mod:`asyncio` in order to set and access context-aware values. This means that the framework using **web-poet** @@ -887,7 +823,7 @@ Setting this up would allow access to the request implementation in a 2. Dependency Injection ------------------------ +*********************** The framework using **web-poet** might be using other libraries which doesn't have a full support to :mod:`contextvars` `(e.g. Twisted)`. With that, an @@ -922,3 +858,88 @@ an :class:`~.HttpClient` instance: From the code sample above, we can see that every time an :class:`~.HttpClient` is created for Page Objects needing an ``http_client``, the specific **request implementation** from a given framework is injected to it. + +Exception Handling +------------------ + +In the previous :ref:`exception-handling` section, we can see how Page Object +developers could use the exception classes built inside **web-poet** to handle +various ways additional requests may fail. In this section, we'll see the +rationale and ways the framework should be able to do that. + +All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` +doc section) explicitly raises when implementing it for **web-poet** should be +:class:`web_poet.exceptions.http.HttpRequestError` *(or a subclass from it)*. + +For frameworks that implement and use **web-poet**, exceptions that ocurred when +handling the additional requests like `connection errors`, `time outs`, `TLS +errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` +by raising it explicitly. + +For responses that are not really errors like in the 100-3xx status code range, +this exception shouldn't be raised at all. For responses with status codes in +the 400-5xx range, the **web-poet** raises the exception. However, the implementing +framework could override which status codes to allow using **web-poet**'s +``allow_status`` parameter in the :class:`~.HttpClient` methods. (see +:ref:`framework-expectations`). + +This is to ensure that Page Objects having additional requests using the +:class:`~.HttpClient` is able to work in any type of HTTP downloader implementation. + +For example, in Python, the common HTTP libraries have different types of base +exceptions when something has ocurred: + + * `aiohttp.ClientError `_ + * `requests.RequestException `_ + * `urllib.error.HTTPError `_ + +Imagine if Page Objects are **expected** to work in `different` backend implementations +like the ones above, then it would cause the code to look like: + +.. code-block:: python + + import attrs + import web_poet + + import aiohttp + import requests + import urllib + + + @attrs.define + class SomePage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + + async def to_item(self): + try: + response = await self.http_client.get("...") + except (aiohttp.ClientError, requests.RequestException, urllib.error.HTTPError): + # handle the error here + +Such code could turn messy in no time especially when the number of HTTP backends +that Page Objects **should support** are steadily increasing. This means that Page +Objects aren't truly portable in different types of frameworks or environments. +Rather, they're only limited to work in the specific framework they're supported. + +In order for Page Objects to easily work in different Downloader Implementations, +the framework that implements the HTTP Downloader backend should be able to raise +exceptions from the :mod:`web_poet.exceptions.http` module in lieu of the backend +specific ones `(e.g. aiohttp, requests, urllib, etc.)`. + +This makes the code much simpler: + +.. code-block:: python + + import attrs + import web_poet + + + @attrs.define + class SomePage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + + async def to_item(self): + try: + response = await self.http_client.get("...") + except web_poet.exceptions.HttpRequestError: + # handle the error here From 470bc6f417431168d0138762dbf76d12801163c4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 15:43:25 +0800 Subject: [PATCH 06/22] introduce a new base HttpError exception --- docs/advanced/additional-requests.rst | 113 ++++++++++++++++++-------- docs/api_reference.rst | 1 + tests/test_requests.py | 30 +++++-- web_poet/exceptions/http.py | 57 +++++++++++-- web_poet/requests.py | 17 ++-- 5 files changed, 161 insertions(+), 57 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 263996b7..cc08d420 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -592,9 +592,15 @@ from the previous subsection named: :ref:`httpclient-get-example`. response: web_poet.HttpResponse = await self.http_client.get( f"https://api.example.com/v2/images?id={item['product_id']}" ) - except web_poet.exceptions.HttpRequestError: + except web_poet.exceptions.HttpRequestError as err: logger.warning( - f"Unable to request images for product ID: {item['product_id']}" + f"Unable to request images for product ID '{item['product_id']}' " + f"using this request: {err.request}" + ) + except web_poet.exceptions.HttpResponseError as err: + logger.warning( + f"Received a {err.response.status} response status for product ID " + f"'{item['product_id']}' from this URL: {err.request.url}" ) else: item["images"] = response.css(".product-images img::attr(src)").getall() @@ -603,16 +609,32 @@ from the previous subsection named: :ref:`httpclient-get-example`. In this code example, the code became more resilient on cases where it wasn't possible to retrieve more images using the website's public API. It could be -due to anything like `SSL errors`, `connection errors`, etc. +due to anything like `SSL errors`, `connection errors`, `page not found`, etc. + +Using :class:`~.HttpClient` to execute requests raises exceptions with the base +class of type :class:`web_poet.exceptions.http.HttpError` irregardless of how +the HTTP Downloader is implemented. From our example above, we could've simply +used the :class:`web_poet.exceptions.http.HttpError` base error. However, it's +ambiguous in the sense that the error could originate during the HTTP Request +execution or when receiving the HTTP Response. + +A more specific :class:`web_poet.exceptions.http.HttpRequestError` exception is +raised when the :class:`~.HttpRequest` was being handled while the +:class:`web_poet.exceptions.http.HttpResponseError` is raised when receiving +a response with an HTTP error. Notice from the example that the exceptions have +the attributes like ``request`` and ``response`` which are respective instance of +:class:`~.HttpRequest` and :class:`~.HttpResponse`. Accessing them would be useful +to debug and log the problems. + +Note that :class:`web_poet.exceptions.http.HttpResponseError` only occurs when +receiving responses with status codes in the ``400-5xx`` range. However, this +behavior could be altered by using the ``allow_status`` param in the methods of +:class:`~.HttpClient`. .. note:: - For now, using :class:`~.HttpClient` to execute requests only raises exceptions - of type :class:`web_poet.exceptions.http.HttpRequestError` irregardless of how - the HTTP Downloader is implemented. - In the future, more specific exceptions which inherits from the base - :class:`web_poet.exceptions.http.HttpRequestError` exception would be available. + :class:`web_poet.exceptions.http.HttpError` exception would be available. This should enable developers writing Page Objects to properly identify what went wrong and act specifically based on the problem. @@ -653,7 +675,7 @@ For this example, let's improve the code snippet from the previous subsection na try: responses: List[web_poet.HttpResponse] = await self.http_client.batch_execute(*requests) - except web_poet.exceptions.HttpRequestError: + except web_poet.exceptions.HttpError: logger.warning( f"Unable to request for more related products for product ID: {item['product_id']}" ) @@ -693,16 +715,20 @@ For this example, let's improve the code snippet from the previous subsection na Handling exceptions using :meth:`~.HttpClient.batch_execute` remains largely the same. However, the main difference is that you might be wasting perfectly good responses just -because a single request from the batch ruined it. +because a single request from the batch ruined it. Notice that we're using the base +exception class of :class:`web_poet.exceptions.http.HttpError` to account for any +type of errors, both during the HTTP Request execution and when receiving the +response. An alternative approach would be salvaging good responses altogether. For example, you've sent out 10 :class:`~.HttpRequest` and only 1 of them had an exception during processing. You can still get the data from 9 of the :class:`~.HttpResponse` by passing the parameter ``return_exceptions=True`` to :meth:`~.HttpClient.batch_execute`. -This means that any exceptions raised during request execution are returned alongside any +This means that any exceptions raised during the HTTP execution are returned alongside any of the successful responses. The return type of :meth:`~.HttpClient.batch_execute` could -be a mixture of :class:`~.HttpResponse` and :class:`web_poet.exceptions.http.HttpRequestError`. +be a mixture of :class:`~.HttpResponse` and :class:`web_poet.exceptions.http.HttpError` +(*and its exception subclasses*). Here's an example: @@ -710,13 +736,18 @@ Here's an example: # Revised code snippet from the to_item() method - responses: List[Union[web_poet.HttpResponse, web_poet.exceptions.HttpRequestError]] = ( + requests: List[web_poet.HttpRequest] = [ + self.create_request(item["product_id"], page_num=page_num) + for page_num in range(2, self.default_pagination_limit) + ] + + responses: List[Union[web_poet.HttpResponse, web_poet.exceptions.HttpError]] = ( await self.http_client.batch_execute(*requests, return_exceptions=True) ) related_product_ids = [] for i, response in enumerate(responses): - if isinstance(response, web_poet.exceptions.HttpRequestError): + if isinstance(response, web_poet.exceptions.HttpError): logger.warning( f"Unable to request related products for product ID '{item['product_id']}' " f"using this request: {requests[i]}. Reason: {response}." @@ -756,8 +787,8 @@ render a better understanding of **web-poet** as a whole. .. _advanced-downloader-impl: -Downloader Implementation -------------------------- +Providing the Downloader +------------------------ Please note that on its own, :class:`~.HttpClient` doesn't do anything. It doesn't know how to execute the request on its own. Thus, for frameworks or projects @@ -867,24 +898,12 @@ developers could use the exception classes built inside **web-poet** to handle various ways additional requests may fail. In this section, we'll see the rationale and ways the framework should be able to do that. -All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` -doc section) explicitly raises when implementing it for **web-poet** should be -:class:`web_poet.exceptions.http.HttpRequestError` *(or a subclass from it)*. - -For frameworks that implement and use **web-poet**, exceptions that ocurred when -handling the additional requests like `connection errors`, `time outs`, `TLS -errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` -by raising it explicitly. - -For responses that are not really errors like in the 100-3xx status code range, -this exception shouldn't be raised at all. For responses with status codes in -the 400-5xx range, the **web-poet** raises the exception. However, the implementing -framework could override which status codes to allow using **web-poet**'s -``allow_status`` parameter in the :class:`~.HttpClient` methods. (see -:ref:`framework-expectations`). +Rationale +********* -This is to ensure that Page Objects having additional requests using the -:class:`~.HttpClient` is able to work in any type of HTTP downloader implementation. +Frameworks that handle **web-poet** should be able to ensure that Page Objects +having additional requests using the :class:`~.HttpClient` is able to work in any +type of HTTP downloader implementation. For example, in Python, the common HTTP libraries have different types of base exceptions when something has ocurred: @@ -917,7 +936,8 @@ like the ones above, then it would cause the code to look like: # handle the error here Such code could turn messy in no time especially when the number of HTTP backends -that Page Objects **should support** are steadily increasing. This means that Page +that Page Objects **should support** are steadily increasing. Not to mention the +plethora of exception types that HTTP libraries have. This means that Page Objects aren't truly portable in different types of frameworks or environments. Rather, they're only limited to work in the specific framework they're supported. @@ -941,5 +961,28 @@ This makes the code much simpler: async def to_item(self): try: response = await self.http_client.get("...") - except web_poet.exceptions.HttpRequestError: + except web_poet.exceptions.HttpError: # handle the error here + +Expected behavior for Exceptions +******************************** + +All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` +doc section) explicitly raises when implementing it for **web-poet** should be +:class:`web_poet.exceptions.http.HttpError` *(or a subclass from it)*. + +For frameworks that implement and use **web-poet**, exceptions that ocurred when +handling the additional requests like `connection errors`, `timeouts`, `TLS +errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` +by raising it explicitly. + +For responses that are not really errors like in the ``100-3xx`` status code range, +no exception should be raised at all. For responses with status codes in +the ``400-5xx`` range, **web-poet** raises the :class:`web_poet.exceptions.http.HttpResponseError` +exception. + +From this distinction, the framework shouldn't raise :class:`web_poet.exceptions.http.HttpResponseError` +on its own at all, since the :class:`~.HttpClient` already handles that. However, +the implementing framework could modify which status codes to allow (*meaning no +exceptions raised*) using the ``allow_status`` parameter in the +:class:`~.HttpClient` methods. diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 775ff4a2..778661e5 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -62,6 +62,7 @@ Exceptions :members: .. automodule:: web_poet.exceptions.http + :show-inheritance: :members: .. _`api-overrides`: diff --git a/tests/test_requests.py b/tests/test_requests.py index 59d2cb60..aff98846 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,7 +1,7 @@ from unittest import mock import pytest -from web_poet.exceptions import RequestBackendError, HttpRequestError +from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.page_inputs import ( HttpRequest, HttpResponse, @@ -116,19 +116,25 @@ async def test_http_client_allow_status(async_mock, client_with_status, method_n await method(url_or_request, allow_status="500") await method(url_or_request, allow_status=["500", "503"]) - with pytest.raises(HttpRequestError): + with pytest.raises(HttpResponseError) as excinfo: await method(url_or_request) + assert isinstance(excinfo.value.request, HttpRequest) + assert isinstance(excinfo.value.response, HttpResponse) - with pytest.raises(HttpRequestError): + with pytest.raises(HttpResponseError) as err: await method(url_or_request, allow_status=408) + assert isinstance(excinfo.value.request, HttpRequest) + assert isinstance(excinfo.value.response, HttpResponse) # As long as "*" is present, then no errors would be raised await method(url_or_request, allow_status="*") await method(url_or_request, allow_status=[500, "*"]) # Globbing isn't supported - with pytest.raises(HttpRequestError): + with pytest.raises(HttpResponseError) as err: await method(url_or_request, allow_status="5*") + assert isinstance(excinfo.value.request, HttpRequest) + assert isinstance(excinfo.value.response, HttpResponse) @pytest.mark.asyncio @@ -228,11 +234,15 @@ async def test_http_client_batch_execute_allow_status(async_mock, client_with_st assert all([isinstance(r, HttpResponse) for r in responses]) assert all([r.status == 400 for r in responses]) - with pytest.raises(HttpRequestError): + with pytest.raises(HttpResponseError) as excinfo: await client.batch_execute(*requests) + assert isinstance(excinfo.value.request, HttpRequest) + assert isinstance(excinfo.value.response, HttpResponse) - with pytest.raises(HttpRequestError): + with pytest.raises(HttpResponseError) as excinfo: await client.batch_execute(*requests, allow_status=408) + assert isinstance(excinfo.value.request, HttpRequest) + assert isinstance(excinfo.value.response, HttpResponse) await client.batch_execute(*requests, return_exceptions=True, allow_status=400) await client.batch_execute(*requests, return_exceptions=True, allow_status=[400, 403]) @@ -240,7 +250,11 @@ async def test_http_client_batch_execute_allow_status(async_mock, client_with_st await client.batch_execute(*requests, return_exceptions=True, allow_status=["400", "403"]) responses = await client.batch_execute(*requests, return_exceptions=True) - assert all([isinstance(r, HttpRequestError) for r in responses]) + assert all([isinstance(r, HttpResponseError) for r in responses]) + assert all([isinstance(r.request, HttpRequest) for r in responses]) + assert all([isinstance(r.response, HttpResponse) for r in responses]) responses = await client.batch_execute(*requests, return_exceptions=True, allow_status=408) - assert all([isinstance(r, HttpRequestError) for r in responses]) + assert all([isinstance(r, HttpResponseError) for r in responses]) + assert all([isinstance(r.request, HttpRequest) for r in responses]) + assert all([isinstance(r.response, HttpResponse) for r in responses]) diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 82158623..94526623 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -6,14 +6,59 @@ operations. """ +from web_poet.page_inputs import HttpResponse, HttpRequest -class HttpRequestError(IOError): - """Indicates that an exception has occurred when the HTTP Request was being - handled. - For responses that are successful but don't have a ``200`` **status code**, - this exception shouldn't be raised at all. Instead, the :class:`~.HttpResponse` - should simply reflect the response contents as is. +class HttpError(IOError): + """Indicates that an exception has occurred when handling an HTTP operation. + + This is used as a **base class** for more specific errors and could be vague + since it could denote problems either in the HTTP Request or Response. + + For more specific errors, it would be better to use :class:`.HttpRequestError` + and :class:`.HttpResponseError`. + + :param request: The :class:`~.HttpRequest` instance that was used. + :type request: HttpRequest + :param response: The :class:`~.HttpResponse` instance that was received, if + any. Note that this wouldn't exist if the problem ocurred when executing + the :class:`~.HttpRequest`. + :type response: HttpResponse + """ + + def __init__( + self, + *args, + request: HttpRequest = None, + response: HttpResponse = None, + **kwargs + ): + self.response = response + self.request = request + super().__init__(*args, **kwargs) + + +class HttpRequestError(HttpError): + """Indicates that an exception has occurred when the **HTTP Request** was + being handled. + """ + + pass + + +class HttpResponseError(HttpError): + """Indicates that an exception has occurred when the **HTTP Response** was + received. + + For responses that are in the status code ``100-3xx range``, this exception + shouldn't be raised at all. However, for responses in the ``400-5xx``, this + will be raised by **web-poet**. + + .. note:: + + Frameworks implementing **web-poet** should **NOT** raise this themselves + but rather, rely on the ``allow_status`` parameter found in the methods + of :class:`~.HttpClient`. """ pass diff --git a/web_poet/requests.py b/web_poet/requests.py index e09590e3..759092e2 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -21,7 +21,7 @@ HttpRequestHeaders, HttpRequestBody, ) -from web_poet.exceptions import RequestBackendError, HttpRequestError +from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.utils import as_list logger = logging.getLogger(__name__) @@ -87,6 +87,7 @@ def __init__(self, request_downloader: Callable = None): @staticmethod def _handle_status( response: HttpResponse, + request: HttpRequest, *, allow_status: List[_Status] = None, ) -> None: @@ -109,7 +110,7 @@ def _handle_status( kind = "Server" msg = f"{response.status} {kind} Error for {response.url}" - raise HttpRequestError(msg) + raise HttpResponseError(msg, request=request, response=response) async def request( self, @@ -125,7 +126,7 @@ async def request( A :class:`~.HttpResponse` instance should then be returned for successful responses in the 100-3xx status code range. Otherwise, an exception of - type :class:`web_poet.exceptions.http.HttpRequestError` will be raised. + type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. This behavior can be changed by suppressing the exceptions on select status codes using the ``allow_status`` param: @@ -191,7 +192,7 @@ async def execute( A :class:`~.HttpResponse` instance should then be returned for successful responses in the 100-3xx status code range. Otherwise, an exception of - type :class:`web_poet.exceptions.http.HttpRequestError` will be raised. + type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. This behavior can be changed by suppressing the exceptions on select status codes using the ``allow_status`` param: @@ -201,7 +202,7 @@ async def execute( * Passing a "*" value would basically allow any status codes. """ response = await self._request_downloader(request) - self._handle_status(response, allow_status=allow_status) + self._handle_status(response, request, allow_status=allow_status) return response async def batch_execute( @@ -224,13 +225,13 @@ async def batch_execute( responses despite any possible failures. This can be done by setting ``True`` to the ``return_exceptions`` parameter. - Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpRequestError` - will be raised for responses with status codes in the 400-5xx range. + Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpResponseError` + will be raised for responses with status codes in the ``400-5xx`` range. The ``allow_status`` parameter could be used the same way here to prevent these exceptions from being raised. You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. - However, it would be returning :class:`web_poet.exceptions.http.HttpRequestError` + However, it would be returning :class:`web_poet.exceptions.http.HttpResponseError` instead of :class:`~.HttpResponse`. """ From cc7d2582d1174121691ad3cf5ad0c22d6b152a34 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 16:18:56 +0800 Subject: [PATCH 07/22] further improve docs --- docs/advanced/additional-requests.rst | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index cc08d420..0344c420 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -890,6 +890,24 @@ From the code sample above, we can see that every time an :class:`~.HttpClient` is created for Page Objects needing an ``http_client``, the specific **request implementation** from a given framework is injected to it. +Downloader Behavior +------------------- + +The Downloader should be able to accept an instance of :class:`~.HttpRequest` +as the input and return an instance of :class:`~.HttpResponse`. This is important +in order to handle and represent generic HTTP operations. The only time that +it won't be returning :class:`~.HttpResponse` would be when it's raising exceptions +(see :ref:`framework-exception-handling`). + +The Downloader should be able to properly resolve **redirections** except when +the method is ``HEAD``. This means that the :class:`~.HttpResponse` that it'll +be rendering is already the end of the redirection trail. + +Laslty, the Downloader should also be able to fully support :mod:`asyncio` to +enable developers to perform additional requests asynchronously. + +.. _framework-exception-handling: + Exception Handling ------------------ @@ -972,9 +990,9 @@ doc section) explicitly raises when implementing it for **web-poet** should be :class:`web_poet.exceptions.http.HttpError` *(or a subclass from it)*. For frameworks that implement and use **web-poet**, exceptions that ocurred when -handling the additional requests like `connection errors`, `timeouts`, `TLS -errors`, etc should be replaced by :class:`web_poet.exceptions.http.HttpRequestError` -by raising it explicitly. +handling the additional requests like `connection errors`, `TLS errors`, etc should +be replaced by :class:`web_poet.exceptions.http.HttpRequestError` by raising it +explicitly. For responses that are not really errors like in the ``100-3xx`` status code range, no exception should be raised at all. For responses with status codes in From 38a53c9f823478f223515527fc47e5f99efd5827 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 6 May 2022 21:50:11 +0800 Subject: [PATCH 08/22] update docs to refer to async/await instead of asyncio --- docs/advanced/additional-requests.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 0344c420..20094952 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -903,8 +903,8 @@ The Downloader should be able to properly resolve **redirections** except when the method is ``HEAD``. This means that the :class:`~.HttpResponse` that it'll be rendering is already the end of the redirection trail. -Laslty, the Downloader should also be able to fully support :mod:`asyncio` to -enable developers to perform additional requests asynchronously. +Lastly, the Downloader should also be able to fully support the ``async/await`` +syntax in order to enable developers to perform additional requests asynchronously. .. _framework-exception-handling: From 7548cfc7427f2d6f7b5727396d30f78ad1d29616 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 11:26:52 +0800 Subject: [PATCH 09/22] Pull upstream changes from #37. This adjusts the code after the said PR refactored the project structure. Reference: https://github.com/scrapinghub/web-poet/pull/37 --- README.rst | 4 +- tests/test_requests.py | 8 +- web_poet/__init__.py | 6 +- web_poet/exceptions/http.py | 2 +- web_poet/page_inputs/__init__.py | 10 + web_poet/page_inputs/client.py | 207 ++++++++++++++++++ .../{page_inputs.py => page_inputs/http.py} | 15 +- web_poet/page_inputs/meta.py | 8 + web_poet/requests.py | 207 +----------------- 9 files changed, 239 insertions(+), 228 deletions(-) create mode 100644 web_poet/page_inputs/__init__.py create mode 100644 web_poet/page_inputs/client.py rename web_poet/{page_inputs.py => page_inputs/http.py} (96%) create mode 100644 web_poet/page_inputs/meta.py diff --git a/README.rst b/README.rst index 96c9441d..1e1cfedd 100644 --- a/README.rst +++ b/README.rst @@ -18,8 +18,8 @@ web-poet :target: https://codecov.io/gh/scrapinghub/web-poet :alt: Coverage report -.. image:: https://readthedocs.org/projects/web-poet/badge/?version=latest - :target: https://web-poet.readthedocs.io/en/latest/?badge=latest +.. image:: https://readthedocs.org/projects/web-poet/badge/?version=stable + :target: https://web-poet.readthedocs.io/en/stable/?badge=stable :alt: Documentation Status ``web-poet`` implements Page Object pattern for web scraping. diff --git a/tests/test_requests.py b/tests/test_requests.py index aff98846..9d3ee2e1 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -3,15 +3,13 @@ import pytest from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.page_inputs import ( + HttpClient, HttpRequest, HttpResponse, HttpRequestBody, HttpRequestHeaders ) -from web_poet.requests import ( - HttpClient, - request_backend_var, -) +from web_poet.requests import request_backend_var @pytest.fixture @@ -47,7 +45,7 @@ async def test_perform_request_from_httpclient(async_mock): async def test_http_client_single_requests(async_mock): client = HttpClient(async_mock) - with mock.patch("web_poet.requests.HttpRequest") as mock_request: + with mock.patch("web_poet.page_inputs.client.HttpRequest") as mock_request: response = await client.request("url") response.url == "url" diff --git a/web_poet/__init__.py b/web_poet/__init__.py index cd71d07d..03943893 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,10 +1,8 @@ from .pages import WebPage, ItemPage, ItemWebPage, Injectable -from .requests import ( - request_backend_var, - HttpClient, -) +from .requests import request_backend_var from .page_inputs import ( Meta, + HttpClient, HttpRequest, HttpResponse, HttpRequestHeaders, diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 94526623..0a13585f 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -6,7 +6,7 @@ operations. """ -from web_poet.page_inputs import HttpResponse, HttpRequest +from web_poet.page_inputs.http import HttpResponse, HttpRequest class HttpError(IOError): diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py new file mode 100644 index 00000000..9491a5c0 --- /dev/null +++ b/web_poet/page_inputs/__init__.py @@ -0,0 +1,10 @@ +from .meta import Meta +from .client import HttpClient +from .http import ( + HttpRequest, + HttpResponse, + HttpRequestHeaders, + HttpResponseHeaders, + HttpRequestBody, + HttpResponseBody, +) diff --git a/web_poet/page_inputs/client.py b/web_poet/page_inputs/client.py new file mode 100644 index 00000000..f7c4b1af --- /dev/null +++ b/web_poet/page_inputs/client.py @@ -0,0 +1,207 @@ +"""This module has a full support for :mod:`asyncio` that enables developers to +perform asynchronous additional requests inside of Page Objects. + +Note that the implementation to fully execute any :class:`~.Request` is not +handled in this module. With that, the framework using **web-poet** must supply +the implementation. + +You can read more about this in the :ref:`advanced-downloader-impl` documentation. +""" + +import asyncio +import logging +from typing import Optional, Dict, List, Union, Callable + +from web_poet.requests import request_backend_var, _perform_request +from web_poet.page_inputs.http import ( + HttpRequest, + HttpRequestHeaders, + HttpRequestBody, + HttpResponse, +) +from web_poet.exceptions import RequestBackendError, HttpResponseError +from web_poet.utils import as_list + +logger = logging.getLogger(__name__) + +_StrMapping = Dict[str, str] +_Headers = Union[_StrMapping, HttpRequestHeaders] +_Body = Union[bytes, HttpRequestBody] +_Status = Union[str, int] + + +class HttpClient: + """A convenient client to easily execute requests. + + By default, it uses the request implementation assigned in the + ``web_poet.request_backend_var`` which is a :mod:`contextvars` instance to + download the actual requests. However, it can easily be overridable by + providing an optional ``request_downloader`` callable. + + Providing the request implementation by dependency injection would be a good + alternative solution when you want to avoid setting up :mod:`contextvars` + like ``web_poet.request_backend_var``. + + In any case, this doesn't contain any implementation about how to execute + any requests fed into it. When setting that up, make sure that the downloader + implementation returns a :class:`~.HttpResponse` instance. + """ + + def __init__(self, request_downloader: Callable = None): + self._request_downloader = request_downloader or _perform_request + + @staticmethod + def _handle_status( + response: HttpResponse, + request: HttpRequest, + *, + allow_status: List[_Status] = None, + ) -> None: + allow_status_normalized = list(map(str, as_list(allow_status))) + allow_all_status = any( + [True for s in allow_status_normalized if "*" == s.strip()] + ) + + if ( + allow_all_status + or response.status is None # allows serialized responses from tests + or response.status < 400 + or str(response.status) in allow_status_normalized + ): + return + + if 400 <= response.status < 500: + kind = "Client" + elif 500 <= response.status < 600: + kind = "Server" + + msg = f"{response.status} {kind} Error for {response.url}" + raise HttpResponseError(msg, request=request, response=response) + + async def request( + self, + url: str, + *, + method: str = "GET", + headers: Optional[_Headers] = None, + body: Optional[_Body] = None, + allow_status: List[_Status] = None, + ) -> HttpResponse: + """This is a shortcut for creating a :class:`~.HttpRequest` instance and + executing that request. + + A :class:`~.HttpResponse` instance should then be returned for successful + responses in the 100-3xx status code range. Otherwise, an exception of + type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. + + This behavior can be changed by suppressing the exceptions on select + status codes using the ``allow_status`` param: + * Passing status code values would not raise the exception when it + occurs. This would return the response as-is. + * Passing a "*" value would basically allow any status codes. + + .. warning:: + By convention, the request implementation supplied optionally to + :class:`~.HttpClient` should return a :class:`~.HttpResponse` instance. + However, the underlying implementation supplied might change that, + depending on how the framework using **web-poet** implements it. + """ + headers = headers or {} + body = body or b"" + req = HttpRequest(url=url, method=method, headers=headers, body=body) + response = await self.execute(req, allow_status=allow_status) + return response + + async def get( + self, + url: str, + *, + headers: Optional[_Headers] = None, + allow_status: List[_Status] = None, + ) -> HttpResponse: + """Similar to :meth:`~.HttpClient.request` but peforming a ``GET`` + request. + """ + return await self.request( + url=url, + method="GET", + headers=headers, + allow_status=allow_status, + ) + + async def post( + self, + url: str, + *, + headers: Optional[_Headers] = None, + body: Optional[_Body] = None, + allow_status: List[_Status] = None, + ) -> HttpResponse: + """Similar to :meth:`~.HttpClient.request` but performing a ``POST`` + request. + """ + return await self.request( + url=url, + method="POST", + headers=headers, + body=body, + allow_status=allow_status, + ) + + async def execute( + self, request: HttpRequest, *, allow_status: List[_Status] = None + ) -> HttpResponse: + """Accepts a single instance of :class:`~.HttpRequest` and executes it + using the request implementation configured in the :class:`~.HttpClient` + instance. + + A :class:`~.HttpResponse` instance should then be returned for successful + responses in the 100-3xx status code range. Otherwise, an exception of + type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. + + This behavior can be changed by suppressing the exceptions on select + status codes using the ``allow_status`` param: + + * Passing status code values would not raise the exception when it + occurs. This would return the response as-is. + * Passing a "*" value would basically allow any status codes. + """ + response = await self._request_downloader(request) + self._handle_status(response, request, allow_status=allow_status) + return response + + async def batch_execute( + self, + *requests: HttpRequest, + return_exceptions: bool = False, + allow_status: List[_Status] = None, + ) -> List[Union[HttpResponse, Exception]]: + """Similar to :meth:`~.HttpClient.execute` but accepts a collection of + :class:`~.HttpRequest` instances that would be batch executed. + + The order of the :class:`~.HttpResponses` would correspond to the order + of :class:`~.HttpRequest` passed. + + If any of the :class:`~.HttpRequest` raises an exception upon execution, + the exception is raised. + + To prevent this, the actual exception can be returned alongside any + successful :class:`~.HttpResponse`. This enables salvaging any usable + responses despite any possible failures. This can be done by setting + ``True`` to the ``return_exceptions`` parameter. + + Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpResponseError` + will be raised for responses with status codes in the ``400-5xx`` range. + The ``allow_status`` parameter could be used the same way here to prevent + these exceptions from being raised. + + You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. + However, it would be returning :class:`web_poet.exceptions.http.HttpResponseError` + instead of :class:`~.HttpResponse`. + """ + + coroutines = [self.execute(r, allow_status=allow_status) for r in requests] + responses = await asyncio.gather( + *coroutines, return_exceptions=return_exceptions + ) + return responses diff --git a/web_poet/page_inputs.py b/web_poet/page_inputs/http.py similarity index 96% rename from web_poet/page_inputs.py rename to web_poet/page_inputs/http.py index 478429f9..e7ef2aca 100644 --- a/web_poet/page_inputs.py +++ b/web_poet/page_inputs/http.py @@ -14,7 +14,8 @@ from web_poet.utils import memoizemethod_noargs T_headers = TypeVar("T_headers", bound="HttpResponseHeaders") -AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]] + +_AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]] class HttpRequestBody(bytes): @@ -99,7 +100,7 @@ class HttpResponseHeaders(_HttpHeaders): @classmethod def from_bytes_dict( - cls: Type[T_headers], arg: AnyStrDict, encoding: str = "utf-8" + cls: Type[T_headers], arg: _AnyStrDict, encoding: str = "utf-8" ) -> T_headers: """An alternative constructor for instantiation where the header-value pairs could be in raw bytes form. @@ -270,13 +271,3 @@ def _auto_detect_fun(self, body: bytes) -> Optional[str]: except UnicodeError: continue return resolve_encoding(enc) - - -class Meta(dict): - """Container class that could contain any arbitrary data to be passed into - a Page Object. - - Note that this is simply a subclass of Python's ``dict``. - """ - - pass diff --git a/web_poet/page_inputs/meta.py b/web_poet/page_inputs/meta.py new file mode 100644 index 00000000..bbc61cea --- /dev/null +++ b/web_poet/page_inputs/meta.py @@ -0,0 +1,8 @@ +class Meta(dict): + """Container class that could contain any arbitrary data to be passed into + a Page Object. + + Note that this is simply a subclass of Python's ``dict``. + """ + + pass diff --git a/web_poet/requests.py b/web_poet/requests.py index 759092e2..f2006846 100644 --- a/web_poet/requests.py +++ b/web_poet/requests.py @@ -1,37 +1,14 @@ -"""This module has a full support for :mod:`asyncio` that enables developers to -perform asynchronous additional requests inside of Page Objects. - -Note that the implementation to fully execute any :class:`~.Request` is not -handled in this module. With that, the framework using **web-poet** must supply -the implementation. - -You can read more about this in the :ref:`advanced-downloader-impl` documentation. -""" - -import asyncio import logging from contextvars import ContextVar -from typing import Optional, List, Callable, Union, Dict -import attrs - -from web_poet.page_inputs import ( - HttpResponse, +from web_poet.exceptions import RequestBackendError +from web_poet.page_inputs.http import ( HttpRequest, - HttpRequestHeaders, - HttpRequestBody, + HttpResponse, ) -from web_poet.exceptions import RequestBackendError, HttpResponseError -from web_poet.utils import as_list logger = logging.getLogger(__name__) -_StrMapping = Dict[str, str] -_Headers = Union[_StrMapping, HttpRequestHeaders] -_Body = Union[bytes, HttpRequestBody] -_Status = Union[str, int] - - # Frameworks that wants to support additional requests in ``web-poet`` should # set the appropriate implementation for requesting data. request_backend_var: ContextVar = ContextVar("request_backend") @@ -62,181 +39,3 @@ async def _perform_request(request: HttpRequest) -> HttpResponse: response_data: HttpResponse = await request_backend(request) return response_data - - -class HttpClient: - """A convenient client to easily execute requests. - - By default, it uses the request implementation assigned in the - ``web_poet.request_backend_var`` which is a :mod:`contextvars` instance to - download the actual requests. However, it can easily be overridable by - providing an optional ``request_downloader`` callable. - - Providing the request implementation by dependency injection would be a good - alternative solution when you want to avoid setting up :mod:`contextvars` - like ``web_poet.request_backend_var``. - - In any case, this doesn't contain any implementation about how to execute - any requests fed into it. When setting that up, make sure that the downloader - implementation returns a :class:`~.HttpResponse` instance. - """ - - def __init__(self, request_downloader: Callable = None): - self._request_downloader = request_downloader or _perform_request - - @staticmethod - def _handle_status( - response: HttpResponse, - request: HttpRequest, - *, - allow_status: List[_Status] = None, - ) -> None: - allow_status_normalized = list(map(str, as_list(allow_status))) - allow_all_status = any( - [True for s in allow_status_normalized if "*" == s.strip()] - ) - - if ( - allow_all_status - or response.status is None # allows serialized responses from tests - or response.status < 400 - or str(response.status) in allow_status_normalized - ): - return - - if 400 <= response.status < 500: - kind = "Client" - elif 500 <= response.status < 600: - kind = "Server" - - msg = f"{response.status} {kind} Error for {response.url}" - raise HttpResponseError(msg, request=request, response=response) - - async def request( - self, - url: str, - *, - method: str = "GET", - headers: Optional[_Headers] = None, - body: Optional[_Body] = None, - allow_status: List[_Status] = None, - ) -> HttpResponse: - """This is a shortcut for creating a :class:`~.HttpRequest` instance and - executing that request. - - A :class:`~.HttpResponse` instance should then be returned for successful - responses in the 100-3xx status code range. Otherwise, an exception of - type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. - - This behavior can be changed by suppressing the exceptions on select - status codes using the ``allow_status`` param: - - * Passing status code values would not raise the exception when it - occurs. This would return the response as-is. - * Passing a "*" value would basically allow any status codes. - - .. warning:: - By convention, the request implementation supplied optionally to - :class:`~.HttpClient` should return a :class:`~.HttpResponse` instance. - However, the underlying implementation supplied might change that, - depending on how the framework using **web-poet** implements it. - """ - headers = headers or {} - body = body or b"" - req = HttpRequest(url=url, method=method, headers=headers, body=body) - response = await self.execute(req, allow_status=allow_status) - return response - - async def get( - self, - url: str, - *, - headers: Optional[_Headers] = None, - allow_status: List[_Status] = None, - ) -> HttpResponse: - """Similar to :meth:`~.HttpClient.request` but peforming a ``GET`` - request. - """ - return await self.request( - url=url, - method="GET", - headers=headers, - allow_status=allow_status, - ) - - async def post( - self, - url: str, - *, - headers: Optional[_Headers] = None, - body: Optional[_Body] = None, - allow_status: List[_Status] = None, - ) -> HttpResponse: - """Similar to :meth:`~.HttpClient.request` but performing a ``POST`` - request. - """ - return await self.request( - url=url, - method="POST", - headers=headers, - body=body, - allow_status=allow_status, - ) - - async def execute( - self, request: HttpRequest, *, allow_status: List[_Status] = None - ) -> HttpResponse: - """Accepts a single instance of :class:`~.HttpRequest` and executes it - using the request implementation configured in the :class:`~.HttpClient` - instance. - - A :class:`~.HttpResponse` instance should then be returned for successful - responses in the 100-3xx status code range. Otherwise, an exception of - type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. - - This behavior can be changed by suppressing the exceptions on select - status codes using the ``allow_status`` param: - - * Passing status code values would not raise the exception when it - occurs. This would return the response as-is. - * Passing a "*" value would basically allow any status codes. - """ - response = await self._request_downloader(request) - self._handle_status(response, request, allow_status=allow_status) - return response - - async def batch_execute( - self, - *requests: HttpRequest, - return_exceptions: bool = False, - allow_status: List[_Status] = None, - ) -> List[Union[HttpResponse, Exception]]: - """Similar to :meth:`~.HttpClient.execute` but accepts a collection of - :class:`~.HttpRequest` instances that would be batch executed. - - The order of the :class:`~.HttpResponses` would correspond to the order - of :class:`~.HttpRequest` passed. - - If any of the :class:`~.HttpRequest` raises an exception upon execution, - the exception is raised. - - To prevent this, the actual exception can be returned alongside any - successful :class:`~.HttpResponse`. This enables salvaging any usable - responses despite any possible failures. This can be done by setting - ``True`` to the ``return_exceptions`` parameter. - - Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpResponseError` - will be raised for responses with status codes in the ``400-5xx`` range. - The ``allow_status`` parameter could be used the same way here to prevent - these exceptions from being raised. - - You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. - However, it would be returning :class:`web_poet.exceptions.http.HttpResponseError` - instead of :class:`~.HttpResponse`. - """ - - coroutines = [self.execute(r, allow_status=allow_status) for r in requests] - responses = await asyncio.gather( - *coroutines, return_exceptions=return_exceptions - ) - return responses From c5244b9fe04201529f3d78733e7cdeffcf7a4d54 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 12:09:27 +0800 Subject: [PATCH 10/22] update docs to make language clear about how HttpResponseError should be raised --- docs/advanced/additional-requests.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 20094952..60286e28 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -1000,7 +1000,4 @@ the ``400-5xx`` range, **web-poet** raises the :class:`web_poet.exceptions.http. exception. From this distinction, the framework shouldn't raise :class:`web_poet.exceptions.http.HttpResponseError` -on its own at all, since the :class:`~.HttpClient` already handles that. However, -the implementing framework could modify which status codes to allow (*meaning no -exceptions raised*) using the ``allow_status`` parameter in the -:class:`~.HttpClient` methods. +on its own at all, since the :class:`~.HttpClient` already handles that. From a209b4d83932261026937c9f27122d987d97f550 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 12:10:45 +0800 Subject: [PATCH 11/22] remove response as a kwarg input for HttpError and HttpRequestError --- tests/test_exceptions.py | 21 +++++++++++++++++++++ web_poet/exceptions/http.py | 22 +++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) create mode 100644 tests/test_exceptions.py diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py new file mode 100644 index 00000000..afecb2f5 --- /dev/null +++ b/tests/test_exceptions.py @@ -0,0 +1,21 @@ +from web_poet.page_inputs import HttpRequest, HttpResponse +from web_poet.exceptions import HttpError, HttpResponseError + +URL = "https://example.com" + + +def test_http_error_init(): + + request = HttpRequest(URL) + + exc = HttpError(request=request) + assert exc.request == request + +def test_http_response_error_init(): + + request = HttpRequest(URL) + response = HttpResponse(URL, b"") + + exc = HttpResponseError(request=request, response=response) + assert exc.request == request + assert exc.response == response diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 0a13585f..07acc552 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -20,20 +20,9 @@ class HttpError(IOError): :param request: The :class:`~.HttpRequest` instance that was used. :type request: HttpRequest - :param response: The :class:`~.HttpResponse` instance that was received, if - any. Note that this wouldn't exist if the problem ocurred when executing - the :class:`~.HttpRequest`. - :type response: HttpResponse """ - def __init__( - self, - *args, - request: HttpRequest = None, - response: HttpResponse = None, - **kwargs - ): - self.response = response + def __init__(self, *args, request: HttpRequest = None, **kwargs): self.request = request super().__init__(*args, **kwargs) @@ -59,6 +48,13 @@ class HttpResponseError(HttpError): Frameworks implementing **web-poet** should **NOT** raise this themselves but rather, rely on the ``allow_status`` parameter found in the methods of :class:`~.HttpClient`. + + :param response: The :class:`~.HttpResponse` instance that was received, if + any. Note that this wouldn't exist if the problem ocurred when executing + the :class:`~.HttpRequest`. + :type response: HttpResponse """ - pass + def __init__(self, *args, response: HttpResponse = None, **kwargs): + self.response = response + super().__init__(*args, **kwargs) From 7cfe916bfa619a6158073b7dd0032d2dfa71897b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 12:14:40 +0800 Subject: [PATCH 12/22] add missing param docstrings to HttpRequestError and HttpResponseError --- web_poet/exceptions/http.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 07acc552..ba75e0db 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -30,6 +30,9 @@ def __init__(self, *args, request: HttpRequest = None, **kwargs): class HttpRequestError(HttpError): """Indicates that an exception has occurred when the **HTTP Request** was being handled. + + :param request: The :class:`~.HttpRequest` instance that was used. + :type request: HttpRequest """ pass @@ -49,6 +52,8 @@ class HttpResponseError(HttpError): but rather, rely on the ``allow_status`` parameter found in the methods of :class:`~.HttpClient`. + :param request: The :class:`~.HttpRequest` instance that was used. + :type request: HttpRequest :param response: The :class:`~.HttpResponse` instance that was received, if any. Note that this wouldn't exist if the problem ocurred when executing the :class:`~.HttpRequest`. From 50a287b84fe14d1c54fbee325fd34fc2796f2a01 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 12:15:10 +0800 Subject: [PATCH 13/22] Update web_poet/exceptions/http.py docstring Co-authored-by: Mikhail Korobov --- web_poet/exceptions/http.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index ba75e0db..56d29ad7 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -48,9 +48,10 @@ class HttpResponseError(HttpError): .. note:: - Frameworks implementing **web-poet** should **NOT** raise this themselves - but rather, rely on the ``allow_status`` parameter found in the methods - of :class:`~.HttpClient`. + Frameworks implementing **web-poet** should **NOT** raise this exception. + + This exception is raised by web-poet itself, based on ``allow_status`` + parameter found in the methods of :class:`~.HttpClient`. :param request: The :class:`~.HttpRequest` instance that was used. :type request: HttpRequest From b889fb974f935fe0c27902cfdc9cbb2b186fad0f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 15:54:13 +0800 Subject: [PATCH 14/22] update docs to follow RFC 2119 --- docs/advanced/additional-requests.rst | 42 +++++++++++++++------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 60286e28..2c0647bd 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -23,6 +23,10 @@ properly extract data for some websites. with today's websites which relies on a lot of page interactions to display its contents. +The key words "MUST”, "MUST NOT”, "REQUIRED”, "SHALL”, "SHALL NOT”, "SHOULD”, +"SHOULD NOT”, "RECOMMENDED”, "MAY”, and "OPTIONAL” in this document are to be +interpreted as described in RFC `2119 `_. + .. _`httprequest-example`: HttpRequest @@ -115,7 +119,7 @@ HttpResponse :class:`~.HttpResponse` is what comes after a :class:`~.HttpRequest` has been executed. It's typically returned by the methods from :class:`~.HttpClient` (see :ref:`httpclient` tutorial section) which holds the information regarding the response. -It's also the required input for Page Objects inheriting from the :class:`~.ItemWebPage` +It's also the REQUIRED input for Page Objects inheriting from the :class:`~.ItemWebPage` class as explained from the :ref:`from-ground-up` tutorial. .. note:: @@ -304,7 +308,7 @@ Executing a HttpRequest instance return item As the example suggests, we're performing an additional request that allows us -to extract more images in a product page that might not be otherwise be possible. +to extract more images in a product page that MAY NOT be otherwise be possible. This is because in order to do so, an additional button needs to be clicked which fetches the complete set of product images via AJAX. @@ -536,7 +540,7 @@ The key takeaways for this example are: .. tip:: The :meth:`~.HttpClient.batch_execute` method can accept different varieties - of :class:`~.HttpRequest` that might not be related with one another. For + of :class:`~.HttpRequest` that MAY NOT be related with one another. For example, it could be a mixture of ``GET`` and ``POST`` requests or even representing requests for various parts of the page altogether. @@ -635,7 +639,7 @@ behavior could be altered by using the ``allow_status`` param in the methods of In the future, more specific exceptions which inherits from the base :class:`web_poet.exceptions.http.HttpError` exception would be available. - This should enable developers writing Page Objects to properly identify what + This SHOULD enable developers writing Page Objects to properly identify what went wrong and act specifically based on the problem. Let's take another example when executing requests in batch as opposed to using @@ -714,7 +718,7 @@ For this example, let's improve the code snippet from the previous subsection na return response_page.css("#main .related-products ::attr(product-id)").getall() Handling exceptions using :meth:`~.HttpClient.batch_execute` remains largely the same. -However, the main difference is that you might be wasting perfectly good responses just +However, the main difference is that you MAY be wasting perfectly good responses just because a single request from the batch ruined it. Notice that we're using the base exception class of :class:`web_poet.exceptions.http.HttpError` to account for any type of errors, both during the HTTP Request execution and when receiving the @@ -780,7 +784,7 @@ functionalities. However, as the docs have repeatedly mentioned, **web-poet** doesn't know how to execute any of these HTTP requests at all. It would be up to the framework that's handling **web-poet** to do so. -In this section, we'll explore the guidelines for how frameworks should use +In this section, we'll explore the guidelines for how frameworks MUST use **web-poet**. If you're a Page Object developer, you can skip this part as it mostly discusses the internals. However, reading through this section could render a better understanding of **web-poet** as a whole. @@ -856,7 +860,7 @@ Setting this up would allow access to the request implementation in a 2. Dependency Injection *********************** -The framework using **web-poet** might be using other libraries which doesn't +The framework using **web-poet** MAY be using other libraries which doesn't have a full support to :mod:`contextvars` `(e.g. Twisted)`. With that, an alternative approach would be to supply the request implementation when creating an :class:`~.HttpClient` instance: @@ -893,17 +897,17 @@ implementation** from a given framework is injected to it. Downloader Behavior ------------------- -The Downloader should be able to accept an instance of :class:`~.HttpRequest` +The Downloader MUST be able to accept an instance of :class:`~.HttpRequest` as the input and return an instance of :class:`~.HttpResponse`. This is important in order to handle and represent generic HTTP operations. The only time that it won't be returning :class:`~.HttpResponse` would be when it's raising exceptions (see :ref:`framework-exception-handling`). -The Downloader should be able to properly resolve **redirections** except when +The Downloader MUST be able to properly resolve **redirections** except when the method is ``HEAD``. This means that the :class:`~.HttpResponse` that it'll be rendering is already the end of the redirection trail. -Lastly, the Downloader should also be able to fully support the ``async/await`` +Lastly, the Downloader MUST also be able to fully support the ``async/await`` syntax in order to enable developers to perform additional requests asynchronously. .. _framework-exception-handling: @@ -913,13 +917,13 @@ Exception Handling In the previous :ref:`exception-handling` section, we can see how Page Object developers could use the exception classes built inside **web-poet** to handle -various ways additional requests may fail. In this section, we'll see the -rationale and ways the framework should be able to do that. +various ways additional requests MAY fail. In this section, we'll see the +rationale and ways the framework MUST be able to do that. Rationale ********* -Frameworks that handle **web-poet** should be able to ensure that Page Objects +Frameworks that handle **web-poet** MUST be able to ensure that Page Objects having additional requests using the :class:`~.HttpClient` is able to work in any type of HTTP downloader implementation. @@ -954,13 +958,13 @@ like the ones above, then it would cause the code to look like: # handle the error here Such code could turn messy in no time especially when the number of HTTP backends -that Page Objects **should support** are steadily increasing. Not to mention the +that Page Objects SHOULD support are steadily increasing. Not to mention the plethora of exception types that HTTP libraries have. This means that Page Objects aren't truly portable in different types of frameworks or environments. Rather, they're only limited to work in the specific framework they're supported. In order for Page Objects to easily work in different Downloader Implementations, -the framework that implements the HTTP Downloader backend should be able to raise +the framework that implements the HTTP Downloader backend MUST be able to raise exceptions from the :mod:`web_poet.exceptions.http` module in lieu of the backend specific ones `(e.g. aiohttp, requests, urllib, etc.)`. @@ -986,18 +990,18 @@ Expected behavior for Exceptions ******************************** All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` -doc section) explicitly raises when implementing it for **web-poet** should be +doc section) explicitly raises when implementing it for **web-poet** MUST be :class:`web_poet.exceptions.http.HttpError` *(or a subclass from it)*. For frameworks that implement and use **web-poet**, exceptions that ocurred when -handling the additional requests like `connection errors`, `TLS errors`, etc should +handling the additional requests like `connection errors`, `TLS errors`, etc MUST be replaced by :class:`web_poet.exceptions.http.HttpRequestError` by raising it explicitly. For responses that are not really errors like in the ``100-3xx`` status code range, -no exception should be raised at all. For responses with status codes in +exception MUST NOT be raised at all. For responses with status codes in the ``400-5xx`` range, **web-poet** raises the :class:`web_poet.exceptions.http.HttpResponseError` exception. -From this distinction, the framework shouldn't raise :class:`web_poet.exceptions.http.HttpResponseError` +From this distinction, the framework MUST NOT raise :class:`web_poet.exceptions.http.HttpResponseError` on its own at all, since the :class:`~.HttpClient` already handles that. From 1401a514106514d2910bcb6dc15d4d5909d5b453 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 16:18:09 +0800 Subject: [PATCH 15/22] update _handle_status() to spell out actual response error --- tests/test_requests.py | 12 ++++++++++-- web_poet/page_inputs/client.py | 9 +++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/test_requests.py b/tests/test_requests.py index 9d3ee2e1..f5da661c 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -118,11 +118,13 @@ async def test_http_client_allow_status(async_mock, client_with_status, method_n await method(url_or_request) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) + assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for") with pytest.raises(HttpResponseError) as err: - await method(url_or_request, allow_status=408) + await method(url_or_request, allow_status=406) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) + assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for") # As long as "*" is present, then no errors would be raised await method(url_or_request, allow_status="*") @@ -133,6 +135,7 @@ async def test_http_client_allow_status(async_mock, client_with_status, method_n await method(url_or_request, allow_status="5*") assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) + assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for") @pytest.mark.asyncio @@ -236,11 +239,13 @@ async def test_http_client_batch_execute_allow_status(async_mock, client_with_st await client.batch_execute(*requests) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) + assert str(excinfo.value).startswith("400 BAD_REQUEST response for") with pytest.raises(HttpResponseError) as excinfo: - await client.batch_execute(*requests, allow_status=408) + await client.batch_execute(*requests, allow_status=406) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) + assert str(excinfo.value).startswith("400 BAD_REQUEST response for") await client.batch_execute(*requests, return_exceptions=True, allow_status=400) await client.batch_execute(*requests, return_exceptions=True, allow_status=[400, 403]) @@ -251,8 +256,11 @@ async def test_http_client_batch_execute_allow_status(async_mock, client_with_st assert all([isinstance(r, HttpResponseError) for r in responses]) assert all([isinstance(r.request, HttpRequest) for r in responses]) assert all([isinstance(r.response, HttpResponse) for r in responses]) + assert all([str(r).startswith("400 BAD_REQUEST response for") for r in responses]) responses = await client.batch_execute(*requests, return_exceptions=True, allow_status=408) assert all([isinstance(r, HttpResponseError) for r in responses]) assert all([isinstance(r.request, HttpRequest) for r in responses]) assert all([isinstance(r.response, HttpResponse) for r in responses]) + assert all([str(r).startswith("400 BAD_REQUEST response for") for r in responses]) + diff --git a/web_poet/page_inputs/client.py b/web_poet/page_inputs/client.py index f7c4b1af..4256e816 100644 --- a/web_poet/page_inputs/client.py +++ b/web_poet/page_inputs/client.py @@ -11,6 +11,7 @@ import asyncio import logging from typing import Optional, Dict, List, Union, Callable +from http import HTTPStatus from web_poet.requests import request_backend_var, _perform_request from web_poet.page_inputs.http import ( @@ -70,12 +71,8 @@ def _handle_status( ): return - if 400 <= response.status < 500: - kind = "Client" - elif 500 <= response.status < 600: - kind = "Server" - - msg = f"{response.status} {kind} Error for {response.url}" + status = HTTPStatus(response.status) + msg = f"{response.status} {status.name} response for {response.url}" raise HttpResponseError(msg, request=request, response=response) async def request( From 0eefdf872f4aa9cbf5ad9c52a29626dbbe49e666 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Sat, 7 May 2022 16:40:17 +0800 Subject: [PATCH 16/22] update docs regarding how HttpRequestError may be raised --- docs/api_reference.rst | 10 +++++++++- web_poet/page_inputs/client.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 778661e5..3b4c0f29 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -7,7 +7,15 @@ API Reference Page Inputs =========== -.. automodule:: web_poet.page_inputs +.. automodule:: web_poet.page_inputs.client + :members: + :undoc-members: + +.. automodule:: web_poet.page_inputs.http + :members: + :undoc-members: + +.. automodule:: web_poet.page_inputs.meta :members: :undoc-members: diff --git a/web_poet/page_inputs/client.py b/web_poet/page_inputs/client.py index 4256e816..6e12b6e6 100644 --- a/web_poet/page_inputs/client.py +++ b/web_poet/page_inputs/client.py @@ -87,12 +87,16 @@ async def request( """This is a shortcut for creating a :class:`~.HttpRequest` instance and executing that request. + A :class:`web_poet.exceptions.http.HttpRequestError` will be raised on + cases like *connection errors*, *connection and read timeouts*, etc. + A :class:`~.HttpResponse` instance should then be returned for successful responses in the 100-3xx status code range. Otherwise, an exception of type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. This behavior can be changed by suppressing the exceptions on select status codes using the ``allow_status`` param: + * Passing status code values would not raise the exception when it occurs. This would return the response as-is. * Passing a "*" value would basically allow any status codes. @@ -152,6 +156,9 @@ async def execute( using the request implementation configured in the :class:`~.HttpClient` instance. + A :class:`web_poet.exceptions.http.HttpRequestError` will be raised on + cases like *connection errors*, *connection and read timeouts*, etc. + A :class:`~.HttpResponse` instance should then be returned for successful responses in the 100-3xx status code range. Otherwise, an exception of type :class:`web_poet.exceptions.http.HttpResponseError` will be raised. @@ -195,6 +202,9 @@ async def batch_execute( You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. However, it would be returning :class:`web_poet.exceptions.http.HttpResponseError` instead of :class:`~.HttpResponse`. + + Lastly, a :class:`web_poet.exceptions.http.HttpRequestError` may be raised + on cases like *connection errors*, *connection and read timeouts*, etc. """ coroutines = [self.execute(r, allow_status=allow_status) for r in requests] From ea5e5dace9d6bdce05636d3962ad0e53ef814eda Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 May 2022 12:33:38 +0800 Subject: [PATCH 17/22] Update docs/advanced/additional-requests.rst Co-authored-by: Mikhail Korobov --- docs/advanced/additional-requests.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 2c0647bd..3764f395 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -903,9 +903,10 @@ in order to handle and represent generic HTTP operations. The only time that it won't be returning :class:`~.HttpResponse` would be when it's raising exceptions (see :ref:`framework-exception-handling`). -The Downloader MUST be able to properly resolve **redirections** except when -the method is ``HEAD``. This means that the :class:`~.HttpResponse` that it'll -be rendering is already the end of the redirection trail. +The Downloader MUST resolve Location-based **redirections** when the HTTP +method is not ``HEAD``. In other words, for non-``HEAD`` requests the +returned :class:`~.HttpResponse` must be the final response, after all redirects. +For ``HEAD`` requests redirects MUST NOT be resolved. Lastly, the Downloader MUST also be able to fully support the ``async/await`` syntax in order to enable developers to perform additional requests asynchronously. From afb0c0fba9549c83e1f8e3ea4e30158d7c213c57 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 May 2022 12:36:20 +0800 Subject: [PATCH 18/22] Update docs/advanced/additional-requests.rst Co-authored-by: Mikhail Korobov --- docs/advanced/additional-requests.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 3764f395..4e3f371a 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -908,7 +908,7 @@ method is not ``HEAD``. In other words, for non-``HEAD`` requests the returned :class:`~.HttpResponse` must be the final response, after all redirects. For ``HEAD`` requests redirects MUST NOT be resolved. -Lastly, the Downloader MUST also be able to fully support the ``async/await`` +Lastly, the Downloader MUST support the ``async/await`` syntax in order to enable developers to perform additional requests asynchronously. .. _framework-exception-handling: From 29aa8cc6d81fec598451e8493e91045661e9d9b8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 May 2022 12:41:59 +0800 Subject: [PATCH 19/22] revert some terms from RFC 2119 --- docs/advanced/additional-requests.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 4e3f371a..bd7b8516 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -308,7 +308,7 @@ Executing a HttpRequest instance return item As the example suggests, we're performing an additional request that allows us -to extract more images in a product page that MAY NOT be otherwise be possible. +to extract more images in a product page that might not be otherwise be possible. This is because in order to do so, an additional button needs to be clicked which fetches the complete set of product images via AJAX. @@ -718,7 +718,7 @@ For this example, let's improve the code snippet from the previous subsection na return response_page.css("#main .related-products ::attr(product-id)").getall() Handling exceptions using :meth:`~.HttpClient.batch_execute` remains largely the same. -However, the main difference is that you MAY be wasting perfectly good responses just +However, the main difference is that you may be wasting perfectly good responses just because a single request from the batch ruined it. Notice that we're using the base exception class of :class:`web_poet.exceptions.http.HttpError` to account for any type of errors, both during the HTTP Request execution and when receiving the From 94846260e6c9902edd2c446b580bdac4994e3401 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 17 May 2022 11:24:51 +0800 Subject: [PATCH 20/22] add more tests --- tests/test_exceptions.py | 24 ++++++++++++++++++++++-- tests/test_requests.py | 3 +-- web_poet/exceptions/http.py | 4 ++-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index afecb2f5..346ae70f 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -1,17 +1,37 @@ +import pytest + from web_poet.page_inputs import HttpRequest, HttpResponse -from web_poet.exceptions import HttpError, HttpResponseError +from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError URL = "https://example.com" def test_http_error_init(): + exc = HttpError() + assert exc.request is None request = HttpRequest(URL) - exc = HttpError(request=request) assert exc.request == request + +def test_http_request_error_init(): + exc = HttpRequestError() + assert exc.request is None + + request = HttpRequest(URL) + exc = HttpRequestError(request=request) + assert exc.request == request + + response = HttpResponse(URL, b"") + with pytest.raises(TypeError): + HttpRequestError(request=request, response=response) + + def test_http_response_error_init(): + exc = HttpResponseError() + assert exc.request is None + assert exc.response is None request = HttpRequest(URL) response = HttpResponse(URL, b"") diff --git a/tests/test_requests.py b/tests/test_requests.py index f5da661c..9e6fef57 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -222,7 +222,7 @@ async def test_http_client_batch_execute_with_exception_raised(client_that_errs) async def test_http_client_batch_execute_allow_status(async_mock, client_with_status): client = HttpClient(async_mock) - # Simulate 500 Internal Server Error responses + # Simulate 400 Bad Request client._request_downloader = client_with_status(400) requests = [HttpRequest("url-1"), HttpRequest("url-2"), HttpRequest("url-3")] @@ -263,4 +263,3 @@ async def test_http_client_batch_execute_allow_status(async_mock, client_with_st assert all([isinstance(r.request, HttpRequest) for r in responses]) assert all([isinstance(r.response, HttpResponse) for r in responses]) assert all([str(r).startswith("400 BAD_REQUEST response for") for r in responses]) - diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 56d29ad7..12315e92 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -49,8 +49,8 @@ class HttpResponseError(HttpError): .. note:: Frameworks implementing **web-poet** should **NOT** raise this exception. - - This exception is raised by web-poet itself, based on ``allow_status`` + + This exception is raised by web-poet itself, based on ``allow_status`` parameter found in the methods of :class:`~.HttpClient`. :param request: The :class:`~.HttpRequest` instance that was used. From 8b01cf7590f904047dba1851fb4d0ff18a43da69 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 20 May 2022 20:25:25 +0800 Subject: [PATCH 21/22] simplify init params of http exceptions --- web_poet/exceptions/http.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 12315e92..1036abd5 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -22,9 +22,9 @@ class HttpError(IOError): :type request: HttpRequest """ - def __init__(self, *args, request: HttpRequest = None, **kwargs): + def __init__(self, msg: str = None, request: HttpRequest = None): self.request = request - super().__init__(*args, **kwargs) + super().__init__(msg) class HttpRequestError(HttpError): @@ -61,6 +61,6 @@ class HttpResponseError(HttpError): :type response: HttpResponse """ - def __init__(self, *args, response: HttpResponse = None, **kwargs): + def __init__(self, msg: str = None, response: HttpResponse = None, **kwargs): self.response = response - super().__init__(*args, **kwargs) + super().__init__(msg, **kwargs) From 43bb5d9998c0e0ca51141f34a5c1b57bf2c6c628 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 20 May 2022 20:49:38 +0800 Subject: [PATCH 22/22] create default messages for Http Exceptions --- tests/test_exceptions.py | 3 +++ web_poet/exceptions/http.py | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index 346ae70f..e407cd5a 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -9,6 +9,7 @@ def test_http_error_init(): exc = HttpError() assert exc.request is None + assert exc.args request = HttpRequest(URL) exc = HttpError(request=request) @@ -18,6 +19,7 @@ def test_http_error_init(): def test_http_request_error_init(): exc = HttpRequestError() assert exc.request is None + assert exc.args request = HttpRequest(URL) exc = HttpRequestError(request=request) @@ -32,6 +34,7 @@ def test_http_response_error_init(): exc = HttpResponseError() assert exc.request is None assert exc.response is None + assert exc.args request = HttpRequest(URL) response = HttpResponse(URL, b"") diff --git a/web_poet/exceptions/http.py b/web_poet/exceptions/http.py index 1036abd5..f8f8a2f8 100644 --- a/web_poet/exceptions/http.py +++ b/web_poet/exceptions/http.py @@ -24,6 +24,8 @@ class HttpError(IOError): def __init__(self, msg: str = None, request: HttpRequest = None): self.request = request + if msg is None: + msg = f"An Error ocurred when executing this HTTP Request: {self.request}" super().__init__(msg) @@ -61,6 +63,8 @@ class HttpResponseError(HttpError): :type response: HttpResponse """ - def __init__(self, msg: str = None, response: HttpResponse = None, **kwargs): + def __init__(self, msg: str = None, response: HttpResponse = None, request: HttpRequest = None): self.response = response - super().__init__(msg, **kwargs) + if msg is None: + msg = f"Unexpected HTTP Response received: {self.response}" + super().__init__(msg, request=request)