diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..8dd2a30
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = crawlera_fetch/_utils.py # already tested in upstream Scrapy
diff --git a/README.md b/README.md
index 9195620..3fe5cdf 100644
--- a/README.md
+++ b/README.md
@@ -57,8 +57,18 @@ Crawlera middleware won't be able to handle them.

   The endpoint of a specific Crawlera instance

+* `CRAWLERA_FETCH_ON_ERROR` (type `enum.Enum` - `crawlera_fetch.OnError`,
+  default `OnError.Raise`)
+
+  What to do if an error occurs while downloading or decoding a response. Possible values are:
+  * `OnError.Raise` (raise a `crawlera_fetch.CrawleraFetchException` exception)
+  * `OnError.Warn` (log a warning and return the raw upstream response)
+  * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times)
+
 * `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`)

+  **_Deprecated, please use `CRAWLERA_FETCH_ON_ERROR`_**
+
   Whether or not the middleware will raise an exception if an error occurs while downloading
   or decoding a response. If `False`, a warning will be logged and the raw upstream response
   will be returned upon encountering an error.
@@ -76,6 +86,19 @@ Crawlera middleware won't be able to handle them.
   Default values to be sent to the Crawlera Fetch API. For instance, set to
   `{"device": "mobile"}` to render all requests with a mobile profile.

+* `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Union[Callable, str]]`, default `None`)
+
+  A callable that determines whether a request should be retried by the middleware.
+  If the setting value is a `str`, an attribute by that name will be looked up on the spider
+  object doing the crawl. The callable should accept the following arguments:
+  `response: scrapy.http.response.Response, request: scrapy.http.request.Request, spider: scrapy.spiders.Spider`.
+  If the return value evaluates to `True`, the request will be retried by the middleware.
+
+* `CRAWLERA_FETCH_RETRY_TIMES` (type `Optional[int]`, default `None`)
+
+  The maximum number of times a request should be retried.
+  If `None`, the value is taken from the `RETRY_TIMES` setting.
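For context, the retry-related settings documented above can be combined along these lines. This is a minimal sketch, not part of the patch: the spider, its `should_retry` condition, and the concrete values are illustrative; only the setting names, the `OnError` enum, and the callable signature come from the changes in this diff.

```python
# settings.py -- illustrative values only
from crawlera_fetch import OnError

CRAWLERA_FETCH_ENABLED = True
CRAWLERA_FETCH_ON_ERROR = OnError.Retry       # retry failed downloads/decodes instead of raising
CRAWLERA_FETCH_RETRY_TIMES = 5                # falls back to the RETRY_TIMES setting when unset
CRAWLERA_FETCH_SHOULD_RETRY = "should_retry"  # looked up as an attribute on the spider


# myspider.py -- hypothetical spider; the callable receives the response, the request
# and the spider, and a truthy return value makes the middleware retry the request
from scrapy import Spider


class MySpider(Spider):
    name = "my_spider"

    def should_retry(self, response, request, spider):
        # retry, for example, when a ban page comes back wrapped in a 200 response
        return b"captcha" in response.body
```

If the string lookup fails (no attribute by that name on the spider) or the setting has an unexpected type, the middleware logs a warning and disables user retries, as exercised by the new tests in `tests/test_retry.py`.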
+ ### Spider attributes * `crawlera_fetch_enabled` (type `bool`, default `False`) diff --git a/crawlera_fetch/__init__.py b/crawlera_fetch/__init__.py index f04b93b..e87e23b 100644 --- a/crawlera_fetch/__init__.py +++ b/crawlera_fetch/__init__.py @@ -1,2 +1,2 @@ from .logformatter import CrawleraFetchLogFormatter # noqa: F401 -from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy # noqa: F401 +from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy, OnError # noqa: F401 diff --git a/crawlera_fetch/_utils.py b/crawlera_fetch/_utils.py new file mode 100644 index 0000000..f33cf6e --- /dev/null +++ b/crawlera_fetch/_utils.py @@ -0,0 +1,59 @@ +from logging import Logger +from typing import Optional, Union + +from scrapy import Request, Spider +from scrapy.utils.python import global_object_name + + +# disable black formatting to avoid syntax error on py35 +# fmt: off +def _get_retry_request( + request: Request, + *, + spider: Spider, + reason: Union[str, Exception] = "unspecified", + max_retry_times: Optional[int] = None, + priority_adjust: Optional[int] = None, + logger: Logger, + stats_base_key: str # black wants to put a comma at the end, but py35 doesn't like it +) -> Optional[Request]: + # fmt: on + """ + Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 + """ + settings = spider.crawler.settings + stats = spider.crawler.stats + retry_times = request.meta.get("retry_times", 0) + 1 + if max_retry_times is None: + max_retry_times = request.meta.get("max_retry_times") + if max_retry_times is None: + max_retry_times = settings.getint("RETRY_TIMES") + if retry_times <= max_retry_times: + logger.debug( + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + new_request = request.copy() + new_request.meta["retry_times"] = retry_times + new_request.dont_filter = True + if priority_adjust is None: + priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST") + new_request.priority = request.priority + priority_adjust + + if callable(reason): + reason = reason() + if isinstance(reason, Exception): + reason = global_object_name(reason.__class__) + + stats.inc_value("{}/count".format(stats_base_key)) + stats.inc_value("{}/reason_count/{}".format(stats_base_key, reason)) + return new_request + else: + stats.inc_value("{}/max_reached".format(stats_base_key)) + logger.error( + "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + return None diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index b2a2467..b0e871f 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -4,11 +4,13 @@ import logging import os import time +import warnings from enum import Enum -from typing import Optional, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar, Union import scrapy from scrapy.crawler import Crawler +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http.request import Request from scrapy.http.response import Response from scrapy.responsetypes import responsetypes @@ -18,13 +20,21 @@ from scrapy.utils.reqser import request_from_dict, request_to_dict from w3lib.http import basic_auth_header +try: + from scrapy.downloadermiddlewares.retry import get_retry_request # available on Scrapy >= 2.5 +except ImportError: + from 
crawlera_fetch._utils import _get_retry_request as get_retry_request -logger = logging.getLogger("crawlera-fetch-middleware") +__all__ = [ + "CrawleraFetchException", + "CrawleraFetchMiddleware", + "DownloadSlotPolicy", + "OnError", +] +logger = logging.getLogger("crawlera-fetch-middleware") MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="CrawleraFetchMiddleware") - - META_KEY = "crawlera_fetch" @@ -34,6 +44,12 @@ class DownloadSlotPolicy(Enum): Default = "default" +class OnError(Enum): + Warn = "warn" + Raise = "raise" + Retry = "retry" + + class CrawleraFetchException(Exception): pass @@ -74,12 +90,57 @@ def _read_settings(self, spider: Spider) -> None: self.download_slot_policy = settings.get( "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) - self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) - def spider_opened(self, spider): + # what to do when errors happen? + self.on_error_action = None # type: Optional[OnError] + if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings: + warnings.warn( + "CRAWLERA_FETCH_RAISE_ON_ERROR is deprecated, " + "please use CRAWLERA_FETCH_ON_ERROR instead", + category=ScrapyDeprecationWarning, + stacklevel=2, + ) + if settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR"): + self.on_error_action = OnError.Raise + else: + self.on_error_action = OnError.Warn + if "CRAWLERA_FETCH_ON_ERROR" in settings: + if isinstance(settings["CRAWLERA_FETCH_ON_ERROR"], OnError): + self.on_error_action = settings["CRAWLERA_FETCH_ON_ERROR"] + else: + logger.warning( + "Invalid type for CRAWLERA_FETCH_ON_ERROR setting:" + " expected crawlera_fetch.OnError, got %s", + type(settings["CRAWLERA_FETCH_ON_ERROR"]), + ) + if self.on_error_action is None: + self.on_error_action = OnError.Raise + + # should we retry? 
+ self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") + if self.should_retry is not None: + if isinstance(self.should_retry, str): + try: + self.should_retry = getattr(spider, self.should_retry) + except AttributeError: + logger.warning( + "Could not find a '%s' callable on the spider - user retries are disabled", + self.should_retry, + ) + self.should_retry = None + elif not isinstance(self.should_retry, Callable): # type: ignore[arg-type] + logger.warning( + "Invalid type for retry function: expected Callable" + " or str, got %s - user retries are disabled", + type(self.should_retry), + ) + self.should_retry = None + self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") + if not self.retry_times: + self.retry_times = settings.getint("RETRY_TIMES") + + def spider_opened(self, spider: Spider) -> None: try: spider_attr = getattr(spider, "crawlera_fetch_enabled") except AttributeError: @@ -163,6 +224,21 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] request.meta[META_KEY] = crawlera_meta return request.replace(url=self.url, method="POST", body=body_json) + def _get_retry_request( + self, + request: Request, + reason: Union[Exception, str], + stats_base_key: str, + ) -> Optional[Request]: + return get_retry_request( + request=request, + reason=reason, + stats_base_key=stats_base_key, + spider=self.crawler.spider, + max_retry_times=self.retry_times, + logger=logger, + ) + def process_response(self, request: Request, response: Response, spider: Spider) -> Response: if not self.enabled: return response @@ -193,11 +269,19 @@ def process_response(self, request: Request, response: Response, spider: Spider) response.status, message, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=message, + stats_base_key="crawlera_fetch/retry/error", + ) + else: + raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting") try: json_response = json.loads(response.text) @@ -213,14 +297,24 @@ def process_response(self, request: Request, response: Response, spider: Spider) exc.lineno, exc.colno, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) from exc - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=exc, + stats_base_key="crawlera_fetch/retry/error", + ) + else: + raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting") - server_error = json_response.get("crawlera_error") or json_response.get("error_code") original_status = json_response.get("original_status") + self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + + server_error = json_response.get("crawlera_error") or json_response.get("error_code") request_id = json_response.get("id") or json_response.get("uncork_id") if server_error: message = json_response.get("body") or json_response.get("message") @@ -237,13 +331,19 @@ def process_response(self, request: Request, response: Response, spider: Spider) message, request_id or "unknown", ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: 
logger.warning(log_msg) return response - - self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=server_error, + stats_base_key="crawlera_fetch/retry/error", + ) + else: + raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting") crawlera_meta["upstream_response"] = { "status": response.status, @@ -260,7 +360,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) url=json_response["url"], body=resp_body, ) - return response.replace( + response = response.replace( cls=respcls, request=original_request, headers=json_response["headers"], @@ -268,6 +368,14 @@ def process_response(self, request: Request, response: Response, spider: Spider) body=resp_body, status=original_status or 200, ) + if self.should_retry is not None: + if self.should_retry(response=response, request=request, spider=spider): + return self._get_retry_request( + request=request, + reason="should-retry", + stats_base_key="crawlera_fetch/retry/should-retry", + ) + return response def _set_download_slot(self, request: Request, spider: Spider) -> None: if self.download_slot_policy == DownloadSlotPolicy.Domain: diff --git a/tests/data/responses.py b/tests/data/responses.py index 3af9ba7..4e86c7a 100644 --- a/tests/data/responses.py +++ b/tests/data/responses.py @@ -10,105 +10,241 @@ from tests.utils import foo_spider, mocked_time -test_responses = [] +def get_test_responses(include_unprocessed=True): + test_responses = [] -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, - request=Request( + test_responses.append( + { + "original": HtmlResponse( url=SETTINGS["CRAWLERA_FETCH_URL"], - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://fake.host.com"), - spider=foo_spider, - ), + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://fake.host.com"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 + ), + "expected": TextResponse( + url="https://fake.host.com", + status=123, + headers={"Fake-Header": "true"}, + body=b"""foobar""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://httpbin.org/get"), + spider=foo_spider, + ), + } + }, + ), + 
body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://httpbin.org/get", + status=200, + headers={ + "X-Crawlera-Slave": "196.16.27.20:8800", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", + "content-type": "application/json", + "content-length": "756", + "server": "gunicorn/19.9.0", + "access-control-allow-origin": "*", + "access-control-allow-credentials": "true", + }, + body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://example.org"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://example.org", + status=200, + headers={ + "X-Crawlera-Slave": "192.241.80.236:3128", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "content-encoding": "gzip", + "accept-ranges": "bytes", + "age": "108944", + "cache-control": "max-age=604800", + "content-type": "text/html; charset=UTF-8", + "date": "Fri, 24 Apr 2020 18:22:10 GMT", + "etag": '"3147526947"', + "expires": "Fri, 01 May 2020 18:22:10 GMT", + "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", + "server": "ECS (dab/4B85)", + "vary": "Accept-Encoding", + "content-length": "648", + }, + body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 + ), + } + ) + + response_body_test = b"Hello middleware test!" + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("http://httpbin.org/ip"), + spider=foo_spider, + ), + } + }, + ), + body=json.dumps( + { + "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", + "body": base64.b64encode(response_body_test).decode(), + "original_status": 200, + "url": "http://httpbin.org/ip", + "original_url": "http://httpbin.org/ip", + "headers": { + "Content-Encoding": "gzip", + "Content-Type": "text/html", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + }, } + ).encode(), + ), + "expected": HtmlResponse( + url="http://httpbin.org/ip", + status=200, + headers={ + "content-encoding": "gzip", + "content-type": "text/html", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", }, + body=response_body_test, ), - body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 - ), - "expected": TextResponse( - url="https://fake.host.com", - status=123, - headers={"Fake-Header": "true"}, - body=b"""foobar""", # noqa: E501 - ), - } -) + } + ) -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + if include_unprocessed: + unprocessed = HtmlResponse( + url="https://example.org", status=200, headers={ - "Content-Type": "application/json", + "Content-Type": "text/html", "Content-Encoding": "gzip", "Transfer-Encoding": "chunked", "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", }, + request=Request("https://example.org"), + body=b"""""", + ) + test_responses.append({"original": unprocessed, "expected": unprocessed}) + + return test_responses + + +def get_test_responses_error(): + return [ + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("https://httpbin.org/get"), + Request("https://example.org"), spider=foo_spider, ), } }, ), - body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 - ), - "expected": HtmlResponse( - url="https://httpbin.org/get", - status=200, - headers={ - "X-Crawlera-Slave": "196.16.27.20:8800", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - "content-type": "application/json", - "content-length": "756", - "server": "gunicorn/19.9.0", - "access-control-allow-origin": "*", - "access-control-allow-credentials": "true", - }, - body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 - ), - } -) - -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "X-Crawlera-Error": "bad_proxy_auth", + "Proxy-Authenticate": 'Basic realm="Crawlera"', + "Content-Length": "0", + "Date": "Mon, 04 May 2020 13:06:15 GMT", "Proxy-Connection": "close", "Connection": "close", }, + ), + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, @@ -119,65 +255,17 @@ } }, ), - body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + body=b'{"Bad": "JSON', ), - "expected": HtmlResponse( - url="https://example.org", - status=200, - headers={ - "X-Crawlera-Slave": "192.241.80.236:3128", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "content-encoding": "gzip", - "accept-ranges": "bytes", - "age": "108944", - "cache-control": "max-age=604800", - "content-type": "text/html; charset=UTF-8", - "date": "Fri, 24 Apr 2020 18:22:10 GMT", - "etag": '"3147526947"', - "expires": "Fri, 01 May 2020 18:22:10 GMT", - "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", - "server": "ECS (dab/4B85)", - "vary": "Accept-Encoding", - "content-length": "648", - }, - body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 - ), - } -) - -unprocessed = HtmlResponse( - url="https://example.org", - status=200, - headers={ - "Content-Type": "text/html", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - request=Request("https://example.org"), - body=b"""""", -) -test_responses.append({"original": unprocessed, "expected": unprocessed}) - -response_body_test = b"Hello middleware test!" -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", - }, + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("http://httpbin.org/ip"), + Request("https://example.org"), spider=foo_spider, ), } @@ -185,28 +273,15 @@ ), body=json.dumps( { - "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", - "body": base64.b64encode(response_body_test).decode(), - "original_status": 200, - "url": "http://httpbin.org/ip", - "original_url": "http://httpbin.org/ip", - "headers": { - "Content-Encoding": "gzip", - "Content-Type": "text/html", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, + "url": "https://example.org", + "original_status": 503, + "headers": {}, + "crawlera_status": "fail", + "crawlera_error": "serverbusy", + "body_encoding": "plain", + "body": "Server busy: too many outstanding requests", } - ).encode(), - ), - "expected": HtmlResponse( - url="http://httpbin.org/ip", - status=200, - headers={ - "content-encoding": "gzip", - "content-type": "text/html", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - body=response_body_test, + ), + encoding="utf8", ), - } -) + ] diff --git a/tests/test_requests.py b/tests/test_requests.py index b36423a..c7bbcc3 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,6 +1,4 @@ import json -import os -from contextlib import contextmanager from unittest.mock import patch from scrapy import Request @@ -8,19 +6,7 @@ from crawlera_fetch import DownloadSlotPolicy from tests.data.requests import get_test_requests -from tests.utils import foo_spider, get_test_middleware, mocked_time - - -@contextmanager -def shub_jobkey_env_variable(): - SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") - os.environ["SHUB_JOBKEY"] = "1/2/3" - try: - yield - finally: - del os.environ["SHUB_JOBKEY"] - if SHUB_JOBKEY_OLD: - os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD +from tests.utils import foo_spider, get_test_middleware, mocked_time, shub_jobkey_env_variable def test_process_request_disabled(): diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ddf87b..0aea814 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -3,18 +3,17 @@ import pytest from scrapy import Request from scrapy.http.response.text import TextResponse -from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import CrawleraFetchException +from crawlera_fetch.middleware import CrawleraFetchException, OnError -from tests.data.responses import test_responses -from tests.utils import foo_spider, get_test_middleware, mocked_time +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware def 
test_process_response_disabled(): middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False}) - for case in test_responses: + for case in get_test_responses(): response = case["original"] assert middleware.process_response(response.request, response, foo_spider) is response @@ -22,7 +21,7 @@ def test_process_response_disabled(): def test_process_response(): middleware = get_test_middleware() - for case in test_responses: + for case in get_test_responses(): original = case["original"] expected = case["expected"] @@ -64,89 +63,86 @@ def test_process_response_skip(): def test_process_response_error(): - response_list = [ - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - headers={ - "X-Crawlera-Error": "bad_proxy_auth", - "Proxy-Authenticate": 'Basic realm="Crawlera"', - "Content-Length": "0", - "Date": "Mon, 04 May 2020 13:06:15 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Raise}) + for response in get_test_responses_error(): + with pytest.raises(CrawleraFetchException): + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_error_default_raise(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": "invalid"}) + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Invalid type for CRAWLERA_FETCH_ON_ERROR setting: expected crawlera_fetch.OnError, got ", # noqa: E501 + ), + ) + + for response in get_test_responses_error(): + with pytest.raises(CrawleraFetchException): + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_warn(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Warn}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) + assert response is processed + + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (status: 200, X-Crawlera-Error header: bad_proxy_auth)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=b'{"Bad": "JSON', + ( + "crawlera-fetch-middleware", + "WARNING", + "Error decoding (status: 200, message: 
Unterminated string starting at, lineno: 1, colno: 9)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=json.dumps( - { - "url": "https://example.org", - "original_status": 503, - "headers": {}, - "crawlera_status": "fail", - "crawlera_error": "serverbusy", - "body_encoding": "plain", - "body": "Server busy: too many outstanding requests", - } - ), - encoding="utf8", + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)", # noqa: E501 ), - ] + ) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) - for response in response_list: + +def test_process_response_error_deprecated(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) + for response in get_test_responses_error(): with pytest.raises(CrawleraFetchException): - middleware_raise.process_response(response.request, response, foo_spider) + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) +def test_process_response_warn_deprecated(): with LogCapture() as logs: - for response in response_list: - processed = middleware_log.process_response(response.request, response, foo_spider) + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) assert response is processed logs.check_present( @@ -167,7 +163,7 @@ def test_process_response_error(): ), ) - assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert 
middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..b255020 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,108 @@ +from scrapy.http.request import Request +from scrapy.http.response import Response +from testfixtures import LogCapture + +from crawlera_fetch.middleware import OnError + +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware + + +def test_process_response_should_retry_function(): + def retry_function(response, request, spider): + return True + + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": retry_function}) + assert middleware.should_retry is not None + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 + + +def test_process_response_should_retry_spider_method(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "should_retry"}) + assert middleware.should_retry is not None + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 + + +def test_process_response_should_retry_spider_method_non_existent(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "not_found"}) + + assert middleware.should_retry is None + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Could not find a 'not_found' callable on the spider - user retries are disabled", + ) + ) + + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + expected = case["expected"] + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Response) + assert type(result) is type(expected) + assert result.url == expected.url + assert result.status == expected.status + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(base_key + "/count") is None + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None + + +def test_process_response_should_retry_invalid_type(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": 1}) + + assert middleware.should_retry is None + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Invalid type for retry function: expected Callable" + " or str, got - user retries are disabled", + ) + ) + + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + expected = case["expected"] + result = middleware.process_response(response.request, response, foo_spider) + assert 
isinstance(result, Response) + assert type(result) is type(expected) + assert result.url == expected.url + assert result.status == expected.status + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(base_key + "/count") is None + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None + + +def test_process_response_error(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Retry}) + for response in get_test_responses_error(): + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/error" + assert middleware.stats.get_value(base_key + "/count") == 3 + assert middleware.stats.get_value(base_key + "/reason_count/bad_proxy_auth") == 1 + assert middleware.stats.get_value(base_key + "/reason_count/json.decoder.JSONDecodeError") == 1 + assert middleware.stats.get_value(base_key + "/reason_count/serverbusy") == 1 diff --git a/tests/utils.py b/tests/utils.py index 6b010c9..959f023 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,7 @@ -from urllib.parse import urlparse +import os +from contextlib import contextmanager from unittest.mock import Mock +from urllib.parse import urlparse from scrapy import Spider from scrapy.utils.test import get_crawler @@ -27,6 +29,9 @@ class FooSpider(Spider): def foo_callback(self, response): pass + def should_retry(self, response, request, spider): + return True + foo_spider = FooSpider() @@ -37,6 +42,7 @@ def get_test_middleware(settings=None): foo_spider = FooSpider() foo_spider.crawler = get_crawler(FooSpider, settings_dict=settings_dict) + foo_spider.crawler.spider = foo_spider foo_spider.crawler.engine = MockEngine() middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) @@ -46,3 +52,15 @@ def get_test_middleware(settings=None): mocked_time = Mock(return_value=1234567890.123) + + +@contextmanager +def shub_jobkey_env_variable(): + SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") + os.environ["SHUB_JOBKEY"] = "1/2/3" + try: + yield + finally: + del os.environ["SHUB_JOBKEY"] + if SHUB_JOBKEY_OLD: + os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD diff --git a/tox.ini b/tox.ini index 21c7d05..45a9a5a 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,14 @@ envlist = flake8,black,typing,py deps = -rtests/requirements.txt commands = - py.test --verbose --cov=crawlera_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: crawlera_fetch tests} + py.test \ + --verbose \ + --cov=crawlera_fetch \ + --cov-config=.coveragerc \ + --cov-report=term-missing \ + --cov-report=html \ + --cov-report=xml \ + {posargs:crawlera_fetch tests} [testenv:flake8] deps = flake8>=3.7.9 @@ -21,7 +28,7 @@ commands = black --check crawlera_fetch tests [testenv:typing] deps = mypy==0.770 basepython = python3.8 -commands = mypy --ignore-missing-imports --follow-imports=skip crawlera_fetch tests +commands = mypy --show-error-codes --ignore-missing-imports --follow-imports=skip crawlera_fetch tests [testenv:py35-pinned] deps =