From d43e8e64bc21ef4a5c0f06df3733f425b7da2a45 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Tue, 8 Feb 2022 14:52:00 +0800
Subject: [PATCH 01/71] add basic integration for web-poet's support on additional requests

---
 scrapy_poet/backend.py    | 37 +++++++++++++++++++++++++++++++++++++
 scrapy_poet/middleware.py |  6 ++++++
 2 files changed, 43 insertions(+)
 create mode 100644 scrapy_poet/backend.py

diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py
new file mode 100644
index 00000000..15596d32
--- /dev/null
+++ b/scrapy_poet/backend.py
@@ -0,0 +1,37 @@
+import logging
+from contextvars import ContextVar
+
+import scrapy
+import scrapy_poet
+from web_poet.page_inputs import ResponseData
+
+
+logger = logging.getLogger(__name__)
+
+
+def enable_backend():
+    from web_poet import request_backend_var
+    request_backend_var.set(scrapy_poet_backend)
+
+
+# Binds the Scrapy Downloader here. The scrapy_poet.InjectionMiddleware will
+# update this later when the spider starts.
+scrapy_downloader_var = ContextVar("downloader")
+
+
+async def scrapy_poet_backend(url):
+    """To use this, frameworks must run: ``scrapy_poet.enable_backend()``."""
+
+    if isinstance(url, str):
+        request = scrapy.Request(url)
+
+    try:
+        scrapy_downloader = scrapy_downloader_var.get()
+        deferred = scrapy_downloader(request)
+
+        response = await scrapy.utils.defer.deferred_to_future(deferred)
+
+        return ResponseData(url=response.url, html=response.text)
+
+    except scrapy.exceptions.IgnoreRequest:
+        logger.warning(f"Additional Request Ignored: {request}")
diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index afc631f1..cca18980 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -15,6 +15,7 @@
 from .overrides import PerDomainOverridesRegistry
 from .page_input_providers import HttpResponseProvider
 from .injection import Injector
+from .backend import enable_backend, scrapy_downloader_var
 
 
 logger = logging.getLogger(__name__)
@@ -48,11 +49,16 @@ def __init__(self, crawler: Crawler) -> None:
     def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV:
         o = cls(crawler)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         return o
 
     def spider_closed(self, spider: Spider) -> None:
         self.injector.close()
 
+    def spider_opened(self, spider):
+        scrapy_downloader_var.set(spider.crawler.engine.download)
+        enable_backend()
+
     def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
         """This method checks if the request is really needed and if its
         download could be skipped by trying to infer if a ``Response``

From 9bc60d0445c54f1ebd421bb1a1888032256674a9 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Tue, 8 Feb 2022 17:22:12 +0800
Subject: [PATCH 02/71] create provider for web-poet's new HttpClient and GenericRequest

---
 scrapy_poet/backend.py              |  9 +++++----
 scrapy_poet/middleware.py           |  3 ++-
 scrapy_poet/page_input_providers.py | 11 +++++++++++
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py
index 15596d32..ba299458 100644
--- a/scrapy_poet/backend.py
+++ b/scrapy_poet/backend.py
@@ -1,9 +1,11 @@
 import logging
 from contextvars import ContextVar
 
+import attr
 import scrapy
 import scrapy_poet
 from web_poet.page_inputs import ResponseData
+from web_poet.requests import GenericRequest
 
 
 logger = logging.getLogger(__name__)
@@ -16,14 +18,13 @@ def enable_backend():
 
 # Binds the
Scrapy Downloader here. The scrapy_poet.InjectionMiddleware will # update this later when the spider starts. -scrapy_downloader_var = ContextVar("downloader") +scrapy_downloader_var: ContextVar = ContextVar("downloader") -async def scrapy_poet_backend(url): +async def scrapy_poet_backend(request: GenericRequest): """To use this, frameworks must run: ``scrapy_poet.enable_backend()``.""" - if isinstance(url, str): - request = scrapy.Request(url) + request = scrapy.Request(**attr.asdict(request)) try: scrapy_downloader = scrapy_downloader_var.get() diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index cca18980..9304568e 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -13,7 +13,7 @@ from scrapy.utils.misc import create_instance, load_object from .api import DummyResponse from .overrides import PerDomainOverridesRegistry -from .page_input_providers import HttpResponseProvider +from .page_input_providers import HttpResponseProvider, HttpClientProvider from .injection import Injector from .backend import enable_backend, scrapy_downloader_var @@ -23,6 +23,7 @@ DEFAULT_PROVIDERS = { HttpResponseProvider: 500 + HttpClientProvider: 600, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 5eac478d..570de58d 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -20,7 +20,9 @@ from scrapy.utils.request import request_fingerprint from scrapy_poet.injection_errors import MalformedProvidedClassesError +from scrapy_poet.backend import scrapy_poet_backend from web_poet import HttpResponse, HttpResponseHeaders +from web_poet.requests import HttpClient class PageObjectInputProvider: @@ -198,3 +200,12 @@ def deserialize(self, data: Any) -> Sequence[Any]: ) for response_data in data ] + + +class HttpClientProvider(PageObjectInputProvider): + """This class provides ``web_poet.requests.HttpClient`` instances.""" + provided_classes = {HttpClient} + + def __call__(self, to_provide: Set[Callable]): + """Build a ``ResponseData`` instance using a Scrapy ``Response``""" + return [HttpClient(request_downloader=scrapy_poet_backend)] From 75051769cd42dbaeba08ff97abb08802a5debcb8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 8 Feb 2022 18:22:29 +0800 Subject: [PATCH 03/71] enable tox dep in draft branch of web-poet for CI --- scrapy_poet/page_input_providers.py | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 570de58d..78e046a1 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -207,5 +207,5 @@ class HttpClientProvider(PageObjectInputProvider): provided_classes = {HttpClient} def __call__(self, to_provide: Set[Callable]): - """Build a ``ResponseData`` instance using a Scrapy ``Response``""" + """Creates an ``HttpClient``` instance using Scrapy's downloader.""" return [HttpClient(request_downloader=scrapy_poet_backend)] diff --git a/tox.ini b/tox.ini index 178fb43e..6ee99372 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ deps = pytest-cov scrapy >= 2.1.0 pytest-twisted - web-poet @ git+https://git@github.com/scrapinghub/web-poet@from_bytes#egg=web-poet + web-poet @ git+https://git@github.com/scrapinghub/web-poet@additional-requests#egg=web-poet commands = py.test \ From c30546fbbf6a80490c8e241bc369bf0333277647 Mon Sep 17 00:00:00 2001 From: 
Kevin Lloyd Bernal Date: Wed, 9 Feb 2022 13:11:37 +0800 Subject: [PATCH 04/71] use the new status and headers from ResponseData --- scrapy_poet/backend.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index ba299458..606bc10b 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -5,7 +5,7 @@ import scrapy import scrapy_poet from web_poet.page_inputs import ResponseData -from web_poet.requests import GenericRequest +from web_poet.requests import GenericRequest, RequestBackendError logger = logging.getLogger(__name__) @@ -24,7 +24,13 @@ def enable_backend(): async def scrapy_poet_backend(request: GenericRequest): """To use this, frameworks must run: ``scrapy_poet.enable_backend()``.""" - request = scrapy.Request(**attr.asdict(request)) + try: + request = scrapy.Request(**attr.asdict(request)) + except TypeError: + raise RequestBackendError( + f"The given GenericRequest isn't compatible with `scrapy.Request`. " + f"Ensure that it doesn't contain extra fields: {request}" + ) try: scrapy_downloader = scrapy_downloader_var.get() @@ -32,7 +38,12 @@ async def scrapy_poet_backend(request: GenericRequest): response = await scrapy.utils.defer.deferred_to_future(deferred) - return ResponseData(url=response.url, html=response.text) + return ResponseData( + url=response.url, + html=response.text, + status=response.status, + headers=response.headers, + ) except scrapy.exceptions.IgnoreRequest: logger.warning(f"Additional Request Ignored: {request}") From c7918eb90e66b7363f7b8eca6b51b7eccd3d3ca7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 9 Feb 2022 15:57:43 +0800 Subject: [PATCH 05/71] accept either web_poet.GenericRequest and scrapy.Request --- scrapy_poet/backend.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 606bc10b..964a61ef 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -1,5 +1,6 @@ import logging from contextvars import ContextVar +from typing import Union import attr import scrapy @@ -21,16 +22,21 @@ def enable_backend(): scrapy_downloader_var: ContextVar = ContextVar("downloader") -async def scrapy_poet_backend(request: GenericRequest): - """To use this, frameworks must run: ``scrapy_poet.enable_backend()``.""" +async def scrapy_poet_backend(request: Union[GenericRequest, scrapy.Request]): + """To use this, frameworks must run: ``scrapy_poet.enable_backend()``. - try: - request = scrapy.Request(**attr.asdict(request)) - except TypeError: - raise RequestBackendError( - f"The given GenericRequest isn't compatible with `scrapy.Request`. " - f"Ensure that it doesn't contain extra fields: {request}" - ) + The request could either be ``web_poet.GenericRequest`` or even a + ``scrapy.Request`` to give developers more fine grain control. + """ + + if not isinstance(request, scrapyRequest): + try: + request = scrapy.Request(**attr.asdict(request)) + except TypeError: + raise RequestBackendError( + f"The given GenericRequest isn't compatible with `scrapy.Request`. 
" + f"Ensure that it doesn't contain extra fields: {request}" + ) try: scrapy_downloader = scrapy_downloader_var.get() From 7f539bba583f29e4bfb45480b3bfb0c87291d836 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 9 Feb 2022 22:25:40 +0800 Subject: [PATCH 06/71] create provider for web_poet.page_inputs.Meta --- scrapy_poet/backend.py | 2 +- scrapy_poet/middleware.py | 3 ++- scrapy_poet/page_input_providers.py | 11 ++++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 964a61ef..560c95b4 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -29,7 +29,7 @@ async def scrapy_poet_backend(request: Union[GenericRequest, scrapy.Request]): ``scrapy.Request`` to give developers more fine grain control. """ - if not isinstance(request, scrapyRequest): + if not isinstance(request, scrapy.Request): try: request = scrapy.Request(**attr.asdict(request)) except TypeError: diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 9304568e..05c4f513 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -13,7 +13,7 @@ from scrapy.utils.misc import create_instance, load_object from .api import DummyResponse from .overrides import PerDomainOverridesRegistry -from .page_input_providers import HttpResponseProvider, HttpClientProvider +from .page_input_providers import HttpResponseProvider, HttpClientProvider, MetaProvider from .injection import Injector from .backend import enable_backend, scrapy_downloader_var @@ -24,6 +24,7 @@ DEFAULT_PROVIDERS = { HttpResponseProvider: 500 HttpClientProvider: 600, + MetaProvider: 700, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 78e046a1..aa9ae613 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -21,7 +21,7 @@ from scrapy_poet.injection_errors import MalformedProvidedClassesError from scrapy_poet.backend import scrapy_poet_backend -from web_poet import HttpResponse, HttpResponseHeaders +from web_poet import HttpResponse, HttpResponseHeaders, Meta from web_poet.requests import HttpClient @@ -209,3 +209,12 @@ class HttpClientProvider(PageObjectInputProvider): def __call__(self, to_provide: Set[Callable]): """Creates an ``HttpClient``` instance using Scrapy's downloader.""" return [HttpClient(request_downloader=scrapy_poet_backend)] + + +class MetaProvider(PageObjectInputProvider): + """This class provides ``web_poet.page_inputs.Meta`` instances.""" + provided_classes = {Meta} + + def __call__(self, to_provide: Set[Callable], request: Request): + """Creates an ``HttpClient``` instance using Scrapy's downloader.""" + return [Meta(**request.meta)] From ed2c489c09e74bd7d8b5cac7b84f97aa9d358207 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 17 Feb 2022 12:45:17 +0800 Subject: [PATCH 07/71] use 'po_args' inside a Request meta instead of using the entire meta --- scrapy_poet/backend.py | 2 +- scrapy_poet/page_input_providers.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 560c95b4..c4e909b1 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -13,7 +13,7 @@ def enable_backend(): - from web_poet import request_backend_var + from web_poet.requests import request_backend_var request_backend_var.set(scrapy_poet_backend) diff --git 
a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index aa9ae613..f0973d5d 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -207,7 +207,9 @@ class HttpClientProvider(PageObjectInputProvider): provided_classes = {HttpClient} def __call__(self, to_provide: Set[Callable]): - """Creates an ``HttpClient``` instance using Scrapy's downloader.""" + """Creates an ``web_poet.requests.HttpClient``` instance using Scrapy's + downloader. + """ return [HttpClient(request_downloader=scrapy_poet_backend)] @@ -216,5 +218,7 @@ class MetaProvider(PageObjectInputProvider): provided_classes = {Meta} def __call__(self, to_provide: Set[Callable], request: Request): - """Creates an ``HttpClient``` instance using Scrapy's downloader.""" - return [Meta(**request.meta)] + """Creates a ``web_poet.requests.Meta`` instance based on the data found + from the ``meta["po_args"]`` field of a ``scrapy.http.Response``instance. + """ + return [Meta(**request.meta.get("po_args", {}))] From 0bd3b8085d16700b9f313d5301c627176e03c198 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 17 Feb 2022 18:08:50 +0800 Subject: [PATCH 08/71] use web-poet's new Request container --- scrapy_poet/backend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index c4e909b1..3b717e9d 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -6,7 +6,7 @@ import scrapy import scrapy_poet from web_poet.page_inputs import ResponseData -from web_poet.requests import GenericRequest, RequestBackendError +from web_poet.requests import Request, RequestBackendError logger = logging.getLogger(__name__) @@ -22,10 +22,10 @@ def enable_backend(): scrapy_downloader_var: ContextVar = ContextVar("downloader") -async def scrapy_poet_backend(request: Union[GenericRequest, scrapy.Request]): +async def scrapy_poet_backend(request: Union[Request, scrapy.Request]): """To use this, frameworks must run: ``scrapy_poet.enable_backend()``. - The request could either be ``web_poet.GenericRequest`` or even a + The request could either be ``web_poet.Request`` or even a ``scrapy.Request`` to give developers more fine grain control. """ @@ -34,7 +34,7 @@ async def scrapy_poet_backend(request: Union[GenericRequest, scrapy.Request]): request = scrapy.Request(**attr.asdict(request)) except TypeError: raise RequestBackendError( - f"The given GenericRequest isn't compatible with `scrapy.Request`. " + f"The given Request isn't compatible with `scrapy.Request`. 
" f"Ensure that it doesn't contain extra fields: {request}" ) From 54885049555b09c8f5c2cc8d9213809e7abc6045 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 18 Feb 2022 15:19:05 +0800 Subject: [PATCH 09/71] sync dep to WIP branch to run tox tests --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 6ee99372..464aacfc 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ deps = pytest-cov scrapy >= 2.1.0 pytest-twisted - web-poet @ git+https://git@github.com/scrapinghub/web-poet@additional-requests#egg=web-poet + web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet commands = py.test \ From b5e9c5623ac8fc8d84f6cd4a7b910360ad4b5d6b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 21 Feb 2022 15:56:38 +0800 Subject: [PATCH 10/71] add tests --- scrapy_poet/backend.py | 3 +- tests/test_backend.py | 71 +++++++++++++++++++++++++++++++++++++++++ tests/test_providers.py | 56 ++++++++++++++++++++++++++------ tox.ini | 5 +++ 4 files changed, 125 insertions(+), 10 deletions(-) create mode 100644 tests/test_backend.py diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 3b717e9d..051def10 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -5,6 +5,7 @@ import attr import scrapy import scrapy_poet +from scrapy.utils.defer import deferred_to_future from web_poet.page_inputs import ResponseData from web_poet.requests import Request, RequestBackendError @@ -42,7 +43,7 @@ async def scrapy_poet_backend(request: Union[Request, scrapy.Request]): scrapy_downloader = scrapy_downloader_var.get() deferred = scrapy_downloader(request) - response = await scrapy.utils.defer.deferred_to_future(deferred) + response = await deferred_to_future(deferred) return ResponseData( url=response.url, diff --git a/tests/test_backend.py b/tests/test_backend.py new file mode 100644 index 00000000..7a64204f --- /dev/null +++ b/tests/test_backend.py @@ -0,0 +1,71 @@ +import attr +import pytest +from unittest import mock + +import web_poet +import scrapy +from scrapy_poet.backend import scrapy_downloader_var +from scrapy_poet.backend import scrapy_poet_backend, RequestBackendError + + +class AsyncMock(mock.MagicMock): + """workaround since python 3.7 doesn't ship with asyncmock.""" + + async def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_var_unset(): + """The ContextVar must be set first.""" + req = web_poet.Request("https://example.com") + + with pytest.raises(LookupError): + await scrapy_poet_backend(req) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_incompatible_request(): + """The Request must have fields that are a subset of `scrapy.Request`.""" + + @attr.define + class Request(web_poet.Request): + incompatible_field: str = "value" + + req = Request("https://example.com") + + with pytest.raises(RequestBackendError): + await scrapy_poet_backend(req) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend(): + req = web_poet.Request("https://example.com") + + with mock.patch( + "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_downloader_var.set(mock_downloader) + + response = await scrapy_poet_backend(req) + + mock_downloader.assert_called_once() + assert isinstance(response, web_poet.ResponseData) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_ignored_request(): + """It should 
handle IgnoreRequest from Scrapy gracefully.""" + req = web_poet.Request("https://example.com") + + with mock.patch( + "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + + mock_downloader = mock.MagicMock(return_value=AsyncMock) + mock_downloader.side_effect = scrapy.exceptions.IgnoreRequest + scrapy_downloader_var.set(mock_downloader) + + await scrapy_poet_backend(req) diff --git a/tests/test_providers.py b/tests/test_providers.py index f05030ea..25da959b 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -11,9 +11,15 @@ from scrapy.crawler import Crawler from scrapy.settings import Settings from scrapy.utils.test import get_crawler -from scrapy_poet.page_input_providers import CacheDataProviderMixin, PageObjectInputProvider +from scrapy_poet.page_input_providers import ( + CacheDataProviderMixin, + PageObjectInputProvider, + HttpClientProvider, + MetaProvider, +) +from scrapy_poet.backend import scrapy_poet_backend from tests.utils import crawl_single_item, HtmlResource -from web_poet import HttpResponse +from web_poet import HttpResponse, HttpClient class ProductHtml(HtmlResource): @@ -29,6 +35,7 @@ class ProductHtml(HtmlResource): """ + class NonProductHtml(HtmlResource): html = """ @@ -61,7 +68,9 @@ def __init__(self, crawler: Crawler): assert isinstance(crawler, Crawler) super().__init__(crawler) - def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider): + def __call__( + self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider + ): assert isinstance(spider, scrapy.Spider) ret: List[Any] = [] if Price in to_provide: @@ -152,8 +161,9 @@ class NameFirstMultiProviderSpider(PriceFirstMultiProviderSpider): def test_name_first_spider(settings, tmp_path): cache = tmp_path / "cache.sqlite3" settings.set("SCRAPY_POET_CACHE", str(cache)) - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, ProductHtml, - settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, ProductHtml, settings + ) assert cache.exists() assert item == { Price: Price("22€"), @@ -164,8 +174,9 @@ def test_name_first_spider(settings, tmp_path): # Let's see that the cache is working. We use a different and wrong resource, # but it should be ignored by the cached version used - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, NonProductHtml, - settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, NonProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -176,8 +187,9 @@ def test_name_first_spider(settings, tmp_path): @inlineCallbacks def test_price_first_spider(settings): - item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml, - settings) + item, _, _ = yield crawl_single_item( + PriceFirstMultiProviderSpider, ProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -194,3 +206,29 @@ def test_response_data_provider_fingerprint(settings): # The fingerprint should be readable since it's JSON-encoded. 
fp = rdp.fingerprint(scrapy.http.Response, request) assert json.loads(fp) + + +def test_http_client_provider(settings): + crawler = get_crawler(Spider, settings) + provider = HttpClientProvider(crawler) + + results = provider(set()) + + assert isinstance(results[0], HttpClient) + assert results[0].request_downloader == scrapy_poet_backend + + +def test_meta_provider(settings): + crawler = get_crawler(Spider, settings) + provider = MetaProvider(crawler) + request = scrapy.http.Request("https://example.com") + + results = provider(set(), request) + + assert results[0] == {} + + expected_data = {"key": "value"} + request.meta.update({"po_args": expected_data}) + results = provider(set(), request) + + assert results[0] == expected_data diff --git a/tox.ini b/tox.ini index 464aacfc..00c3eb35 100644 --- a/tox.ini +++ b/tox.ini @@ -1,13 +1,18 @@ [tox] envlist = py37,py38,py39,py310,mypy,docs +[pytest] +asyncio_mode = strict + [testenv] deps = pytest pytest-cov + pytest-asyncio scrapy >= 2.1.0 pytest-twisted web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet + scrapy @ git+https://github.com/scrapy/scrapy.git@30d5779#egg=scrapy commands = py.test \ From 4dd19b8d9516cbf466d3d820fc9dff4f1fc1226f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 11 Mar 2022 13:33:48 +0800 Subject: [PATCH 11/71] remove ContextVar approach and use Dependency Injection in Provider instead --- scrapy_poet/backend.py | 53 +++++++++++------------------ scrapy_poet/middleware.py | 6 ---- scrapy_poet/page_input_providers.py | 9 ++--- tests/test_backend.py | 44 ++++++++++++------------ tests/test_providers.py | 21 +++++++----- tests/utils.py | 8 +++++ 6 files changed, 67 insertions(+), 74 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 051def10..cb708577 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -1,10 +1,7 @@ import logging -from contextvars import ContextVar -from typing import Union import attr import scrapy -import scrapy_poet from scrapy.utils.defer import deferred_to_future from web_poet.page_inputs import ResponseData from web_poet.requests import Request, RequestBackendError @@ -13,24 +10,14 @@ logger = logging.getLogger(__name__) -def enable_backend(): - from web_poet.requests import request_backend_var - request_backend_var.set(scrapy_poet_backend) - - -# Binds the Scrapy Downloader here. The scrapy_poet.InjectionMiddleware will -# update this later when the spider starts. -scrapy_downloader_var: ContextVar = ContextVar("downloader") - - -async def scrapy_poet_backend(request: Union[Request, scrapy.Request]): - """To use this, frameworks must run: ``scrapy_poet.enable_backend()``. - - The request could either be ``web_poet.Request`` or even a - ``scrapy.Request`` to give developers more fine grain control. - """ +def create_scrapy_backend(backend): + async def scrapy_backend(request: Request): + if not isinstance(request, Request): + raise TypeError( + f"The request should be 'web_poet.Request' but received " + f"one of type: '{type(request)}'." 
+ ) - if not isinstance(request, scrapy.Request): try: request = scrapy.Request(**attr.asdict(request)) except TypeError: @@ -39,18 +26,16 @@ async def scrapy_poet_backend(request: Union[Request, scrapy.Request]): f"Ensure that it doesn't contain extra fields: {request}" ) - try: - scrapy_downloader = scrapy_downloader_var.get() - deferred = scrapy_downloader(request) - - response = await deferred_to_future(deferred) - - return ResponseData( - url=response.url, - html=response.text, - status=response.status, - headers=response.headers, - ) + try: + deferred = backend(request) + response = await deferred_to_future(deferred) + return ResponseData( + url=response.url, + html=response.text, + status=response.status, + headers=response.headers, + ) - except scrapy.exceptions.IgnoreRequest: - logger.warning(f"Additional Request Ignored: {request}") + except scrapy.exceptions.IgnoreRequest: + logger.warning(f"Additional Request Ignored: {request}") + return scrapy_backend diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 05c4f513..fc2bbddd 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -15,7 +15,6 @@ from .overrides import PerDomainOverridesRegistry from .page_input_providers import HttpResponseProvider, HttpClientProvider, MetaProvider from .injection import Injector -from .backend import enable_backend, scrapy_downloader_var logger = logging.getLogger(__name__) @@ -51,16 +50,11 @@ def __init__(self, crawler: Crawler) -> None: def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV: o = cls(crawler) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) - crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) return o def spider_closed(self, spider: Spider) -> None: self.injector.close() - def spider_opened(self, spider): - scrapy_downloader_var.set(spider.crawler.engine.download) - enable_backend() - def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]: """This method checks if the request is really needed and if its download could be skipped by trying to infer if a ``Response`` diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index f0973d5d..1ae885af 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -10,7 +10,7 @@ """ import abc import json -from typing import Set, Union, Callable, ClassVar, List, Any, Sequence +from typing import Set, Union, Callable, ClassVar, Any, Sequence import attr from scrapy import Request @@ -20,7 +20,7 @@ from scrapy.utils.request import request_fingerprint from scrapy_poet.injection_errors import MalformedProvidedClassesError -from scrapy_poet.backend import scrapy_poet_backend +from scrapy_poet.backend import create_scrapy_backend from web_poet import HttpResponse, HttpResponseHeaders, Meta from web_poet.requests import HttpClient @@ -206,11 +206,12 @@ class HttpClientProvider(PageObjectInputProvider): """This class provides ``web_poet.requests.HttpClient`` instances.""" provided_classes = {HttpClient} - def __call__(self, to_provide: Set[Callable]): + def __call__(self, to_provide: Set[Callable], crawler: Crawler): """Creates an ``web_poet.requests.HttpClient``` instance using Scrapy's downloader. 
""" - return [HttpClient(request_downloader=scrapy_poet_backend)] + backend = create_scrapy_backend(crawler.engine.download) + return [HttpClient(request_downloader=backend)] class MetaProvider(PageObjectInputProvider): diff --git a/tests/test_backend.py b/tests/test_backend.py index 7a64204f..9a8485e0 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,28 +4,18 @@ import web_poet import scrapy -from scrapy_poet.backend import scrapy_downloader_var -from scrapy_poet.backend import scrapy_poet_backend, RequestBackendError +from scrapy_poet.backend import RequestBackendError, create_scrapy_backend +from tests.utils import AsyncMock -class AsyncMock(mock.MagicMock): - """workaround since python 3.7 doesn't ship with asyncmock.""" - - async def __call__(self, *args, **kwargs): - return super().__call__(*args, **kwargs) - - -@pytest.mark.asyncio -async def test_scrapy_poet_backend_var_unset(): - """The ContextVar must be set first.""" - req = web_poet.Request("https://example.com") - - with pytest.raises(LookupError): - await scrapy_poet_backend(req) +@pytest.fixture +def scrapy_backend(): + mock_backend = AsyncMock() + return create_scrapy_backend(mock_backend) @pytest.mark.asyncio -async def test_scrapy_poet_backend_incompatible_request(): +async def test_incompatible_request(scrapy_backend): """The Request must have fields that are a subset of `scrapy.Request`.""" @attr.define @@ -35,7 +25,17 @@ class Request(web_poet.Request): req = Request("https://example.com") with pytest.raises(RequestBackendError): - await scrapy_poet_backend(req) + await scrapy_backend(req) + + +@pytest.mark.asyncio +async def test_incompatible_scrapy_request(scrapy_backend): + """The Request must be web_poet.Request and not anything else.""" + + req = scrapy.Request("https://example.com") + + with pytest.raises(TypeError): + await scrapy_backend(req) @pytest.mark.asyncio @@ -47,9 +47,9 @@ async def test_scrapy_poet_backend(): ) as mock_dtf: mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_downloader_var.set(mock_downloader) + scrapy_backend = create_scrapy_backend(mock_downloader) - response = await scrapy_poet_backend(req) + response = await scrapy_backend(req) mock_downloader.assert_called_once() assert isinstance(response, web_poet.ResponseData) @@ -66,6 +66,6 @@ async def test_scrapy_poet_backend_ignored_request(): mock_downloader = mock.MagicMock(return_value=AsyncMock) mock_downloader.side_effect = scrapy.exceptions.IgnoreRequest - scrapy_downloader_var.set(mock_downloader) + scrapy_backend = create_scrapy_backend(mock_downloader) - await scrapy_poet_backend(req) + await scrapy_backend(req) diff --git a/tests/test_providers.py b/tests/test_providers.py index 25da959b..c4be0546 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,7 +1,9 @@ from typing import Any, List, Set, Callable, Sequence +from unittest import mock import attr import json +import pytest from pytest_twisted import inlineCallbacks from scrapy_poet import HttpResponseProvider from twisted.python.failure import Failure @@ -17,8 +19,7 @@ HttpClientProvider, MetaProvider, ) -from scrapy_poet.backend import scrapy_poet_backend -from tests.utils import crawl_single_item, HtmlResource +from tests.utils import crawl_single_item, HtmlResource, AsyncMock from web_poet import HttpResponse, HttpClient @@ -208,15 +209,19 @@ def test_response_data_provider_fingerprint(settings): assert json.loads(fp) -def test_http_client_provider(settings): +@pytest.mark.asyncio +async def 
test_http_client_provider(settings): crawler = get_crawler(Spider, settings) - provider = HttpClientProvider(crawler) + crawler.engine = AsyncMock() - results = provider(set()) - - assert isinstance(results[0], HttpClient) - assert results[0].request_downloader == scrapy_poet_backend + with mock.patch( + "scrapy_poet.page_input_providers.create_scrapy_backend" + ) as mock_factory: + provider = HttpClientProvider(crawler) + results = provider(set(), crawler) + assert isinstance(results[0], HttpClient) + results[0].request_downloader == mock_factory.return_value def test_meta_provider(settings): crawler = get_crawler(Spider, settings) diff --git a/tests/utils.py b/tests/utils.py index 7dd46b7d..2c6a7294 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ from typing import Dict +from unittest import mock from pytest_twisted import inlineCallbacks from scrapy.exceptions import CloseSpider @@ -84,3 +85,10 @@ def parse(*args, **kwargs): # Mimic type annotations parse.__annotations__ = callback.__annotations__ return parse + + +class AsyncMock(mock.MagicMock): + """workaround since python 3.7 doesn't ship with asyncmock.""" + + async def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs) From 2a155f525de9cd490e7f94d1a18d3ce79a0f81c7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 11 Mar 2022 15:23:06 +0800 Subject: [PATCH 12/71] update CHANGELOG to new support on additional requests --- CHANGELOG.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 53e88092..f6c97ef9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,12 @@ TBR --- * Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``. +* Support for the new features in ``web_poet>=0.2.0`` for supporting additional + requests inside Page Objects: + + * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. + * Using the said additional requests needs ``async/await`` support in + ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. 0.3.0 (2022-01-28) From e8f4c101a57d7cd33077f0b41c69a36683e7a980 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 15 Mar 2022 15:08:01 +0800 Subject: [PATCH 13/71] add docs for supporting web-poet's HttpClient and Meta --- README.rst | 24 +++ docs/index.rst | 3 +- docs/intro/advanced-tutorial.rst | 167 ++++++++++++++++++ .../{tutorial.rst => basic-tutorial.rst} | 8 +- docs/intro/install.rst | 2 +- docs/requirements.txt | 2 +- scrapy_poet/debug.log | 1 + scrapy_poet/page_input_providers.py | 4 +- setup.py | 2 +- tox.ini | 3 +- 10 files changed, 204 insertions(+), 12 deletions(-) create mode 100644 docs/intro/advanced-tutorial.rst rename docs/intro/{tutorial.rst => basic-tutorial.rst} (99%) create mode 100644 scrapy_poet/debug.log diff --git a/README.rst b/README.rst index 739f51ab..13b7e1ab 100644 --- a/README.rst +++ b/README.rst @@ -36,3 +36,27 @@ License is BSD 3-clause. * Issue tracker: https://github.com/scrapinghub/scrapy-poet/issues .. _`web-poet`: https://github.com/scrapinghub/web-poet + + +Quick Start +*********** + +Installation +============ + +.. code-block:: + + pip install scrapy-poet + +Requires **Python 3.7+** and **Scrapy >= 2.6.0**. + +Usage in a Scrapy Project +========================= + +Add the following inside Scrapy's ``settings.py`` file: + +.. 
code-block:: python + + DOWNLOADER_MIDDLEWARES = { + "scrapy_poet.InjectionMiddleware": 543, + } diff --git a/docs/index.rst b/docs/index.rst index 30c17ff4..1271bbe5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,7 +35,8 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :maxdepth: 1 intro/install - intro/tutorial + intro/basic-tutorial + intro/advanced-tutorial .. toctree:: :caption: Advanced diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst new file mode 100644 index 00000000..a87d4bbc --- /dev/null +++ b/docs/intro/advanced-tutorial.rst @@ -0,0 +1,167 @@ +.. _`intro-advanced-tutorial`: + +================= +Advanced Tutorial +================= + +This section intends to go over the supported features in **web-poet** by +**scrapy-poet**: + + * ``web_poet.HttpClient`` + * ``web_poet.Meta`` + +These are mainly achieved by **scrapy-poet** implementing **providers** for them: + + * :class:`scrapy_poet.page_input_providers.HttpClientProvider` + * :class:`scrapy_poet.page_input_providers.MetaProvider` + + +Additional Requests +=================== + +Using Page Objects using additional requests doesn't need anything special from +the spider. It would work as-is because of the readily available +:class:`scrapy_poet.page_input_providers.HttpClientProvider` that is enabled +out of the box. + +This supplies the Page Object with the necessary ``web_poet.HttpClient`` instance. +Take note the HTTP Downloader implementation that **scrapy-poet** provides to +``web_poet.HttpClient`` would be the **Scrapy Downloader**. + +.. tip:: + + This means that the additional requests inside a Page Object will have access + to the **Downloader Middlewares** that the Spider is using. + + +Suppose we have the following Page Object: + +.. code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + response: web_poet.ResponseData = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + page = web_poet.WebPage(response) + item["images"] = page.css(".product-images img::attr(src)").getall() + return item + + +It can be directly used inside the spider as: + +.. code-block:: python + + import scrapy + + + def ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + async def parse(self, response, page: ProductPage): + return await page.to_item() + +Note that we needed to update the ``parse()`` method to be an ``async`` method, +since the ``to_item()`` method of the Page Object we're using is an ``async`` +method as well. + +This is also the primary reason why **scrapy-poet** requires ``scrapy>=2.6.0`` +since it's the minimum version that has full :mod:`asyncio` support. + + +Meta +==== + +Using ``web_poet.Meta`` allows the Scrapy spider to pass any arbitrary information +into the Page Object. + +Suppose we update the earlier Page Object to control the additional request. +This basically acts as a switch to update the behavior of the Page Object: + +.. 
code-block:: python + + import attr + import web_poet + + + @attr.define + class ProductPage(web_poet.ItemWebPage): + http_client: web_poet.HttpClient + meta: web_poet.Meta + + async def to_item(self): + item = { + "url": self.url, + "name": self.css("#main h3.name ::text").get(), + "product_id": self.css("#product ::attr(product-id)").get(), + } + + # Simulates clicking on a button that says "View All Images" + if self.meta.get("enable_extracting_all_images") + response: web_poet.ResponseData = await self.http_client.get( + f"https://api.example.com/v2/images?id={item['product_id']}" + ) + page = web_poet.WebPage(response) + item["images"] = page.css(".product-images img::attr(src)").getall() + + return item + +Passing the ``enable_extracting_all_images`` meta value from the spider into +the Page Object can be achieved by using **Scrapy's** ``Request.meta`` attribute. +Specifically, any ``dict`` value inside the ``po_args`` parameter inside +**Scrapy's** ``Request.meta`` will be passed into ``web_poet.Meta``. + +Let's see it in action: + +.. code-block:: python + + import scrapy + + + def ProductSpider(scrapy.Spider): + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, + } + } + + start_urls = [ + "https://example.com/category/product/item?id=123", + "https://example.com/category/product/item?id=989", + ] + + def start_requests(self): + for url in start_urls: + yield scrapy.Request( + url=url, + callback=self.parse, + meta={"po_args": {"enable_extracting_all_images": True}} + ) + + async def parse(self, response, page: ProductPage): + return await page.to_item() diff --git a/docs/intro/tutorial.rst b/docs/intro/basic-tutorial.rst similarity index 99% rename from docs/intro/tutorial.rst rename to docs/intro/basic-tutorial.rst index b11f76b5..9ee1fb08 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -1,8 +1,8 @@ -.. _`intro-tutorial`: +.. _`intro-basic-tutorial`: -======== -Tutorial -======== +============== +Basic Tutorial +============== In this tutorial, we’ll assume that ``scrapy-poet`` is already installed on your system. If that’s not the case, see :ref:`intro-install`. diff --git a/docs/intro/install.rst b/docs/intro/install.rst index 5cec1f17..f3d6187e 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -16,7 +16,7 @@ If you’re already familiar with installation of Python packages, you can insta pip install scrapy-poet -Scrapy 2.1.0 or above is required and it has to be installed separately. +Scrapy 2.6.0 or above is required and it has to be installed separately. 
Things that are good to know ============================ diff --git a/docs/requirements.txt b/docs/requirements.txt index e6337937..99443b22 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ -Scrapy >= 2.1.0 +Scrapy >= 2.6.0 Sphinx >= 3.0.3 sphinx-rtd-theme >= 0.4 diff --git a/scrapy_poet/debug.log b/scrapy_poet/debug.log new file mode 100644 index 00000000..e5cd4dd5 --- /dev/null +++ b/scrapy_poet/debug.log @@ -0,0 +1 @@ +/home/k/.pyenv/versions/3.7.9/bin/python3: can't open file 'multiple_spider_in_one_process.py': [Errno 2] No such file or directory diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 1ae885af..99cd3472 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -207,7 +207,7 @@ class HttpClientProvider(PageObjectInputProvider): provided_classes = {HttpClient} def __call__(self, to_provide: Set[Callable], crawler: Crawler): - """Creates an ``web_poet.requests.HttpClient``` instance using Scrapy's + """Creates an ``web_poet.requests.HttpClient`` instance using Scrapy's downloader. """ backend = create_scrapy_backend(crawler.engine.download) @@ -220,6 +220,6 @@ class MetaProvider(PageObjectInputProvider): def __call__(self, to_provide: Set[Callable], request: Request): """Creates a ``web_poet.requests.Meta`` instance based on the data found - from the ``meta["po_args"]`` field of a ``scrapy.http.Response``instance. + from the ``meta["po_args"]`` field of a ``scrapy.http.Response`` instance. """ return [Meta(**request.meta.get("po_args", {}))] diff --git a/setup.py b/setup.py index 52401bf2..fae62aaa 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'andi >= 0.4.1', 'attrs', 'parsel', - 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@from_bytes#egg=web-poet', + 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@additional-requests#egg=web-poet', 'tldextract', 'sqlitedict', ], diff --git a/tox.ini b/tox.ini index 00c3eb35..fa4614d8 100644 --- a/tox.ini +++ b/tox.ini @@ -9,10 +9,9 @@ deps = pytest pytest-cov pytest-asyncio - scrapy >= 2.1.0 + scrapy >= 2.6.0 pytest-twisted web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet - scrapy @ git+https://github.com/scrapy/scrapy.git@30d5779#egg=scrapy commands = py.test \ From 8340ced341adfc7aff613ded7caaa6e28e957393 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 29 Mar 2022 11:19:11 +0800 Subject: [PATCH 14/71] Update to use HttpReponse which replaces ResponseData Reference PR: https://github.com/scrapinghub/web-poet/pull/30 --- docs/intro/advanced-tutorial.rst | 10 ++++------ scrapy_poet/backend.py | 13 +++++-------- scrapy_poet/middleware.py | 2 +- scrapy_poet/page_input_providers.py | 10 ++-------- scrapy_poet/utils.py | 14 ++++++++++++++ setup.py | 2 +- tests/test_backend.py | 22 ++++++++++++++++++++-- tox.ini | 2 +- 8 files changed, 48 insertions(+), 27 deletions(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index a87d4bbc..7fea1ac5 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -54,11 +54,10 @@ Suppose we have the following Page Object: } # Simulates clicking on a button that says "View All Images" - response: web_poet.ResponseData = await self.http_client.get( + response: web_poet.HttpResponse = await self.http_client.get( f"https://api.example.com/v2/images?id={item['product_id']}" ) - page = web_poet.WebPage(response) - item["images"] = page.css(".product-images 
img::attr(src)").getall() + item["images"] = response.css(".product-images img::attr(src)").getall() return item @@ -122,11 +121,10 @@ This basically acts as a switch to update the behavior of the Page Object: # Simulates clicking on a button that says "View All Images" if self.meta.get("enable_extracting_all_images") - response: web_poet.ResponseData = await self.http_client.get( + response: web_poet.HttpResponse = await self.http_client.get( f"https://api.example.com/v2/images?id={item['product_id']}" ) - page = web_poet.WebPage(response) - item["images"] = page.css(".product-images img::attr(src)").getall() + item["images"] = response.css(".product-images img::attr(src)").getall() return item diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index cb708577..6103bfbf 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -3,9 +3,11 @@ import attr import scrapy from scrapy.utils.defer import deferred_to_future -from web_poet.page_inputs import ResponseData +from web_poet.page_inputs import HttpResponse, HttpResponseHeaders from web_poet.requests import Request, RequestBackendError +from scrapy_poet.utils import scrapy_response_to_http_response + logger = logging.getLogger(__name__) @@ -28,13 +30,8 @@ async def scrapy_backend(request: Request): try: deferred = backend(request) - response = await deferred_to_future(deferred) - return ResponseData( - url=response.url, - html=response.text, - status=response.status, - headers=response.headers, - ) + response: scrapy.http.Response = await deferred_to_future(deferred) + return scrapy_response_to_http_response(response) except scrapy.exceptions.IgnoreRequest: logger.warning(f"Additional Request Ignored: {request}") diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index fc2bbddd..20b7169b 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -21,7 +21,7 @@ DEFAULT_PROVIDERS = { - HttpResponseProvider: 500 + HttpResponseProvider: 500, HttpClientProvider: 600, MetaProvider: 700, } diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 99cd3472..073d60c7 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -19,6 +19,7 @@ from scrapy.utils.reqser import request_to_dict from scrapy.utils.request import request_fingerprint +from scrapy_poet.utils import scrapy_response_to_http_response from scrapy_poet.injection_errors import MalformedProvidedClassesError from scrapy_poet.backend import create_scrapy_backend from web_poet import HttpResponse, HttpResponseHeaders, Meta @@ -164,14 +165,7 @@ class HttpResponseProvider(PageObjectInputProvider, CacheDataProviderMixin): def __call__(self, to_provide: Set[Callable], response: Response): """Builds a ``HttpResponse`` instance using a Scrapy ``Response``""" - return [ - HttpResponse( - url=response.url, - body=response.body, - status=response.status, - headers=HttpResponseHeaders.from_bytes(response.headers), - ) - ] + return [scrapy_response_to_http_response(response)] def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: request_keys = {"url", "method", "body"} diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 8cdcb6f0..7ea99742 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,5 +1,7 @@ import os +from web_poet import HttpResponse, HttpResponseHeaders +from scrapy.http import Response from scrapy.utils.project import project_data_dir, inside_project from tldextract import tldextract @@ -28,3 +30,15 @@ def 
get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") - if createdir: os.makedirs(path, exist_ok=True) return path + + +def scrapy_response_to_http_response(response: Response): + """Convenience method to convert a ``scrapy.http.Response`` into a + ``web_poet.HttpResponse``. + """ + return HttpResponse( + url=response.url, + body=response.body, + status=response.status, + headers=HttpResponseHeaders.from_bytes(response.headers), + ) diff --git a/setup.py b/setup.py index fae62aaa..e1d54e5b 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'andi >= 0.4.1', 'attrs', 'parsel', - 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@additional-requests#egg=web-poet', + 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@tmp-dep-AR-1#egg=web-poet', 'tldextract', 'sqlitedict', ], diff --git a/tests/test_backend.py b/tests/test_backend.py index 9a8485e0..ee027560 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -38,21 +38,39 @@ async def test_incompatible_scrapy_request(scrapy_backend): await scrapy_backend(req) +@pytest.fixture +def fake_http_response(): + return web_poet.HttpResponse( + "https://example.com", + b"some content", + 200, + {"Content-Type": "text/html; charset=utf-8"}, + ) + + @pytest.mark.asyncio -async def test_scrapy_poet_backend(): +async def test_scrapy_poet_backend(fake_http_response): req = web_poet.Request("https://example.com") with mock.patch( "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock ) as mock_dtf: + mock_dtf.return_value = fake_http_response + mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_backend = create_scrapy_backend(mock_downloader) response = await scrapy_backend(req) mock_downloader.assert_called_once() - assert isinstance(response, web_poet.ResponseData) + assert isinstance(response, web_poet.HttpResponse) + + assert response.url == "https://example.com" + assert response.text == "some content" + assert response.status == 200 + assert response.headers.get("Content-Type") == "text/html; charset=utf-8" + assert len(response.headers) == 1 @pytest.mark.asyncio diff --git a/tox.ini b/tox.ini index fa4614d8..0c1b6708 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ deps = pytest-asyncio scrapy >= 2.6.0 pytest-twisted - web-poet @ git+https://git@github.com/scrapinghub/web-poet@meta#egg=web-poet + web-poet @ git+https://git@github.com/scrapinghub/web-poet@tmp-dep-AR-1#egg=web-poet commands = py.test \ From ae4d8a5b20e0ea753289c53a249276ff18f5a54a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 29 Mar 2022 11:33:43 +0800 Subject: [PATCH 15/71] remove unused imports --- scrapy_poet/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 6103bfbf..5dba5808 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -3,7 +3,6 @@ import attr import scrapy from scrapy.utils.defer import deferred_to_future -from web_poet.page_inputs import HttpResponse, HttpResponseHeaders from web_poet.requests import Request, RequestBackendError from scrapy_poet.utils import scrapy_response_to_http_response From ba0d8fe46a663606b2e9e8afcfc431e50d336801 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 8 Feb 2022 14:52:00 +0800 Subject: [PATCH 16/71] add basic integration for web-poet's support on additional requests --- scrapy_poet/middleware.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 20b7169b..3ea591fd 100644 --- 
a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -15,6 +15,7 @@ from .overrides import PerDomainOverridesRegistry from .page_input_providers import HttpResponseProvider, HttpClientProvider, MetaProvider from .injection import Injector +from .backend import enable_backend, scrapy_downloader_var logger = logging.getLogger(__name__) @@ -50,11 +51,16 @@ def __init__(self, crawler: Crawler) -> None: def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV: o = cls(crawler) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) return o def spider_closed(self, spider: Spider) -> None: self.injector.close() + def spider_opened(self, spider): + scrapy_downloader_var.set(spider.crawler.engine.download) + enable_backend() + def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]: """This method checks if the request is really needed and if its download could be skipped by trying to infer if a ``Response`` From 81df66420ddeff9d106e89a13007b07a20fb1e0f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 8 Feb 2022 17:22:12 +0800 Subject: [PATCH 17/71] create provider for web-poet's new HttpClient and GenericRequest --- scrapy_poet/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 5dba5808..c72c0ec0 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -7,7 +7,6 @@ from scrapy_poet.utils import scrapy_response_to_http_response - logger = logging.getLogger(__name__) From 131609023eb292304a8fd7c64750e4e4ee368cfd Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 21 Feb 2022 15:56:38 +0800 Subject: [PATCH 18/71] add tests --- tests/test_providers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index c4be0546..d38f2a28 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -223,6 +223,7 @@ async def test_http_client_provider(settings): results[0].request_downloader == mock_factory.return_value + def test_meta_provider(settings): crawler = get_crawler(Spider, settings) provider = MetaProvider(crawler) From f8a7efed98f160dbbf12da576424d01b6e6fe1fa Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 11 Mar 2022 13:33:48 +0800 Subject: [PATCH 19/71] remove ContextVar approach and use Dependency Injection in Provider instead --- scrapy_poet/middleware.py | 6 ------ tests/test_backend.py | 10 ++++++++++ tests/test_providers.py | 1 - 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 3ea591fd..20b7169b 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -15,7 +15,6 @@ from .overrides import PerDomainOverridesRegistry from .page_input_providers import HttpResponseProvider, HttpClientProvider, MetaProvider from .injection import Injector -from .backend import enable_backend, scrapy_downloader_var logger = logging.getLogger(__name__) @@ -51,16 +50,11 @@ def __init__(self, crawler: Crawler) -> None: def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV: o = cls(crawler) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) - crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) return o def spider_closed(self, spider: Spider) -> None: self.injector.close() - def spider_opened(self, spider): - 
scrapy_downloader_var.set(spider.crawler.engine.download) - enable_backend() - def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]: """This method checks if the request is really needed and if its download could be skipped by trying to infer if a ``Response`` diff --git a/tests/test_backend.py b/tests/test_backend.py index ee027560..1ea32919 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -38,6 +38,16 @@ async def test_incompatible_scrapy_request(scrapy_backend): await scrapy_backend(req) +@pytest.mark.asyncio +async def test_incompatible_scrapy_request(scrapy_backend): + """The Request must be web_poet.Request and not anything else.""" + + req = scrapy.Request("https://example.com") + + with pytest.raises(TypeError): + await scrapy_backend(req) + + @pytest.fixture def fake_http_response(): return web_poet.HttpResponse( diff --git a/tests/test_providers.py b/tests/test_providers.py index d38f2a28..c4be0546 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -223,7 +223,6 @@ async def test_http_client_provider(settings): results[0].request_downloader == mock_factory.return_value - def test_meta_provider(settings): crawler = get_crawler(Spider, settings) provider = MetaProvider(crawler) From cc97213920900247f7f3ba5e6545b3c1a0578f33 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 11 Mar 2022 15:23:06 +0800 Subject: [PATCH 20/71] update CHANGELOG to new support on additional requests --- CHANGELOG.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f6c97ef9..646bef0b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,13 @@ Changelog TBR --- +* Support for the new features in ``web_poet>=0.2.0`` for supporting additional + requests inside Page Objects: + + * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. + * Using the said additional requests needs ``async/await`` support in + ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. + * Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``. * Support for the new features in ``web_poet>=0.2.0`` for supporting additional requests inside Page Objects: From a25b61e4306cbd3fee5f0aa72c788578d7fa10b2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 16 Mar 2022 13:55:32 +0800 Subject: [PATCH 21/71] update callback_for() to have async support --- CHANGELOG.rst | 1 + scrapy_poet/api.py | 22 +++++++++++++--- tests/test_callback_for.py | 51 ++++++++++++++++++++++++++++++++++---- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 646bef0b..c1a34c6c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -19,6 +19,7 @@ TBR * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. * Using the said additional requests needs ``async/await`` support in ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. +* add ``async`` support for ``callback_for``. 0.3.0 (2022-01-28) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 53454809..faff7c7e 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -30,7 +30,7 @@ def __init__(self, url: str, request=Optional[Request]): super().__init__(url=url, request=request) -def callback_for(page_cls: Type[ItemPage]) -> Callable: +def callback_for(page_cls: Type[ItemPage], is_async: bool = False) -> Callable: """Create a callback for an :class:`web_poet.pages.ItemPage` subclass. 
The generated callback returns the output of the @@ -67,6 +67,15 @@ def parse(self, response): parse_book = callback_for(BookPage) + The optional ``is_async`` param can also be set to ``True`` to support async + callbacks like the following, especially when Page Objects uses additional + requests needing the ``async/await`` syntax. + + .. code-block:: python + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + The generated callback could be used as a spider instance method or passed as an inline/anonymous argument. Make sure to define it as a spider attribute (as shown in the example above) if you're planning to use @@ -90,5 +99,12 @@ def parse(self, response): def parse(*args, page: page_cls, **kwargs): # type: ignore yield page.to_item() # type: ignore - setattr(parse, _CALLBACK_FOR_MARKER, True) - return parse + async def async_parse(*args, page: page_cls, **kwargs): # type: ignore + yield await page.to_item() # type: ignore + + if is_async: + setattr(async_parse, _CALLBACK_FOR_MARKER, True) + return async_parse + else: + setattr(parse, _CALLBACK_FOR_MARKER, True) + return parse diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 7a830712..fd19f020 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -14,6 +14,11 @@ class FakeItemPage(ItemPage): def to_item(self): return 'fake item page' +class FakeItemPageAsync(ItemPage): + + async def to_item(self): + return 'fake item page' + class FakeItemWebPage(ItemWebPage): @@ -28,6 +33,12 @@ class MySpider(scrapy.Spider): parse_web = callback_for(FakeItemWebPage) +class MySpiderAsync(scrapy.Spider): + + name = 'my_spider_async' + parse_item = callback_for(FakeItemPage, is_async=True) + + def test_callback_for(): """Simple test case to ensure it works as expected.""" cb = callback_for(FakeItemPage) @@ -39,6 +50,20 @@ def test_callback_for(): assert list(result) == ['fake item page'] +@pytest.mark.asyncio +async def test_callback_for_async(): + cb = callback_for(FakeItemPage, is_async=True) + assert callable(cb) + + fake_page = FakeItemPageAsync() + response = DummyResponse('http://example.com/') + result = cb(response=response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() + + def test_callback_for_instance_method(): spider = MySpider() response = DummyResponse('http://example.com/') @@ -47,12 +72,16 @@ def test_callback_for_instance_method(): assert list(result) == ['fake item page'] -def test_callback_for_inline(): - callback = callback_for(FakeItemPage) +@pytest.mark.asyncio +async def test_callback_for_instance_method_async(): + spider = MySpiderAsync() response = DummyResponse('http://example.com/') - fake_page = FakeItemPage() - result = callback(response, page=fake_page) - assert list(result) == ['fake item page'] + fake_page = FakeItemPageAsync() + result = spider.parse_item(response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() def test_default_callback(): @@ -93,6 +122,18 @@ def test_inline_callback(): assert str(exc.value) == msg +def test_inline_callback_async(): + """Sample request with inline callback using async callback_for.""" + spider = MySpiderAsync() + cb = callback_for(FakeItemPage, is_async=True) + request = scrapy.Request('http://example.com/', callback=cb) + with pytest.raises(ValueError) as exc: + request_to_dict(request, 
spider) + + msg = f'Function {cb} is not an instance method in: {spider}' + assert str(exc.value) == msg + + def test_invalid_subclass(): """Classes should inherit from ItemPage.""" From eb3e837240d68e2d3513cb857f5f3ff32ec8a99a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 16 Mar 2022 16:32:11 +0800 Subject: [PATCH 22/71] add docs mentioning async support in callback_for() --- docs/intro/advanced-tutorial.rst | 1 + docs/intro/basic-tutorial.rst | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index 7fea1ac5..d592d19c 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -15,6 +15,7 @@ These are mainly achieved by **scrapy-poet** implementing **providers** for them * :class:`scrapy_poet.page_input_providers.HttpClientProvider` * :class:`scrapy_poet.page_input_providers.MetaProvider` +.. _`intro-additional-requests`: Additional Requests =================== diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 9ee1fb08..282f4458 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -198,6 +198,21 @@ returning the result of the ``to_item`` method call. We could use ``response.follow_all(links, callback_for(BookPage))``, without creating an attribute, but currently it won't work with Scrapy disk queues. +.. tip:: + + :func:`~.callback_for` also supports `async generators` via the ``is_async=True`` + parameter. In this way, having ``parse_book = callback_for(BookPage, is_async=True)`` + would result in the following: + + .. code-block:: python + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + This is useful when the Page Objects uses additional requests which relies + heavily on ``async/await`` format. More info on this in this tutorial section: + :ref:`intro-additional-requests`. + Final result ============ From 7b2d4cf9b2948e7b904d2d7ee9aa07593bbab33b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 16 Mar 2022 16:34:43 +0800 Subject: [PATCH 23/71] force callback_for() to have 'is_async' to be keyword-only param --- scrapy_poet/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index faff7c7e..7945c15c 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -30,7 +30,7 @@ def __init__(self, url: str, request=Optional[Request]): super().__init__(url=url, request=request) -def callback_for(page_cls: Type[ItemPage], is_async: bool = False) -> Callable: +def callback_for(page_cls: Type[ItemPage], *, is_async: bool = False) -> Callable: """Create a callback for an :class:`web_poet.pages.ItemPage` subclass. 
The generated callback returns the output of the From 5c4326f2ae16556a11dbd16f4e4f24802d306212 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 16 Mar 2022 16:40:26 +0800 Subject: [PATCH 24/71] update async test spider to use async PO as well --- tests/test_callback_for.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index fd19f020..650df200 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -36,7 +36,7 @@ class MySpider(scrapy.Spider): class MySpiderAsync(scrapy.Spider): name = 'my_spider_async' - parse_item = callback_for(FakeItemPage, is_async=True) + parse_item = callback_for(FakeItemPageAsync, is_async=True) def test_callback_for(): From b79dfa8eb4ffff0a644d6950e06127ea580d696d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 18 Mar 2022 16:05:00 +0800 Subject: [PATCH 25/71] remove 'is_async' param in callback_for --- docs/intro/basic-tutorial.rst | 15 ++++++++++++--- scrapy_poet/api.py | 21 ++++++++++++++++----- tests/test_callback_for.py | 6 +++--- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 282f4458..0c4691c9 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -200,9 +200,18 @@ returning the result of the ``to_item`` method call. We could use .. tip:: - :func:`~.callback_for` also supports `async generators` via the ``is_async=True`` - parameter. In this way, having ``parse_book = callback_for(BookPage, is_async=True)`` - would result in the following: + :func:`~.callback_for` also supports `async generators` by checking if the + ``to_item()`` method is a coroutine. So having the following: + + .. code-block:: python + + class BookPage(web_poet.ItemWebPage): + async def to_item(self): + return await do_something_async() + + callback_for(BookPage) + + would result in: .. code-block:: python diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 7945c15c..8520ea0b 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,4 +1,5 @@ from typing import Callable, Optional, Type +from inspect import iscoroutinefunction from scrapy.http import Request, Response @@ -30,7 +31,7 @@ def __init__(self, url: str, request=Optional[Request]): super().__init__(url=url, request=request) -def callback_for(page_cls: Type[ItemPage], *, is_async: bool = False) -> Callable: +def callback_for(page_cls: Type[ItemPage]) -> Callable: """Create a callback for an :class:`web_poet.pages.ItemPage` subclass. The generated callback returns the output of the @@ -67,9 +68,19 @@ def parse(self, response): parse_book = callback_for(BookPage) - The optional ``is_async`` param can also be set to ``True`` to support async - callbacks like the following, especially when Page Objects uses additional - requests needing the ``async/await`` syntax. + This also produces an async generator callable if the Page Objects's + ``to_item()`` method is a coroutine which uses the ``async/await`` syntax. + So having the following: + + .. code-block:: python + + class BookPage(web_poet.ItemWebPage): + async def to_item(self): + return await do_something_async() + + callback_for(BookPage) + + would result in: .. 
code-block:: python @@ -102,7 +113,7 @@ def parse(*args, page: page_cls, **kwargs): # type: ignore async def async_parse(*args, page: page_cls, **kwargs): # type: ignore yield await page.to_item() # type: ignore - if is_async: + if iscoroutinefunction(page_cls.to_item): setattr(async_parse, _CALLBACK_FOR_MARKER, True) return async_parse else: diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 650df200..3002de71 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -36,7 +36,7 @@ class MySpider(scrapy.Spider): class MySpiderAsync(scrapy.Spider): name = 'my_spider_async' - parse_item = callback_for(FakeItemPageAsync, is_async=True) + parse_item = callback_for(FakeItemPageAsync) def test_callback_for(): @@ -52,7 +52,7 @@ def test_callback_for(): @pytest.mark.asyncio async def test_callback_for_async(): - cb = callback_for(FakeItemPage, is_async=True) + cb = callback_for(FakeItemPageAsync) assert callable(cb) fake_page = FakeItemPageAsync() @@ -125,7 +125,7 @@ def test_inline_callback(): def test_inline_callback_async(): """Sample request with inline callback using async callback_for.""" spider = MySpiderAsync() - cb = callback_for(FakeItemPage, is_async=True) + cb = callback_for(FakeItemPageAsync) request = scrapy.Request('http://example.com/', callback=cb) with pytest.raises(ValueError) as exc: request_to_dict(request, spider) From a74d264c4badba8fd6dd5471a2be5854e37c8532 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 29 Mar 2022 12:49:32 +0800 Subject: [PATCH 26/71] remove duplicated test --- tests/test_backend.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 1ea32919..ee027560 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -38,16 +38,6 @@ async def test_incompatible_scrapy_request(scrapy_backend): await scrapy_backend(req) -@pytest.mark.asyncio -async def test_incompatible_scrapy_request(scrapy_backend): - """The Request must be web_poet.Request and not anything else.""" - - req = scrapy.Request("https://example.com") - - with pytest.raises(TypeError): - await scrapy_backend(req) - - @pytest.fixture def fake_http_response(): return web_poet.HttpResponse( From af0b802d52ba4a492fbd42015857f6e0561646b0 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 5 Apr 2022 13:38:10 +0800 Subject: [PATCH 27/71] remove unrelated file --- scrapy_poet/debug.log | 1 - 1 file changed, 1 deletion(-) delete mode 100644 scrapy_poet/debug.log diff --git a/scrapy_poet/debug.log b/scrapy_poet/debug.log deleted file mode 100644 index e5cd4dd5..00000000 --- a/scrapy_poet/debug.log +++ /dev/null @@ -1 +0,0 @@ -/home/k/.pyenv/versions/3.7.9/bin/python3: can't open file 'multiple_spider_in_one_process.py': [Errno 2] No such file or directory From 42bf6dac12e1ba6c86172da8689cd303f685997a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 May 2022 13:23:33 +0800 Subject: [PATCH 28/71] update imports after web_poet refactoring --- scrapy_poet/backend.py | 9 +++++---- scrapy_poet/page_input_providers.py | 3 +-- scrapy_poet/utils.py | 2 +- tests/test_backend.py | 16 +++++++++------- tests/test_providers.py | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 5dba5808..042066f9 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -3,7 +3,8 @@ import attr import scrapy from scrapy.utils.defer import deferred_to_future -from web_poet.requests import Request, 
RequestBackendError +from web_poet import HttpRequest +from web_poet.exceptions import RequestBackendError from scrapy_poet.utils import scrapy_response_to_http_response @@ -12,10 +13,10 @@ def create_scrapy_backend(backend): - async def scrapy_backend(request: Request): - if not isinstance(request, Request): + async def scrapy_backend(request: HttpRequest): + if not isinstance(request, HttpRequest): raise TypeError( - f"The request should be 'web_poet.Request' but received " + f"The request should be 'web_poet.HttpRequest' but received " f"one of type: '{type(request)}'." ) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 55e93212..40db64cc 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -22,8 +22,7 @@ from scrapy_poet.utils import scrapy_response_to_http_response from scrapy_poet.injection_errors import MalformedProvidedClassesError from scrapy_poet.backend import create_scrapy_backend -from web_poet import HttpResponse, HttpResponseHeaders, Meta -from web_poet.requests import HttpClient +from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, Meta class PageObjectInputProvider: diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 7ea99742..d67fb0f7 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -40,5 +40,5 @@ def scrapy_response_to_http_response(response: Response): url=response.url, body=response.body, status=response.status, - headers=HttpResponseHeaders.from_bytes(response.headers), + headers=HttpResponseHeaders.from_bytes_dict(response.headers), ) diff --git a/tests/test_backend.py b/tests/test_backend.py index ee027560..d1cffc44 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,9 +4,11 @@ import web_poet import scrapy -from scrapy_poet.backend import RequestBackendError, create_scrapy_backend +from web_poet.exceptions import RequestBackendError from tests.utils import AsyncMock +from scrapy_poet.backend import create_scrapy_backend + @pytest.fixture def scrapy_backend(): @@ -19,7 +21,7 @@ async def test_incompatible_request(scrapy_backend): """The Request must have fields that are a subset of `scrapy.Request`.""" @attr.define - class Request(web_poet.Request): + class Request(web_poet.HttpRequest): incompatible_field: str = "value" req = Request("https://example.com") @@ -30,7 +32,7 @@ class Request(web_poet.Request): @pytest.mark.asyncio async def test_incompatible_scrapy_request(scrapy_backend): - """The Request must be web_poet.Request and not anything else.""" + """The Request must be web_poet.HttpRequest and not anything else.""" req = scrapy.Request("https://example.com") @@ -43,14 +45,14 @@ def fake_http_response(): return web_poet.HttpResponse( "https://example.com", b"some content", - 200, - {"Content-Type": "text/html; charset=utf-8"}, + status=200, + headers={"Content-Type": "text/html; charset=utf-8"}, ) @pytest.mark.asyncio async def test_scrapy_poet_backend(fake_http_response): - req = web_poet.Request("https://example.com") + req = web_poet.HttpRequest("https://example.com") with mock.patch( "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock @@ -76,7 +78,7 @@ async def test_scrapy_poet_backend(fake_http_response): @pytest.mark.asyncio async def test_scrapy_poet_backend_ignored_request(): """It should handle IgnoreRequest from Scrapy gracefully.""" - req = web_poet.Request("https://example.com") + req = web_poet.HttpRequest("https://example.com") with mock.patch( "scrapy_poet.backend.deferred_to_future", 
new_callable=AsyncMock diff --git a/tests/test_providers.py b/tests/test_providers.py index c4be0546..abd42f5f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -221,7 +221,7 @@ async def test_http_client_provider(settings): results = provider(set(), crawler) assert isinstance(results[0], HttpClient) - results[0].request_downloader == mock_factory.return_value + results[0]._request_downloader == mock_factory.return_value def test_meta_provider(settings): crawler = get_crawler(Spider, settings) From 3983ed6f1d8315c9f593f27bffe5f79a3dbb4cf7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 13:04:52 +0800 Subject: [PATCH 29/71] fix duplicated entry in CHANGELOG --- CHANGELOG.rst | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c1a34c6c..1e97b227 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,13 +5,6 @@ Changelog TBR --- -* Support for the new features in ``web_poet>=0.2.0`` for supporting additional - requests inside Page Objects: - - * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. - * Using the said additional requests needs ``async/await`` support in - ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. - * Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``. * Support for the new features in ``web_poet>=0.2.0`` for supporting additional requests inside Page Objects: @@ -19,6 +12,7 @@ TBR * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. * Using the said additional requests needs ``async/await`` support in ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. + * add ``async`` support for ``callback_for``. From d76db345627cb3c11f3afba6d50b026288e60ba2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 13:12:45 +0800 Subject: [PATCH 30/71] Remove implementation details about callback_for() in the docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/intro/basic-tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 0c4691c9..2fb59f74 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -200,8 +200,8 @@ returning the result of the ``to_item`` method call. We could use .. tip:: - :func:`~.callback_for` also supports `async generators` by checking if the - ``to_item()`` method is a coroutine. So having the following: + :func:`~.callback_for` also supports `async generators`. So having the + following: .. 
code-block:: python From f1126fb5c1862718e053714ae980672d0420ce8a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 13:23:44 +0800 Subject: [PATCH 31/71] remove else block in callback_for() --- scrapy_poet/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 8520ea0b..bb0da171 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -116,6 +116,6 @@ async def async_parse(*args, page: page_cls, **kwargs): # type: ignore if iscoroutinefunction(page_cls.to_item): setattr(async_parse, _CALLBACK_FOR_MARKER, True) return async_parse - else: - setattr(parse, _CALLBACK_FOR_MARKER, True) - return parse + + setattr(parse, _CALLBACK_FOR_MARKER, True) + return parse From c2bfe89cedcae29cc3e9c2515dcb68d8945ef182 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 14:06:38 +0800 Subject: [PATCH 32/71] Update docs/intro/basic-tutorial.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/intro/basic-tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 2fb59f74..6881fdac 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -218,9 +218,9 @@ returning the result of the ``to_item`` method call. We could use async def parse_book(self, response: DummyResponse, page: BookPage): yield await page.to_item() - This is useful when the Page Objects uses additional requests which relies - heavily on ``async/await`` format. More info on this in this tutorial section: - :ref:`intro-additional-requests`. + This is useful when the Page Objects uses additional requests, which rely + heavily on ``async/await`` syntax. More info on this in this tutorial + section: :ref:`intro-additional-requests`. 
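To illustrate how this wires up end to end, here is a minimal spider sketch. It is illustrative only: the start URL and link selector are placeholders, the ``BookPage`` shown is a trivial stand-in for a real Page Object with an async ``to_item()``, and it assumes the scrapy-poet ``InjectionMiddleware`` is enabled in ``DOWNLOADER_MIDDLEWARES``:

.. code-block:: python

    import scrapy
    import web_poet
    from scrapy_poet import callback_for


    class BookPage(web_poet.ItemWebPage):
        # A stand-in async Page Object; a real one would extract item fields.
        async def to_item(self):
            return {"url": self.url}


    class BooksSpider(scrapy.Spider):
        name = "books"
        # Placeholder URL; replace with a real listing page.
        start_urls = ["http://example.com/books"]

        # Because BookPage.to_item() is a coroutine, callback_for() returns
        # an async generator callback here.
        parse_book = callback_for(BookPage)

        def parse(self, response):
            # Placeholder selector for links to individual book pages.
            yield from response.follow_all(css="a.book", callback=self.parse_book)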
Final result ============ From 75d5a13f75925ffb9529585928cfb72e34f61ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 30 May 2022 13:37:57 +0200 Subject: [PATCH 33/71] Fix tests --- tests/test_injection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_injection.py b/tests/test_injection.py index 99393e52..6871b1b6 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -266,11 +266,11 @@ def callback(response: DummyResponse, class Html(Injectable): url = "http://example.com" - html = """Price: 22€""" + text = """Price: 22€""" @property def selector(self): - return parsel.Selector(self.html) + return parsel.Selector(self.text) class EurDollarRate(Injectable): From 1df25a800f7f1a4df769c2d3b06d48467defaace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 30 May 2022 14:05:16 +0200 Subject: [PATCH 34/71] Handle additional request IgnoreError as per web-poet #38 --- scrapy_poet/backend.py | 11 ++++++----- tests/test_backend.py | 9 +++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 58481b4f..eb124608 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -4,7 +4,7 @@ import scrapy from scrapy.utils.defer import deferred_to_future from web_poet import HttpRequest -from web_poet.exceptions import RequestBackendError +from web_poet.exceptions import HttpError, RequestBackendError from scrapy_poet.utils import scrapy_response_to_http_response @@ -27,11 +27,12 @@ async def scrapy_backend(request: HttpRequest): f"Ensure that it doesn't contain extra fields: {request}" ) + deferred = backend(request) try: - deferred = backend(request) response: scrapy.http.Response = await deferred_to_future(deferred) - return scrapy_response_to_http_response(response) - except scrapy.exceptions.IgnoreRequest: - logger.warning(f"Additional Request Ignored: {request}") + raise HttpError(f"Additional request ignored: {request}") + + return scrapy_response_to_http_response(response) + return scrapy_backend diff --git a/tests/test_backend.py b/tests/test_backend.py index d1cffc44..7db73eb2 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -77,15 +77,16 @@ async def test_scrapy_poet_backend(fake_http_response): @pytest.mark.asyncio async def test_scrapy_poet_backend_ignored_request(): - """It should handle IgnoreRequest from Scrapy gracefully.""" + """It should handle IgnoreRequest from Scrapy according to the web poet + standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") with mock.patch( "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock ) as mock_dtf: - + mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest mock_downloader = mock.MagicMock(return_value=AsyncMock) - mock_downloader.side_effect = scrapy.exceptions.IgnoreRequest scrapy_backend = create_scrapy_backend(mock_downloader) - await scrapy_backend(req) + with pytest.raises(web_poet.exceptions.HttpError): + await scrapy_backend(req) From 2d6da5a95e8d385e328544cc9ccfdcb14d4af181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 30 May 2022 14:10:53 +0200 Subject: [PATCH 35/71] Fix the documentation build --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 2e205d04..84466f9d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # # This is also used if you do content translation via gettext catalogs. 
# Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. From 063ef205e95a487ff11a3e1f755236e26551b0bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 09:05:54 +0200 Subject: [PATCH 36/71] Support a non-asyncio Twisted reactor --- scrapy_poet/backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index eb124608..55a5567f 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -2,7 +2,7 @@ import attr import scrapy -from scrapy.utils.defer import deferred_to_future +from scrapy.utils.defer import maybe_deferred_to_future from web_poet import HttpRequest from web_poet.exceptions import HttpError, RequestBackendError @@ -28,8 +28,9 @@ async def scrapy_backend(request: HttpRequest): ) deferred = backend(request) + deferred_or_future = maybe_deferred_to_future(deferred) try: - response: scrapy.http.Response = await deferred_to_future(deferred) + response = await deferred_or_future except scrapy.exceptions.IgnoreRequest: raise HttpError(f"Additional request ignored: {request}") From 2a9e8d0894445ca60c395e60a6918fff71db3533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 09:11:05 +0200 Subject: [PATCH 37/71] Fix tests --- tests/test_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 7db73eb2..1c10b2e9 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -55,7 +55,7 @@ async def test_scrapy_poet_backend(fake_http_response): req = web_poet.HttpRequest("https://example.com") with mock.patch( - "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock + "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock ) as mock_dtf: mock_dtf.return_value = fake_http_response @@ -82,7 +82,7 @@ async def test_scrapy_poet_backend_ignored_request(): req = web_poet.HttpRequest("https://example.com") with mock.patch( - "scrapy_poet.backend.deferred_to_future", new_callable=AsyncMock + "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock ) as mock_dtf: mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest mock_downloader = mock.MagicMock(return_value=AsyncMock) From f47816b416c78cfa74d997ff2fb1fbdd34609072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 09:33:06 +0200 Subject: [PATCH 38/71] backend: handle unexpected exceptions as HttpRequestError --- scrapy_poet/backend.py | 8 +++++++- tests/test_backend.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 55a5567f..b7e5417f 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -4,7 +4,11 @@ import scrapy from scrapy.utils.defer import maybe_deferred_to_future from web_poet import HttpRequest -from web_poet.exceptions import HttpError, RequestBackendError +from web_poet.exceptions import ( + HttpError, + HttpRequestError, + RequestBackendError, +) from scrapy_poet.utils import scrapy_response_to_http_response @@ -33,6 +37,8 @@ async def scrapy_backend(request: HttpRequest): response = await deferred_or_future except scrapy.exceptions.IgnoreRequest: raise HttpError(f"Additional request ignored: {request}") + except Exception: + raise HttpRequestError(f"Additional 
request failed: {request}") return scrapy_response_to_http_response(response) diff --git a/tests/test_backend.py b/tests/test_backend.py index 1c10b2e9..9aa5df0c 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,6 +4,7 @@ import web_poet import scrapy +import twisted from web_poet.exceptions import RequestBackendError from tests.utils import AsyncMock @@ -90,3 +91,20 @@ async def test_scrapy_poet_backend_ignored_request(): with pytest.raises(web_poet.exceptions.HttpError): await scrapy_backend(req) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_twisted_error(): + """It should handle IgnoreRequest from Scrapy according to the web poet + standard on additional request error handling.""" + req = web_poet.HttpRequest("https://example.com") + + with mock.patch( + "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.side_effect = twisted.internet.error.TimeoutError + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_backend = create_scrapy_backend(mock_downloader) + + with pytest.raises(web_poet.exceptions.HttpRequestError): + await scrapy_backend(req) From c1e8b9381a71bec384856399a0bda8542dc6b0ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 11:00:42 +0200 Subject: [PATCH 39/71] Additional requests: prevent HEAD redirects --- scrapy_poet/backend.py | 3 +++ tests/test_backend.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index b7e5417f..a8228fbd 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -31,6 +31,9 @@ async def scrapy_backend(request: HttpRequest): f"Ensure that it doesn't contain extra fields: {request}" ) + if request.method == "HEAD": + request.meta["dont_redirect"] = True + deferred = backend(request) deferred_or_future = maybe_deferred_to_future(deferred) try: diff --git a/tests/test_backend.py b/tests/test_backend.py index 9aa5df0c..434414b4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -95,8 +95,6 @@ async def test_scrapy_poet_backend_ignored_request(): @pytest.mark.asyncio async def test_scrapy_poet_backend_twisted_error(): - """It should handle IgnoreRequest from Scrapy according to the web poet - standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") with mock.patch( @@ -108,3 +106,20 @@ async def test_scrapy_poet_backend_twisted_error(): with pytest.raises(web_poet.exceptions.HttpRequestError): await scrapy_backend(req) + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_head_redirect(fake_http_response): + req = web_poet.HttpRequest("https://example.com", method="HEAD") + + with mock.patch( + "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.return_value = fake_http_response + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_backend = create_scrapy_backend(mock_downloader) + + await scrapy_backend(req) + + scrapy_request = mock_downloader.call_args.args[0] + assert scrapy_request.meta.get("dont_redirect") is True From e7989ed11300843c5a0c4035ceee473f7ebf26a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 11:03:25 +0200 Subject: [PATCH 40/71] Additional requests: do not filter out duplicate requests --- scrapy_poet/backend.py | 2 +- tests/test_backend.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git 
a/scrapy_poet/backend.py b/scrapy_poet/backend.py index a8228fbd..17102a92 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -24,7 +24,7 @@ async def scrapy_backend(request: HttpRequest): ) try: - request = scrapy.Request(**attr.asdict(request)) + request = scrapy.Request(**attr.asdict(request), dont_filter=True) except TypeError: raise RequestBackendError( f"The given Request isn't compatible with `scrapy.Request`. " diff --git a/tests/test_backend.py b/tests/test_backend.py index 434414b4..c689fec9 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -123,3 +123,20 @@ async def test_scrapy_poet_backend_head_redirect(fake_http_response): scrapy_request = mock_downloader.call_args.args[0] assert scrapy_request.meta.get("dont_redirect") is True + + +@pytest.mark.asyncio +async def test_scrapy_poet_backend_dont_filter(fake_http_response): + req = web_poet.HttpRequest("https://example.com") + + with mock.patch( + "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: + mock_dtf.return_value = fake_http_response + mock_downloader = mock.MagicMock(return_value=AsyncMock) + scrapy_backend = create_scrapy_backend(mock_downloader) + + await scrapy_backend(req) + + scrapy_request = mock_downloader.call_args.args[0] + assert scrapy_request.dont_filter is True From 6252526352dbaa461d2c10ded5b4523b30a4db96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 11:08:52 +0200 Subject: [PATCH 41/71] Make the latest tests compatible with Pytho 3.7 --- tests/test_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index c689fec9..e0acffa9 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -121,7 +121,8 @@ async def test_scrapy_poet_backend_head_redirect(fake_http_response): await scrapy_backend(req) - scrapy_request = mock_downloader.call_args.args[0] + args, kwargs = mock_downloader.call_args + scrapy_request = args[0] assert scrapy_request.meta.get("dont_redirect") is True @@ -138,5 +139,6 @@ async def test_scrapy_poet_backend_dont_filter(fake_http_response): await scrapy_backend(req) - scrapy_request = mock_downloader.call_args.args[0] + args, kwargs = mock_downloader.call_args + scrapy_request = args[0] assert scrapy_request.dont_filter is True From 0358146acb25c9e10cf3f27e04db7afd8a76da44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 11:11:32 +0200 Subject: [PATCH 42/71] =?UTF-8?q?po=5Fargs=20=E2=86=92=20po=5Fmeta?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/intro/advanced-tutorial.rst | 4 ++-- scrapy_poet/page_input_providers.py | 4 ++-- tests/test_providers.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index d592d19c..ee40f9c5 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -131,7 +131,7 @@ This basically acts as a switch to update the behavior of the Page Object: Passing the ``enable_extracting_all_images`` meta value from the spider into the Page Object can be achieved by using **Scrapy's** ``Request.meta`` attribute. -Specifically, any ``dict`` value inside the ``po_args`` parameter inside +Specifically, any ``dict`` value inside the ``po_meta`` parameter inside **Scrapy's** ``Request.meta`` will be passed into ``web_poet.Meta``. 
Let's see it in action: @@ -159,7 +159,7 @@ Let's see it in action: yield scrapy.Request( url=url, callback=self.parse, - meta={"po_args": {"enable_extracting_all_images": True}} + meta={"po_meta": {"enable_extracting_all_images": True}} ) async def parse(self, response, page: ProductPage): diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 40db64cc..cd7b2d24 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -220,6 +220,6 @@ class MetaProvider(PageObjectInputProvider): def __call__(self, to_provide: Set[Callable], request: Request): """Creates a ``web_poet.requests.Meta`` instance based on the data found - from the ``meta["po_args"]`` field of a ``scrapy.http.Response`` instance. + from the ``meta["po_meta"]`` field of a ``scrapy.http.Response`` instance. """ - return [Meta(**request.meta.get("po_args", {}))] + return [Meta(**request.meta.get("po_meta", {}))] diff --git a/tests/test_providers.py b/tests/test_providers.py index abd42f5f..713d82d0 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -233,7 +233,7 @@ def test_meta_provider(settings): assert results[0] == {} expected_data = {"key": "value"} - request.meta.update({"po_args": expected_data}) + request.meta.update({"po_meta": expected_data}) results = provider(set(), request) assert results[0] == expected_data From 0146c83e8379b7326f64327bfbd4a5c3ebce584c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 11:27:33 +0200 Subject: [PATCH 43/71] Document the peculiarities of additional request handling --- docs/intro/advanced-tutorial.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index ee40f9c5..2c8fbe2e 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -26,14 +26,18 @@ the spider. It would work as-is because of the readily available out of the box. This supplies the Page Object with the necessary ``web_poet.HttpClient`` instance. -Take note the HTTP Downloader implementation that **scrapy-poet** provides to -``web_poet.HttpClient`` would be the **Scrapy Downloader**. -.. tip:: +The HTTP client implementation that **scrapy-poet** provides to +``web_poet.HttpClient`` handles requests as follows: - This means that the additional requests inside a Page Object will have access - to the **Downloader Middlewares** that the Spider is using. +- Requests go through downloader middlewares, but they do not go through + spider middlewares or through the scheduler. +- Duplicate requests are not filtered out. + +- In line with the web-poet specification for additional requests, + ``Request.meta['dont_redirect']`` is set to ``True`` for requests with the + ``HEAD`` HTTP method. 
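For example, a Page Object can lean on the ``HEAD`` behaviour described above to probe a URL without following redirects. The sketch below is illustrative only: the page object, the probed URL, and the item fields are made up, and it assumes ``HttpClient.request()`` accepts a ``method`` argument as in web-poet's additional requests API:

.. code-block:: python

    import attr
    import web_poet
    from web_poet import HttpClient
    from web_poet.exceptions import HttpError


    @attr.define
    class MirrorCheckPage(web_poet.ItemWebPage):
        http_client: HttpClient

        async def to_item(self):
            item = {"url": self.url}
            try:
                # Sent as HEAD, so scrapy-poet sets dont_redirect=True and the
                # original status code is observed instead of a redirect
                # target's.
                response = await self.http_client.request(
                    "http://example.com/mirror", method="HEAD"
                )
            except HttpError:
                # Raised, for instance, when a downloader middleware ignores
                # the additional request.
                item["mirror_status"] = None
            else:
                item["mirror_status"] = response.status
            return item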
Suppose we have the following Page Object: From b4ce3959c01bd9100682c306d6a27a849a05e73d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 15:39:05 +0200 Subject: [PATCH 44/71] Test both asyncio and non-asyncio reactors --- CHANGELOG.rst | 3 +- docs/intro/advanced-tutorial.rst | 3 -- tests/test_backend.py | 62 +++++++++++++++++++++++++++----- tests/test_callback_for.py | 5 +-- tests/test_providers.py | 4 +-- tox.ini | 8 ++--- 6 files changed, 64 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1032d14e..9b56ef98 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,8 +10,7 @@ TBR requests inside Page Objects: * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. - * Using the said additional requests needs ``async/await`` support in - ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. + * The minimum Scrapy version is now ``2.6.0``. * We have these **backward incompatible** changes since the ``web_poet.OverrideRule`` follow a different structure: diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index 2c8fbe2e..a48c104c 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -93,9 +93,6 @@ Note that we needed to update the ``parse()`` method to be an ``async`` method, since the ``to_item()`` method of the Page Object we're using is an ``async`` method as well. -This is also the primary reason why **scrapy-poet** requires ``scrapy>=2.6.0`` -since it's the minimum version that has full :mod:`asyncio` support. - Meta ==== diff --git a/tests/test_backend.py b/tests/test_backend.py index e0acffa9..142148fa 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -5,10 +5,17 @@ import web_poet import scrapy import twisted -from web_poet.exceptions import RequestBackendError +from pytest_twisted import ensureDeferred, inlineCallbacks +from scrapy import Spider from tests.utils import AsyncMock +from web_poet import HttpClient +from web_poet.exceptions import RequestBackendError +from web_poet.pages import ItemWebPage from scrapy_poet.backend import create_scrapy_backend +from tests.utils import ( + crawl_single_item, make_crawler, HtmlResource, MockServer +) @pytest.fixture @@ -17,7 +24,7 @@ def scrapy_backend(): return create_scrapy_backend(mock_backend) -@pytest.mark.asyncio +@ensureDeferred async def test_incompatible_request(scrapy_backend): """The Request must have fields that are a subset of `scrapy.Request`.""" @@ -31,7 +38,7 @@ class Request(web_poet.HttpRequest): await scrapy_backend(req) -@pytest.mark.asyncio +@ensureDeferred async def test_incompatible_scrapy_request(scrapy_backend): """The Request must be web_poet.HttpRequest and not anything else.""" @@ -51,7 +58,7 @@ def fake_http_response(): ) -@pytest.mark.asyncio +@ensureDeferred async def test_scrapy_poet_backend(fake_http_response): req = web_poet.HttpRequest("https://example.com") @@ -76,7 +83,7 @@ async def test_scrapy_poet_backend(fake_http_response): assert len(response.headers) == 1 -@pytest.mark.asyncio +@ensureDeferred async def test_scrapy_poet_backend_ignored_request(): """It should handle IgnoreRequest from Scrapy according to the web poet standard on additional request error handling.""" @@ -93,7 +100,7 @@ async def test_scrapy_poet_backend_ignored_request(): await scrapy_backend(req) -@pytest.mark.asyncio +@ensureDeferred async def test_scrapy_poet_backend_twisted_error(): req = web_poet.HttpRequest("https://example.com") @@ -108,7 
+115,7 @@ async def test_scrapy_poet_backend_twisted_error(): await scrapy_backend(req) -@pytest.mark.asyncio +@ensureDeferred async def test_scrapy_poet_backend_head_redirect(fake_http_response): req = web_poet.HttpRequest("https://example.com", method="HEAD") @@ -126,7 +133,7 @@ async def test_scrapy_poet_backend_head_redirect(fake_http_response): assert scrapy_request.meta.get("dont_redirect") is True -@pytest.mark.asyncio +@ensureDeferred async def test_scrapy_poet_backend_dont_filter(fake_http_response): req = web_poet.HttpRequest("https://example.com") @@ -142,3 +149,42 @@ async def test_scrapy_poet_backend_dont_filter(fake_http_response): args, kwargs = mock_downloader.call_args scrapy_request = args[0] assert scrapy_request.dont_filter is True + + +@inlineCallbacks +def test_scrapy_poet_backend_await(): + """Make sure that the awaiting of the backend call works. + + For this test to pass, the resulting deferred must be awaited as such when + using a non-asyncio Twisted reactor, but first converted into a future + when using an asyncio Twisted reactor. + """ + items = [] + with MockServer(HtmlResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + await self.http_client.request(server.root_url) + return {'foo': 'bar'} + + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + assert items == [{'foo': 'bar'}] diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 3002de71..52024ed6 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -1,5 +1,6 @@ import scrapy import pytest +from pytest_twisted import ensureDeferred from scrapy.utils.reqser import request_to_dict from web_poet.pages import ItemPage, ItemWebPage @@ -50,7 +51,7 @@ def test_callback_for(): assert list(result) == ['fake item page'] -@pytest.mark.asyncio +@ensureDeferred async def test_callback_for_async(): cb = callback_for(FakeItemPageAsync) assert callable(cb) @@ -72,7 +73,7 @@ def test_callback_for_instance_method(): assert list(result) == ['fake item page'] -@pytest.mark.asyncio +@ensureDeferred async def test_callback_for_instance_method_async(): spider = MySpiderAsync() response = DummyResponse('http://example.com/') diff --git a/tests/test_providers.py b/tests/test_providers.py index 713d82d0..f9fd2929 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -4,7 +4,7 @@ import attr import json import pytest -from pytest_twisted import inlineCallbacks +from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy_poet import HttpResponseProvider from twisted.python.failure import Failure @@ -209,7 +209,7 @@ def test_response_data_provider_fingerprint(settings): assert json.loads(fp) -@pytest.mark.asyncio +@ensureDeferred async def test_http_client_provider(settings): crawler = get_crawler(Spider, settings) crawler.engine = AsyncMock() diff --git a/tox.ini b/tox.ini index fdbfc2f6..4e1203a2 100644 --- a/tox.ini +++ b/tox.ini @@ -1,14 +1,10 @@ [tox] envlist = py37,py38,py39,py310,mypy,docs -[pytest] -asyncio_mode = strict - [testenv] deps = pytest pytest-cov - pytest-asyncio scrapy >= 2.6.0 pytest-twisted web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet 
@@ -19,6 +15,10 @@ commands = --doctest-modules \ {posargs:scrapy_poet tests} +[testenv:asyncio] +commands = + {[testenv]commands} --reactor=asyncio + [testenv:mypy] deps = mypy==0.790 From bedbb689612a6d6ef4e7aa0cbae1c853ac2af681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 15:44:39 +0200 Subject: [PATCH 45/71] GitHub Actions: test both reactors --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ab3146fb..4a34949f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,6 +17,7 @@ jobs: fail-fast: false matrix: python-version: ['3.7', '3.8', '3.9', '3.10'] + toxenv: ['py', 'asyncio'] steps: - uses: actions/checkout@v2 @@ -29,6 +30,8 @@ jobs: python -m pip install --upgrade pip python -m pip install tox - name: tox + env: + TOXENV: ${{ matrix.toxenv }} run: | tox -e py - name: coverage From 98a47bac069ca56002f9a309131c6b16870d5047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 15:50:51 +0200 Subject: [PATCH 46/71] Use raise-from syntax for additional request exceptions --- scrapy_poet/backend.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 17102a92..7f7f81e6 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -38,10 +38,11 @@ async def scrapy_backend(request: HttpRequest): deferred_or_future = maybe_deferred_to_future(deferred) try: response = await deferred_or_future - except scrapy.exceptions.IgnoreRequest: - raise HttpError(f"Additional request ignored: {request}") - except Exception: - raise HttpRequestError(f"Additional request failed: {request}") + except scrapy.exceptions.IgnoreRequest: as e + raise HttpError(f"Additional request ignored: {request}") from e + except Exception as e: + message = f"Additional request failed: {request}" + raise HttpRequestError(message) from e return scrapy_response_to_http_response(response) From 4d49428beec726092ec4d3cbcd0f71729a9c5f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 2 Jun 2022 15:53:29 +0200 Subject: [PATCH 47/71] Fix syntax error --- scrapy_poet/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 7f7f81e6..1fb5613f 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -38,7 +38,7 @@ async def scrapy_backend(request: HttpRequest): deferred_or_future = maybe_deferred_to_future(deferred) try: response = await deferred_or_future - except scrapy.exceptions.IgnoreRequest: as e + except scrapy.exceptions.IgnoreRequest as e: raise HttpError(f"Additional request ignored: {request}") from e except Exception as e: message = f"Additional request failed: {request}" From 1d7ef884bcd645fff7832d67020189a8e06ef086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 11:12:36 +0200 Subject: [PATCH 48/71] Move request conversion into a function --- scrapy_poet/backend.py | 19 +++++++++++-------- scrapy_poet/utils.py | 9 +++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 1fb5613f..053f6114 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -1,6 +1,5 @@ import logging -import attr import scrapy from scrapy.utils.defer import maybe_deferred_to_future from web_poet import HttpRequest @@ -10,7 +9,10 @@ RequestBackendError, 
) -from scrapy_poet.utils import scrapy_response_to_http_response +from scrapy_poet.utils import ( + http_request_to_scrapy_request, + scrapy_response_to_http_response, +) logger = logging.getLogger(__name__) @@ -24,24 +26,25 @@ async def scrapy_backend(request: HttpRequest): ) try: - request = scrapy.Request(**attr.asdict(request), dont_filter=True) + scrapy_request = http_request_to_scrapy_request(request) except TypeError: raise RequestBackendError( f"The given Request isn't compatible with `scrapy.Request`. " f"Ensure that it doesn't contain extra fields: {request}" ) - if request.method == "HEAD": - request.meta["dont_redirect"] = True + if scrapy_request.method == "HEAD": + scrapy_request.meta["dont_redirect"] = True - deferred = backend(request) + deferred = backend(scrapy_request) deferred_or_future = maybe_deferred_to_future(deferred) try: response = await deferred_or_future except scrapy.exceptions.IgnoreRequest as e: - raise HttpError(f"Additional request ignored: {request}") from e + message = f"Additional request ignored: {scrapy_request}" + raise HttpError(message) from e except Exception as e: - message = f"Additional request failed: {request}" + message = f"Additional request failed: {scrapy_request}" raise HttpRequestError(message) from e return scrapy_response_to_http_response(response) diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 28097586..53c645fe 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,7 +1,8 @@ import os -from web_poet import HttpResponse, HttpResponseHeaders -from scrapy.http import Response +import attr +from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders +from scrapy.http import Request, Response from scrapy.utils.project import project_data_dir, inside_project @@ -18,6 +19,10 @@ def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") - return path +def http_request_to_scrapy_request(request: HttpRequest) -> Request: + return Request(**attr.asdict(request), dont_filter=True) + + def scrapy_response_to_http_response(response: Response): """Convenience method to convert a ``scrapy.http.Response`` into a ``web_poet.HttpResponse``. From 9f9dfa1b97843dc77b61299e5c6bf5d385c1fb6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 12:07:25 +0200 Subject: [PATCH 49/71] On request conversion, silently ignore unknown attributes --- scrapy_poet/backend.py | 5 ++- scrapy_poet/utils.py | 10 ++++-- tests/test_backend.py | 19 ++++++----- tests/test_utils.py | 77 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 98 insertions(+), 13 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 053f6114..d49d35e7 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -26,7 +26,10 @@ async def scrapy_backend(request: HttpRequest): ) try: - scrapy_request = http_request_to_scrapy_request(request) + scrapy_request = http_request_to_scrapy_request( + request, + dont_filter=True, + ) except TypeError: raise RequestBackendError( f"The given Request isn't compatible with `scrapy.Request`. 
" diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 53c645fe..f6050712 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -19,8 +19,14 @@ def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") - return path -def http_request_to_scrapy_request(request: HttpRequest) -> Request: - return Request(**attr.asdict(request), dont_filter=True) +def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> Request: + return Request( + url=str(request.url), + method=request.method, + headers=request.headers, + body=request.body, + **kwargs, + ) def scrapy_response_to_http_response(response: Response): diff --git a/tests/test_backend.py b/tests/test_backend.py index 142148fa..84859e49 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -24,18 +24,19 @@ def scrapy_backend(): return create_scrapy_backend(mock_backend) -@ensureDeferred -async def test_incompatible_request(scrapy_backend): - """The Request must have fields that are a subset of `scrapy.Request`.""" +# TODO: Restore or remove. +#@ensureDeferred +#async def test_incompatible_request(scrapy_backend): + #"""The Request must have fields that are a subset of `scrapy.Request`.""" - @attr.define - class Request(web_poet.HttpRequest): - incompatible_field: str = "value" + #@attr.define + #class Request(web_poet.HttpRequest): + #incompatible_field: str = "value" - req = Request("https://example.com") + #req = Request("https://example.com") - with pytest.raises(RequestBackendError): - await scrapy_backend(req) + #with pytest.raises(RequestBackendError): + #await scrapy_backend(req) @ensureDeferred diff --git a/tests/test_utils.py b/tests/test_utils.py index 05e55542..1c98692a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,14 @@ from unittest import mock from pathlib import PosixPath -from scrapy_poet.utils import get_scrapy_data_path +import pytest +from scrapy import Request +from web_poet import HttpRequest + +from scrapy_poet.utils import ( + get_scrapy_data_path, + http_request_to_scrapy_request, +) @mock.patch("scrapy_poet.utils.os.makedirs") @@ -19,3 +26,71 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path): mock_makedirs.assert_called_once() mock_makedirs.assert_called_with(path, exist_ok=True) + + +@pytest.mark.parametrize( + "http_request,kwargs,scrapy_request", + ( + ( + HttpRequest("https://example.com"), + {}, + Request("https://example.com"), + ), + ( + HttpRequest("https://example.com"), + {"dont_filter": True}, + Request("https://example.com", dont_filter=True), + ), + ( + HttpRequest("https://example.com", method="POST"), + {}, + Request("https://example.com", method="POST"), + ), + ( + HttpRequest("https://example.com", headers={"a": "b"}), + {}, + Request("https://example.com", headers={"a": "b"}), + ), + ( + HttpRequest("https://example.com", headers={"a": "b"}), + {}, + Request("https://example.com", headers=(("a", "b"),)), + ), + ( + HttpRequest("https://example.com", headers=(("a", "b"),)), + {}, + Request("https://example.com", headers=(("a", "b"),)), + ), + ( + HttpRequest( + "https://example.com", + headers=(("a", "b"), ("a", "c")), + ), + {}, + Request( + "https://example.com", + headers=(("a", "b"), ("a", "c")), + ), + ), + ( + HttpRequest("https://example.com", body=b"a"), + {}, + Request("https://example.com", body=b"a"), + ), + ), +) +def test_http_request_to_scrapy_request(http_request, kwargs, scrapy_request): + result = http_request_to_scrapy_request(http_request, **kwargs) + assert result.url == 
scrapy_request.url + assert result.method == scrapy_request.method + assert result.headers == scrapy_request.headers + assert result.body == scrapy_request.body + assert result.meta == scrapy_request.meta + assert result.cb_kwargs == scrapy_request.cb_kwargs + assert result.cookies == scrapy_request.cookies + assert result.encoding == scrapy_request.encoding + assert result.priority == scrapy_request.priority + assert result.dont_filter == scrapy_request.dont_filter + assert result.callback == scrapy_request.callback + assert result.errback == scrapy_request.errback + assert result.flags == scrapy_request.flags From 6f5218f9729525b62647a506cf1e4dcc294d1d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 12:13:36 +0200 Subject: [PATCH 50/71] Contextualize additional request exception handling --- scrapy_poet/backend.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index d49d35e7..6b96ba06 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -44,9 +44,16 @@ async def scrapy_backend(request: HttpRequest): try: response = await deferred_or_future except scrapy.exceptions.IgnoreRequest as e: + # A Scrapy downloader middleware has caused the request to be + # ignored. message = f"Additional request ignored: {scrapy_request}" raise HttpError(message) from e except Exception as e: + # This could be caused either by network errors (Twisted + # exceptions, OpenSSL, exceptions, etc.) or by unhandled exceptions + # in Scrapy downloader middlewares. We assume the former (raise + # HttpRequestError instead of HttpError), it being the most likely, + # and the latter only happening due to badly written code. message = f"Additional request failed: {scrapy_request}" raise HttpRequestError(message) from e From 850ad4e808acc3ecf9d9d34cabfa99a3e99d4810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 13:44:58 +0200 Subject: [PATCH 51/71] Pass user-defined encoding on response conversion --- scrapy_poet/utils.py | 7 +++- tests/test_utils.py | 78 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 3 deletions(-) diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index f6050712..3b79a560 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -29,13 +29,18 @@ def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> Request: ) -def scrapy_response_to_http_response(response: Response): +def scrapy_response_to_http_response(response: Response) -> HttpResponse: """Convenience method to convert a ``scrapy.http.Response`` into a ``web_poet.HttpResponse``. 
""" + kwargs = {} + encoding = getattr(response, "_encoding", None) + if encoding: + kwargs["encoding"] = encoding return HttpResponse( url=response.url, body=response.body, status=response.status, headers=HttpResponseHeaders.from_bytes_dict(response.headers), + **kwargs, ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1c98692a..f6764695 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,12 +2,13 @@ from pathlib import PosixPath import pytest -from scrapy import Request -from web_poet import HttpRequest +from scrapy.http import Request, Response, TextResponse +from web_poet import HttpRequest, HttpResponse from scrapy_poet.utils import ( get_scrapy_data_path, http_request_to_scrapy_request, + scrapy_response_to_http_response, ) @@ -94,3 +95,76 @@ def test_http_request_to_scrapy_request(http_request, kwargs, scrapy_request): assert result.callback == scrapy_request.callback assert result.errback == scrapy_request.errback assert result.flags == scrapy_request.flags + + +@pytest.mark.parametrize( + "scrapy_response,http_response", + ( + ( + Response("https://example.com"), + HttpResponse("https://example.com", body=b"", status=200), + ), + ( + Response("https://example.com", body=b"a"), + HttpResponse("https://example.com", body=b"a", status=200), + ), + ( + Response("https://example.com", status=429), + HttpResponse("https://example.com", body=b"", status=429), + ), + ( + Response("https://example.com", headers={"a": "b"}), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers={"a": "b"}, + ), + ), + ( + Response("https://example.com", headers={"a": "b"}), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"),), + ), + ), + ( + Response("https://example.com", headers=(("a", "b"),)), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"),), + ), + ), + pytest.param( + Response("https://example.com", headers=(("a", "b"), ("a", "c"))), + HttpResponse( + "https://example.com", + body=b"", + status=200, + headers=(("a", "b"), ("a", "c")), + ), + marks=pytest.mark.xfail( + reason="https://github.com/scrapy/scrapy/issues/5515", + ), + ), + ( + TextResponse("https://example.com", body="a", encoding="ascii"), + HttpResponse("https://example.com", body=b"a", status=200, encoding="ascii"), + ), + ( + TextResponse("https://example.com", body="a", encoding="utf-8"), + HttpResponse("https://example.com", body=b"a", status=200, encoding="utf-8"), + ), + ), +) +def test_scrapy_response_to_http_response(scrapy_response, http_response): + result = scrapy_response_to_http_response(scrapy_response) + assert result.url == http_response.url + assert result.body == http_response.body + assert result.status == http_response.status + assert result.headers == http_response.headers + assert result._encoding == http_response._encoding From 5f483c4dc92f03b6ab374f2e6073c4bcee7fd3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 13:53:22 +0200 Subject: [PATCH 52/71] Support non-string values as meta keys --- scrapy_poet/page_input_providers.py | 2 +- tests/test_providers.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index cd7b2d24..51ad3204 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -222,4 +222,4 @@ def __call__(self, to_provide: Set[Callable], request: Request): """Creates a ``web_poet.requests.Meta`` instance 
based on the data found
         from the ``meta["po_meta"]`` field of a ``scrapy.http.Response`` instance.
         """
-        return [Meta(**request.meta.get("po_meta", {}))]
+        return [Meta(request.meta.get("po_meta", {}))]
diff --git a/tests/test_providers.py b/tests/test_providers.py
index f9fd2929..a6e94878 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -237,3 +237,10 @@ def test_meta_provider(settings):
     results = provider(set(), request)
 
     assert results[0] == expected_data
+
+    # Check that keys that are invalid Python variable names work.
+    expected_data = {1: "a"}
+    request.meta.update({"po_meta": expected_data})
+    results = provider(set(), request)
+
+    assert results[0] == expected_data

From 19a32839dd89ea28012e1d6bd69adb8f251d6c2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Mon, 6 Jun 2022 14:00:49 +0200
Subject: [PATCH 53/71] Remove request conversion TypeError handling

---
 scrapy_poet/backend.py | 15 ++++-----------
 tests/test_backend.py  | 16 ----------------
 2 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py
index 6b96ba06..212d4fe5 100644
--- a/scrapy_poet/backend.py
+++ b/scrapy_poet/backend.py
@@ -6,7 +6,6 @@
 from web_poet.exceptions import (
     HttpError,
     HttpRequestError,
-    RequestBackendError,
 )
 
 from scrapy_poet.utils import (
@@ -25,16 +24,10 @@ async def scrapy_backend(request: HttpRequest):
                 f"one of type: '{type(request)}'."
             )
 
-        try:
-            scrapy_request = http_request_to_scrapy_request(
-                request,
-                dont_filter=True,
-            )
-        except TypeError:
-            raise RequestBackendError(
-                f"The given Request isn't compatible with `scrapy.Request`. "
-                f"Ensure that it doesn't contain extra fields: {request}"
-            )
+        scrapy_request = http_request_to_scrapy_request(
+            request,
+            dont_filter=True,
+        )
 
         if scrapy_request.method == "HEAD":
             scrapy_request.meta["dont_redirect"] = True
diff --git a/tests/test_backend.py b/tests/test_backend.py
index 84859e49..08f08ce2 100644
--- a/tests/test_backend.py
+++ b/tests/test_backend.py
@@ -9,7 +9,6 @@
 from scrapy import Spider
 from tests.utils import AsyncMock
 from web_poet import HttpClient
-from web_poet.exceptions import RequestBackendError
 from web_poet.pages import ItemWebPage
 
 from scrapy_poet.backend import create_scrapy_backend
@@ -24,21 +23,6 @@ def scrapy_backend():
     return create_scrapy_backend(mock_backend)
 
 
-# TODO: Restore or remove.
-#@ensureDeferred -#async def test_incompatible_request(scrapy_backend): - #"""The Request must have fields that are a subset of `scrapy.Request`.""" - - #@attr.define - #class Request(web_poet.HttpRequest): - #incompatible_field: str = "value" - - #req = Request("https://example.com") - - #with pytest.raises(RequestBackendError): - #await scrapy_backend(req) - - @ensureDeferred async def test_incompatible_scrapy_request(scrapy_backend): """The Request must be web_poet.HttpRequest and not anything else.""" From 4dcca57e503952346161ba9c93cb4d3341391fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 6 Jun 2022 18:44:02 +0200 Subject: [PATCH 54/71] Provide integration tests for good and bad additional responses --- tests/test_backend.py | 111 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 13 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 08f08ce2..cf6acc68 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -1,14 +1,18 @@ import attr -import pytest from unittest import mock -import web_poet +import pytest import scrapy import twisted +import web_poet from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Spider from tests.utils import AsyncMock +from twisted.internet.task import deferLater +from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET from web_poet import HttpClient +from web_poet.exceptions import HttpResponseError from web_poet.pages import ItemWebPage from scrapy_poet.backend import create_scrapy_backend @@ -136,25 +140,88 @@ async def test_scrapy_poet_backend_dont_filter(fake_http_response): assert scrapy_request.dont_filter is True -@inlineCallbacks -def test_scrapy_poet_backend_await(): - """Make sure that the awaiting of the backend call works. +class LeafResource(Resource): + isLeaf = True + + def deferRequest(self, request, delay, f, *a, **kw): + def _cancelrequest(_): + # silence CancelledError + d.addErrback(lambda _: None) + d.cancel() + + d = deferLater(reactor, delay, f, *a, **kw) + request.notifyFinish().addErrback(_cancelrequest) + return d - For this test to pass, the resulting deferred must be awaited as such when - using a non-asyncio Twisted reactor, but first converted into a future - when using an asyncio Twisted reactor. 
- """ + +class EchoResource(LeafResource): + def render_GET(self, request): + return request.content.read() + + +@inlineCallbacks +def test_additional_requests_success(): items = [] - with MockServer(HtmlResource) as server: + + with MockServer(EchoResource) as server: @attr.define class ItemPage(ItemWebPage): http_client: HttpClient async def to_item(self): - await self.http_client.request(server.root_url) - return {'foo': 'bar'} + response = await self.http_client.request( + server.root_url, + body=b'bar', + ) + return {'foo': response.body.decode()} + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] + + +class StatusResource(LeafResource): + def render_GET(self, request): + decoded_body = request.content.read().decode() + if decoded_body: + request.setResponseCode(int(decoded_body)) + return b"" + + +@inlineCallbacks +def test_additional_requests_bad_response(): + items = [] + + with MockServer(StatusResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'400', + ) + except HttpResponseError: + return {'foo': 'bar'} class TestSpider(Spider): name = 'test_spider' @@ -172,4 +239,22 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'foo': 'bar'}] + + assert items == [{'foo': 'bar'}] + + +@inlineCallbacks +def test_additional_requests_connection_issue(): + ... # TODO + + +@inlineCallbacks +def test_additional_requests_ignored_request(): + ... # TODO + + +@inlineCallbacks +def test_additional_requests_dont_filter(): + ... # TODO + # Test using the same URL for the source request and for 2 additional + # requests. From aa09109ffa1c948dfddb9d125b08f0a09ebf943e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 12:19:12 +0200 Subject: [PATCH 55/71] =?UTF-8?q?Meta=20=E2=86=92=20PageParams?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.rst | 3 ++- docs/intro/advanced-tutorial.rst | 27 ++++++++++++++------------- scrapy_poet/middleware.py | 8 ++++++-- scrapy_poet/page_input_providers.py | 15 ++++++++------- tests/test_backend.py | 22 +++++++++++----------- tests/test_providers.py | 12 ++++++------ 6 files changed, 47 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c817ab02..275b29b9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,7 +9,8 @@ TBR * Support for the new features in ``web_poet>=0.2.0`` for supporting additional requests inside Page Objects: - * Created new providers for ``web_poet.Meta`` and ``web_poet.HttpClient``. + * Created new providers for ``web_poet.PageParams`` and + ``web_poet.HttpClient``. * The minimum Scrapy version is now ``2.6.0``. 
 * We have these **backward incompatible** changes since the ``web_poet.OverrideRule`` follow a different structure:
diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst
index a48c104c..d45b4ccd 100644
--- a/docs/intro/advanced-tutorial.rst
+++ b/docs/intro/advanced-tutorial.rst
@@ -8,12 +8,12 @@ This section intends to go over the supported features in **web-poet** by
 **scrapy-poet**:
 
     * ``web_poet.HttpClient``
-    * ``web_poet.Meta``
+    * ``web_poet.PageParams``
 
 These are mainly achieved by **scrapy-poet** implementing **providers** for them:
 
     * :class:`scrapy_poet.page_input_providers.HttpClientProvider`
-    * :class:`scrapy_poet.page_input_providers.MetaProvider`
+    * :class:`scrapy_poet.page_input_providers.PageParamsProvider`
 
 .. _`intro-additional-requests`:
 
@@ -94,11 +94,11 @@ since the ``to_item()`` method of the Page Object we're using is an ``async``
 method as well.
 
 
-Meta
-====
+Page params
+===========
 
-Using ``web_poet.Meta`` allows the Scrapy spider to pass any arbitrary information
-into the Page Object.
+Using ``web_poet.PageParams`` allows the Scrapy spider to pass any arbitrary
+information into the Page Object.
 
 Suppose we update the earlier Page Object to control the additional request.
 This basically acts as a switch to update the behavior of the Page Object:
@@ -112,7 +112,7 @@ This basically acts as a switch to update the behavior of the Page Object:
     @attr.define
     class ProductPage(web_poet.ItemWebPage):
         http_client: web_poet.HttpClient
-        meta: web_poet.Meta
+        page_params: web_poet.PageParams
 
         async def to_item(self):
            item = {
@@ -122,7 +122,7 @@ This basically acts as a switch to update the behavior of the Page Object:
             }
 
             # Simulates clicking on a button that says "View All Images"
-            if self.meta.get("enable_extracting_all_images")
+            if self.page_params.get("enable_extracting_all_images"):
                 response: web_poet.HttpResponse = await self.http_client.get(
                     f"https://api.example.com/v2/images?id={item['product_id']}"
                 )
@@ -130,10 +130,11 @@ This basically acts as a switch to update the behavior of the Page Object:
 
             return item
 
-Passing the ``enable_extracting_all_images`` meta value from the spider into
-the Page Object can be achieved by using **Scrapy's** ``Request.meta`` attribute.
-Specifically, any ``dict`` value inside the ``po_meta`` parameter inside
-**Scrapy's** ``Request.meta`` will be passed into ``web_poet.Meta``.
+Passing the ``enable_extracting_all_images`` page parameter from the spider
+into the Page Object can be achieved by using **Scrapy's** ``Request.meta``
+attribute. Specifically, any ``dict`` value inside the ``page_params``
+parameter inside **Scrapy's** ``Request.meta`` will be passed into
+``web_poet.PageParams``.
Let's see it in action: @@ -160,7 +161,7 @@ Let's see it in action: yield scrapy.Request( url=url, callback=self.parse, - meta={"po_meta": {"enable_extracting_all_images": True}} + meta={"page_params": {"enable_extracting_all_images": True}} ) async def parse(self, response, page: ProductPage): diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index fd356c77..fde14af2 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -13,7 +13,11 @@ from scrapy.utils.misc import create_instance, load_object from .api import DummyResponse -from .page_input_providers import HttpResponseProvider, HttpClientProvider, MetaProvider +from .page_input_providers import ( + HttpClientProvider, + HttpResponseProvider, + PageParamsProvider, +) from .overrides import OverridesRegistry from .injection import Injector @@ -24,7 +28,7 @@ DEFAULT_PROVIDERS = { HttpResponseProvider: 500, HttpClientProvider: 600, - MetaProvider: 700, + PageParamsProvider: 700, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 9b30c793..06364f7d 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -21,7 +21,7 @@ from scrapy_poet.utils import scrapy_response_to_http_response from scrapy_poet.injection_errors import MalformedProvidedClassesError from scrapy_poet.backend import create_scrapy_backend -from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, Meta +from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams class PageObjectInputProvider: @@ -213,12 +213,13 @@ def __call__(self, to_provide: Set[Callable], crawler: Crawler): return [HttpClient(request_downloader=backend)] -class MetaProvider(PageObjectInputProvider): - """This class provides ``web_poet.page_inputs.Meta`` instances.""" - provided_classes = {Meta} +class PageParamsProvider(PageObjectInputProvider): + """This class provides ``web_poet.page_inputs.PageParams`` instances.""" + provided_classes = {PageParams} def __call__(self, to_provide: Set[Callable], request: Request): - """Creates a ``web_poet.requests.Meta`` instance based on the data found - from the ``meta["po_meta"]`` field of a ``scrapy.http.Response`` instance. + """Creates a ``web_poet.requests.PageParams`` instance based on the + data found from the ``meta["page_params"]`` field of a + ``scrapy.http.Response`` instance. """ - return [Meta(request.meta.get("po_meta", {}))] + return [PageParams(request.meta.get("page_params", {}))] diff --git a/tests/test_backend.py b/tests/test_backend.py index cf6acc68..e4e0ba2c 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -243,18 +243,18 @@ async def parse(self, response, page: ItemPage): assert items == [{'foo': 'bar'}] -@inlineCallbacks -def test_additional_requests_connection_issue(): - ... # TODO +#@inlineCallbacks +#def test_additional_requests_connection_issue(): + #... # TODO -@inlineCallbacks -def test_additional_requests_ignored_request(): - ... # TODO +#@inlineCallbacks +#def test_additional_requests_ignored_request(): + #... # TODO -@inlineCallbacks -def test_additional_requests_dont_filter(): - ... # TODO - # Test using the same URL for the source request and for 2 additional - # requests. +#@inlineCallbacks +#def test_additional_requests_dont_filter(): + #... # TODO + ## Test using the same URL for the source request and for 2 additional + ## requests. 
diff --git a/tests/test_providers.py b/tests/test_providers.py index a6e94878..d035366f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -15,9 +15,9 @@ from scrapy.utils.test import get_crawler from scrapy_poet.page_input_providers import ( CacheDataProviderMixin, - PageObjectInputProvider, HttpClientProvider, - MetaProvider, + PageObjectInputProvider, + PageParamsProvider, ) from tests.utils import crawl_single_item, HtmlResource, AsyncMock from web_poet import HttpResponse, HttpClient @@ -223,9 +223,9 @@ async def test_http_client_provider(settings): results[0]._request_downloader == mock_factory.return_value -def test_meta_provider(settings): +def test_page_params_provider(settings): crawler = get_crawler(Spider, settings) - provider = MetaProvider(crawler) + provider = PageParamsProvider(crawler) request = scrapy.http.Request("https://example.com") results = provider(set(), request) @@ -233,14 +233,14 @@ def test_meta_provider(settings): assert results[0] == {} expected_data = {"key": "value"} - request.meta.update({"po_meta": expected_data}) + request.meta.update({"page_params": expected_data}) results = provider(set(), request) assert results[0] == expected_data # Check that keys that are invalid Python variable names work. expected_data = {1: "a"} - request.meta.update({"po_meta": expected_data}) + request.meta.update({"page_params": expected_data}) results = provider(set(), request) assert results[0] == expected_data From dcb6716ea7dfac73f8d8364d375664b881dbf658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 13:18:29 +0200 Subject: [PATCH 56/71] Implement test_additional_requests_connection_issue --- tests/test_backend.py | 74 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index e4e0ba2c..00a4a7e7 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -1,4 +1,5 @@ import attr +from functools import partial from unittest import mock import pytest @@ -8,14 +9,16 @@ from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Spider from tests.utils import AsyncMock +from twisted.internet import reactor from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET from web_poet import HttpClient -from web_poet.exceptions import HttpResponseError +from web_poet.exceptions import HttpRequestError, HttpResponseError from web_poet.pages import ItemWebPage from scrapy_poet.backend import create_scrapy_backend +from scrapy_poet.utils import http_request_to_scrapy_request from tests.utils import ( crawl_single_item, make_crawler, HtmlResource, MockServer ) @@ -243,13 +246,78 @@ async def parse(self, response, page: ItemPage): assert items == [{'foo': 'bar'}] +class DelayedResource(LeafResource): + def render_GET(self, request): + decoded_body = request.content.read().decode() + seconds = float(decoded_body) if decoded_body else 0 + self.deferRequest( + request, + seconds, + self._delayedRender, + request, + seconds, + ) + return NOT_DONE_YET + + def _delayedRender(self, request, seconds): + request.finish() + + +@inlineCallbacks +def test_additional_requests_connection_issue(): + items = [] + + with ( + mock.patch('scrapy_poet.backend.http_request_to_scrapy_request') + as mock_http_request_to_scrapy_request + ): + mock_http_request_to_scrapy_request.side_effect = partial( + http_request_to_scrapy_request, + meta={'download_timeout': 
0.001}, + ) + + with MockServer(DelayedResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b"0.002", + ) + except HttpRequestError: + return {'foo': 'bar'} + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] + + #@inlineCallbacks -#def test_additional_requests_connection_issue(): +#def test_additional_requests_ignored_request(): #... # TODO #@inlineCallbacks -#def test_additional_requests_ignored_request(): +#def test_additional_requests_unhandled_downloader_middleware_exception(): #... # TODO From 381eb25a5dc2d2f1ecef1d8989352cda2f99882a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 13:26:31 +0200 Subject: [PATCH 57/71] Implement test_additional_requests_ignored_request --- tests/test_backend.py | 50 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 00a4a7e7..0d18dbc7 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -8,13 +8,14 @@ import web_poet from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Spider +from scrapy.exceptions import IgnoreRequest from tests.utils import AsyncMock from twisted.internet import reactor from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET from web_poet import HttpClient -from web_poet.exceptions import HttpRequestError, HttpResponseError +from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError from web_poet.pages import ItemWebPage from scrapy_poet.backend import create_scrapy_backend @@ -311,9 +312,50 @@ async def parse(self, response, page: ItemPage): assert items == [{'foo': 'bar'}] -#@inlineCallbacks -#def test_additional_requests_ignored_request(): - #... 
# TODO +@inlineCallbacks +def test_additional_requests_ignored_request(): + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'ignore', + ) + except HttpError as e: + return {'exc': e.__class__} + + class TestDownloaderMiddleware: + def process_response(self, request, response, spider): + if b'ignore' in response.body: + raise IgnoreRequest + return response + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + TestDownloaderMiddleware: 1, + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'exc': HttpError}] #@inlineCallbacks From 3429d496de65f474bcbf3da5048ee54cbce2dd33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 13:31:57 +0200 Subject: [PATCH 58/71] Implement test_additional_requests_unhandled_downloader_middleware_exception --- tests/test_backend.py | 58 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 0d18dbc7..9687c9c9 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -358,9 +358,61 @@ async def parse(self, response, page: ItemPage): assert items == [{'exc': HttpError}] -#@inlineCallbacks -#def test_additional_requests_unhandled_downloader_middleware_exception(): - #... # TODO +@pytest.mark.xfail( + reason=( + "Currently, we do not make a distinction between exceptions raised " + "from the downloader or from a downloader middleware, except for " + "IgnoreRequest. In the future, we might want to inspect the stack to " + "determine the source of an exception and raise HttpError instead of " + "HttpRequestError when the exception originates in a downloader " + "middleware." 
+ ), + strict=True, +) +@inlineCallbacks +def test_additional_requests_unhandled_downloader_middleware_exception(): + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + try: + await self.http_client.request( + server.root_url, + body=b'raise', + ) + except HttpError as e: + return {'exc': e.__class__} + + class TestDownloaderMiddleware: + def process_response(self, request, response, spider): + if b'raise' in response.body: + raise RuntimeError + return response + + class TestSpider(Spider): + name = 'test_spider' + start_urls = [server.root_url] + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + TestDownloaderMiddleware: 1, + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'exc': HttpError}] #@inlineCallbacks From 902d41c283c4d871ed20d383785106c464d24620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 13:36:29 +0200 Subject: [PATCH 59/71] Fix pre-3.9 syntax error --- tests/test_backend.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 9687c9c9..f3594552 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -268,10 +268,8 @@ def _delayedRender(self, request, seconds): def test_additional_requests_connection_issue(): items = [] - with ( - mock.patch('scrapy_poet.backend.http_request_to_scrapy_request') - as mock_http_request_to_scrapy_request - ): + with mock.patch('scrapy_poet.backend.http_request_to_scrapy_request') \ + as mock_http_request_to_scrapy_request: mock_http_request_to_scrapy_request.side_effect = partial( http_request_to_scrapy_request, meta={'download_timeout': 0.001}, From 776b768f0d0551bdde77c513f7daa51f2c6ae31f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 14:18:07 +0200 Subject: [PATCH 60/71] Implement test_additional_requests_dont_filter --- scrapy_poet/backend.py | 5 +---- tests/test_backend.py | 51 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 212d4fe5..5b4d247c 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -24,10 +24,7 @@ async def scrapy_backend(request: HttpRequest): f"one of type: '{type(request)}'." ) - scrapy_request = http_request_to_scrapy_request( - request, - dont_filter=True, - ) + scrapy_request = http_request_to_scrapy_request(request) if scrapy_request.method == "HEAD": scrapy_request.meta["dont_redirect"] = True diff --git a/tests/test_backend.py b/tests/test_backend.py index f3594552..a1d0ace2 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -7,7 +7,7 @@ import twisted import web_poet from pytest_twisted import ensureDeferred, inlineCallbacks -from scrapy import Spider +from scrapy import Request, Spider from scrapy.exceptions import IgnoreRequest from tests.utils import AsyncMock from twisted.internet import reactor @@ -413,8 +413,47 @@ async def parse(self, response, page: ItemPage): assert items == [{'exc': HttpError}] -#@inlineCallbacks -#def test_additional_requests_dont_filter(): - #... # TODO - ## Test using the same URL for the source request and for 2 additional - ## requests. 
+@inlineCallbacks +def test_additional_requests_dont_filter(): + """Verify that while duplicate regular requests are filtered out, + additional requests are not (neither relative to the main requests not + relative to each other). + + In Scrapy, request de-duplication is implemented on the scheduler, and + because additional requests do not go through the scheduler, this works as + expected. + """ + items = [] + + with MockServer(EchoResource) as server: + + @attr.define + class ItemPage(ItemWebPage): + http_client: HttpClient + + async def to_item(self): + await self.http_client.request(server.root_url) + await self.http_client.request(server.root_url) + return {'foo': 'bar'} + + class TestSpider(Spider): + name = 'test_spider' + + custom_settings = { + 'DOWNLOADER_MIDDLEWARES': { + 'scrapy_poet.InjectionMiddleware': 543, + }, + } + + def start_requests(self): + yield Request(server.root_url) + yield Request(server.root_url) + + async def parse(self, response, page: ItemPage): + item = await page.to_item() + items.append(item) + + crawler = make_crawler(TestSpider, {}) + yield crawler.crawl() + + assert items == [{'foo': 'bar'}] From a1c52f80318fc1a1a905973a4ea396269a43de5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 Jun 2022 14:20:32 +0200 Subject: [PATCH 61/71] Remove unneeded test --- tests/test_backend.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index a1d0ace2..300422d0 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -126,24 +126,6 @@ async def test_scrapy_poet_backend_head_redirect(fake_http_response): assert scrapy_request.meta.get("dont_redirect") is True -@ensureDeferred -async def test_scrapy_poet_backend_dont_filter(fake_http_response): - req = web_poet.HttpRequest("https://example.com") - - with mock.patch( - "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock - ) as mock_dtf: - mock_dtf.return_value = fake_http_response - mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_backend = create_scrapy_backend(mock_downloader) - - await scrapy_backend(req) - - args, kwargs = mock_downloader.call_args - scrapy_request = args[0] - assert scrapy_request.dont_filter is True - - class LeafResource(Resource): isLeaf = True From 64eeb721c91e8560b6d398ea1ece878765eaa228 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jun 2022 08:53:29 +0200 Subject: [PATCH 62/71] Update docs/intro/advanced-tutorial.rst Co-authored-by: Mikhail Korobov --- docs/intro/advanced-tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index d45b4ccd..48e91e95 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -73,7 +73,7 @@ It can be directly used inside the spider as: import scrapy - def ProductSpider(scrapy.Spider): + class ProductSpider(scrapy.Spider): custom_settings = { "DOWNLOADER_MIDDLEWARES": { From 5210b2afdf1c3bb9613defaa522b06ea53d2ae05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jun 2022 08:53:35 +0200 Subject: [PATCH 63/71] Update docs/intro/advanced-tutorial.rst Co-authored-by: Mikhail Korobov --- docs/intro/advanced-tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index 48e91e95..a1fd2fa5 100644 --- a/docs/intro/advanced-tutorial.rst +++ 
b/docs/intro/advanced-tutorial.rst @@ -143,7 +143,7 @@ Let's see it in action: import scrapy - def ProductSpider(scrapy.Spider): + class ProductSpider(scrapy.Spider): custom_settings = { "DOWNLOADER_MIDDLEWARES": { From 8aecfab12d77de75907f8cffff8ebdf78325df40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jun 2022 08:59:50 +0200 Subject: [PATCH 64/71] =?UTF-8?q?backend=20=E2=86=92=20download=5Ffunc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_poet/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 5b4d247c..4518b481 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -def create_scrapy_backend(backend): +def create_scrapy_backend(download_func): async def scrapy_backend(request: HttpRequest): if not isinstance(request, HttpRequest): raise TypeError( @@ -29,7 +29,7 @@ async def scrapy_backend(request: HttpRequest): if scrapy_request.method == "HEAD": scrapy_request.meta["dont_redirect"] = True - deferred = backend(scrapy_request) + deferred = download_func(scrapy_request) deferred_or_future = maybe_deferred_to_future(deferred) try: response = await deferred_or_future From dc36f5a29e39850a953b3bca3ade5f6a7bdf4803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Jun 2022 09:04:24 +0200 Subject: [PATCH 65/71] test_additional_requests_dont_filter: ensure additional requests are not no-ops --- tests/test_backend.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 300422d0..7af8a854 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -414,9 +414,15 @@ class ItemPage(ItemWebPage): http_client: HttpClient async def to_item(self): - await self.http_client.request(server.root_url) - await self.http_client.request(server.root_url) - return {'foo': 'bar'} + response1 = await self.http_client.request( + server.root_url, + body=b'a', + ) + response2 = await self.http_client.request( + server.root_url, + body=b'a', + ) + return {response1.body.decode(): response2.body.decode()} class TestSpider(Spider): name = 'test_spider' @@ -428,8 +434,8 @@ class TestSpider(Spider): } def start_requests(self): - yield Request(server.root_url) - yield Request(server.root_url) + yield Request(server.root_url, body=b'a') + yield Request(server.root_url, body=b'a') async def parse(self, response, page: ItemPage): item = await page.to_item() @@ -438,4 +444,4 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'foo': 'bar'}] + assert items == [{'a': 'a'}] From b839eb183a1b6e354b7b891993ee848120bbb0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 9 Jun 2022 11:19:47 +0200 Subject: [PATCH 66/71] Cast url from HttpRequest and HttpResponse before comparisons --- tests/test_backend.py | 2 +- tests/test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 7af8a854..512fd123 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -69,7 +69,7 @@ async def test_scrapy_poet_backend(fake_http_response): mock_downloader.assert_called_once() assert isinstance(response, web_poet.HttpResponse) - assert response.url == "https://example.com" + assert str(response.url) == 
"https://example.com" assert response.text == "some content" assert response.status == 200 assert response.headers.get("Content-Type") == "text/html; charset=utf-8" diff --git a/tests/test_utils.py b/tests/test_utils.py index f6764695..3083579b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -163,7 +163,7 @@ def test_http_request_to_scrapy_request(http_request, kwargs, scrapy_request): ) def test_scrapy_response_to_http_response(scrapy_response, http_response): result = scrapy_response_to_http_response(scrapy_response) - assert result.url == http_response.url + assert str(result.url) == str(http_response.url) assert result.body == http_response.body assert result.status == http_response.status assert result.headers == http_response.headers From 995e9b92446248dd3a0e57e413cded9072af4580 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 13 Jun 2022 22:46:51 +0800 Subject: [PATCH 67/71] fix examples when using callback_for() --- docs/intro/basic-tutorial.rst | 28 ++++++++++++++++++++-------- scrapy_poet/api.py | 31 ++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 5866724c..6a37c548 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -200,23 +200,35 @@ returning the result of the ``to_item`` method call. We could use .. tip:: - :func:`~.callback_for` also supports `async generators`. So having the + :func:`~.callback_for` also supports `async generators`. So if we have the following: .. code-block:: python - class BookPage(web_poet.ItemWebPage): - async def to_item(self): - return await do_something_async() + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] - callback_for(BookPage) + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) - would result in: + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + It could be turned into: .. code-block:: python - async def parse_book(self, response: DummyResponse, page: BookPage): - yield await page.to_item() + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + parse_book = callback_for(BookPage) This is useful when the Page Objects uses additional requests, which rely heavily on ``async/await`` syntax. More info on this in this tutorial diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index bb0da171..d09259b5 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -68,24 +68,37 @@ def parse(self, response): parse_book = callback_for(BookPage) - This also produces an async generator callable if the Page Objects's + It also supports producing an async generator callable if the Page Objects's ``to_item()`` method is a coroutine which uses the ``async/await`` syntax. - So having the following: + + So if we have the following: .. 
code-block:: python - class BookPage(web_poet.ItemWebPage): - async def to_item(self): - return await do_something_async() + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] - callback_for(BookPage) + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) - would result in: + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + It could be turned into: .. code-block:: python - async def parse_book(self, response: DummyResponse, page: BookPage): - yield await page.to_item() + class BooksSpider(scrapy.Spider): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + + def parse(self, response): + links = response.css('.image_container a') + yield from response.follow_all(links, self.parse_book) + + parse_book = callback_for(BookPage) The generated callback could be used as a spider instance method or passed as an inline/anonymous argument. Make sure to define it as a spider From be786fc191ca4d409ca6dd486a115bcc07e27972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Jun 2022 10:15:17 +0200 Subject: [PATCH 68/71] =?UTF-8?q?backend=20=E2=86=92=20downloader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_poet/{backend.py => downloader.py} | 6 +-- scrapy_poet/page_input_providers.py | 6 +-- tests/{test_backend.py => test_downloader.py} | 46 +++++++++---------- tests/test_providers.py | 2 +- 4 files changed, 30 insertions(+), 30 deletions(-) rename scrapy_poet/{backend.py => downloader.py} (93%) rename tests/{test_backend.py => test_downloader.py} (89%) diff --git a/scrapy_poet/backend.py b/scrapy_poet/downloader.py similarity index 93% rename from scrapy_poet/backend.py rename to scrapy_poet/downloader.py index 4518b481..b6ae534f 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/downloader.py @@ -16,8 +16,8 @@ logger = logging.getLogger(__name__) -def create_scrapy_backend(download_func): - async def scrapy_backend(request: HttpRequest): +def create_scrapy_downloader(download_func): + async def scrapy_downloader(request: HttpRequest): if not isinstance(request, HttpRequest): raise TypeError( f"The request should be 'web_poet.HttpRequest' but received " @@ -49,4 +49,4 @@ async def scrapy_backend(request: HttpRequest): return scrapy_response_to_http_response(response) - return scrapy_backend + return scrapy_downloader diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 06364f7d..f47bec2a 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -20,7 +20,7 @@ from scrapy_poet.utils import scrapy_response_to_http_response from scrapy_poet.injection_errors import MalformedProvidedClassesError -from scrapy_poet.backend import create_scrapy_backend +from scrapy_poet.downloader import create_scrapy_downloader from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams @@ -209,8 +209,8 @@ def __call__(self, to_provide: Set[Callable], crawler: Crawler): """Creates an ``web_poet.requests.HttpClient`` instance using Scrapy's downloader. 
""" - backend = create_scrapy_backend(crawler.engine.download) - return [HttpClient(request_downloader=backend)] + downloader = create_scrapy_downloader(crawler.engine.download) + return [HttpClient(request_downloader=downloader)] class PageParamsProvider(PageObjectInputProvider): diff --git a/tests/test_backend.py b/tests/test_downloader.py similarity index 89% rename from tests/test_backend.py rename to tests/test_downloader.py index 512fd123..bd74a791 100644 --- a/tests/test_backend.py +++ b/tests/test_downloader.py @@ -18,7 +18,7 @@ from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError from web_poet.pages import ItemWebPage -from scrapy_poet.backend import create_scrapy_backend +from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.utils import http_request_to_scrapy_request from tests.utils import ( crawl_single_item, make_crawler, HtmlResource, MockServer @@ -26,19 +26,19 @@ @pytest.fixture -def scrapy_backend(): - mock_backend = AsyncMock() - return create_scrapy_backend(mock_backend) +def scrapy_downloader(): + mock_downloader = AsyncMock() + return create_scrapy_downloader(mock_downloader) @ensureDeferred -async def test_incompatible_scrapy_request(scrapy_backend): +async def test_incompatible_scrapy_request(scrapy_downloader): """The Request must be web_poet.HttpRequest and not anything else.""" req = scrapy.Request("https://example.com") with pytest.raises(TypeError): - await scrapy_backend(req) + await scrapy_downloader(req) @pytest.fixture @@ -52,19 +52,19 @@ def fake_http_response(): @ensureDeferred -async def test_scrapy_poet_backend(fake_http_response): +async def test_scrapy_poet_downloader(fake_http_response): req = web_poet.HttpRequest("https://example.com") with mock.patch( - "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock ) as mock_dtf: mock_dtf.return_value = fake_http_response mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_backend = create_scrapy_backend(mock_downloader) + scrapy_downloader = create_scrapy_downloader(mock_downloader) - response = await scrapy_backend(req) + response = await scrapy_downloader(req) mock_downloader.assert_called_once() assert isinstance(response, web_poet.HttpResponse) @@ -77,49 +77,49 @@ async def test_scrapy_poet_backend(fake_http_response): @ensureDeferred -async def test_scrapy_poet_backend_ignored_request(): +async def test_scrapy_poet_downloader_ignored_request(): """It should handle IgnoreRequest from Scrapy according to the web poet standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") with mock.patch( - "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock ) as mock_dtf: mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_backend = create_scrapy_backend(mock_downloader) + scrapy_downloader = create_scrapy_downloader(mock_downloader) with pytest.raises(web_poet.exceptions.HttpError): - await scrapy_backend(req) + await scrapy_downloader(req) @ensureDeferred -async def test_scrapy_poet_backend_twisted_error(): +async def test_scrapy_poet_downloader_twisted_error(): req = web_poet.HttpRequest("https://example.com") with mock.patch( - "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + "scrapy_poet.downloader.maybe_deferred_to_future", 
new_callable=AsyncMock ) as mock_dtf: mock_dtf.side_effect = twisted.internet.error.TimeoutError mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_backend = create_scrapy_backend(mock_downloader) + scrapy_downloader = create_scrapy_downloader(mock_downloader) with pytest.raises(web_poet.exceptions.HttpRequestError): - await scrapy_backend(req) + await scrapy_downloader(req) @ensureDeferred -async def test_scrapy_poet_backend_head_redirect(fake_http_response): +async def test_scrapy_poet_downloader_head_redirect(fake_http_response): req = web_poet.HttpRequest("https://example.com", method="HEAD") with mock.patch( - "scrapy_poet.backend.maybe_deferred_to_future", new_callable=AsyncMock + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock ) as mock_dtf: mock_dtf.return_value = fake_http_response mock_downloader = mock.MagicMock(return_value=AsyncMock) - scrapy_backend = create_scrapy_backend(mock_downloader) + scrapy_downloader = create_scrapy_downloader(mock_downloader) - await scrapy_backend(req) + await scrapy_downloader(req) args, kwargs = mock_downloader.call_args scrapy_request = args[0] @@ -250,7 +250,7 @@ def _delayedRender(self, request, seconds): def test_additional_requests_connection_issue(): items = [] - with mock.patch('scrapy_poet.backend.http_request_to_scrapy_request') \ + with mock.patch('scrapy_poet.downloader.http_request_to_scrapy_request') \ as mock_http_request_to_scrapy_request: mock_http_request_to_scrapy_request.side_effect = partial( http_request_to_scrapy_request, diff --git a/tests/test_providers.py b/tests/test_providers.py index d035366f..5907810a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -215,7 +215,7 @@ async def test_http_client_provider(settings): crawler.engine = AsyncMock() with mock.patch( - "scrapy_poet.page_input_providers.create_scrapy_backend" + "scrapy_poet.page_input_providers.create_scrapy_downloader" ) as mock_factory: provider = HttpClientProvider(crawler) results = provider(set(), crawler) From 3398dc77f96bfdbffaa104612547252a730f7a51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Jun 2022 10:40:07 +0200 Subject: [PATCH 69/71] Update instal requirements --- .github/workflows/test.yml | 7 +++++-- setup.py | 12 ++++++------ tox.ini | 23 ++++++++++++++++++++--- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4a34949f..c7539bd5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -43,8 +43,11 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.10'] - tox-job: ["mypy", "docs"] + include: + - python-version: "3.7" + tox-job: "mypy" + - python-version: "3.10" + tox-job: "docs" steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index 39e5a000..930ebaf2 100755 --- a/setup.py +++ b/setup.py @@ -12,12 +12,12 @@ packages=find_packages(exclude=['tests', 'example']), install_requires=[ 'andi >= 0.4.1', - 'attrs', - 'parsel', - 'url-matcher', - 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet', - 'tldextract', - 'sqlitedict', + 'attrs >= 21.3.0', + 'parsel >= 1.5.0', + 'scrapy >= 2.6.0', + 'sqlitedict >= 1.5.0', + 'url-matcher >= 0.2.0', + 'web-poet >= 0.2.0', ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/tox.ini b/tox.ini index 4e1203a2..cfe5b6ab 100644 --- a/tox.ini +++ b/tox.ini @@ -1,13 +1,11 @@ [tox] -envlist = py37,py38,py39,py310,mypy,docs +envlist = 
min,py37,py38,py39,py310,asyncio,asyncio-min,mypy,docs [testenv] deps = pytest pytest-cov - scrapy >= 2.6.0 pytest-twisted - web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet commands = py.test \ @@ -19,8 +17,27 @@ commands = commands = {[testenv]commands} --reactor=asyncio +[testenv:min] +deps = + {[testenv]deps} + andi==0.4.1 + attrs==21.3.0 + parsel==1.5.0 + scrapy==2.6.0 + sqlitedict==1.5.0 + url-matcher==0.2.0 + web-poet==0.2.0 + +[testenv:asyncio-min] +commands = + {[testenv:asyncio]commands} +deps = + {[testenv:min]deps} + [testenv:mypy] +basepython = python3.7 deps = + {[testenv:min]deps} mypy==0.790 commands = mypy --ignore-missing-imports --no-warn-no-return scrapy_poet tests From 566e7272bae71887d80f42cccd2020ac0994575b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Jun 2022 10:48:03 +0200 Subject: [PATCH 70/71] GitHub Actions: test minimum dependency versions --- .github/workflows/test.yml | 17 +++++++++++++++-- tox.ini | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c7539bd5..f343d65a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,8 +16,21 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] - toxenv: ['py', 'asyncio'] + include: + - python-version: "3.7" + toxenv: "min" + - python-version: "3.7" + toxenv: "asyncio-min" + + - python-version: "3.8" + toxenv: "py" + - python-version: "3.9" + toxenv: "py" + + - python-version: "3.10" + toxenv: "py" + - python-version: "3.10" + toxenv: "asyncio" steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index cfe5b6ab..adc8fa34 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,7 @@ commands = {[testenv]commands} --reactor=asyncio [testenv:min] +basepython = python3.7 deps = {[testenv]deps} andi==0.4.1 @@ -29,6 +30,7 @@ deps = web-poet==0.2.0 [testenv:asyncio-min] +basepython = python3.7 commands = {[testenv:asyncio]commands} deps = From 98ce4545a06dc6b6c3a8a34d46373e93a5606cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 Jun 2022 12:32:07 +0200 Subject: [PATCH 71/71] Revert mypy changes --- .github/workflows/test.yml | 7 ++----- tox.ini | 2 -- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f343d65a..556f0b50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,11 +56,8 @@ jobs: strategy: fail-fast: false matrix: - include: - - python-version: "3.7" - tox-job: "mypy" - - python-version: "3.10" - tox-job: "docs" + python-version: ['3.10'] + tox-job: ["mypy", "docs"] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index adc8fa34..62cb3fd7 100644 --- a/tox.ini +++ b/tox.ini @@ -37,9 +37,7 @@ deps = {[testenv:min]deps} [testenv:mypy] -basepython = python3.7 deps = - {[testenv:min]deps} mypy==0.790 commands = mypy --ignore-missing-imports --no-warn-no-return scrapy_poet tests