diff --git a/scrapy_poet/utils/__init__.py b/scrapy_poet/utils/__init__.py
index 62e37012..bc08b73e 100644
--- a/scrapy_poet/utils/__init__.py
+++ b/scrapy_poet/utils/__init__.py
@@ -17,6 +17,12 @@
     default_registry,
 )
 
+try:
+    from scrapy.http.request import NO_CALLBACK  # available on Scrapy >= 2.8
+except ImportError:
+    # NO_CALLBACK = lambda: None  # noqa: E731
+    NO_CALLBACK = None
+
 
 def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
     """Return a path to a folder where Scrapy is storing data.
@@ -37,6 +43,7 @@ def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> Request:
         method=request.method,
         headers=request.headers,
         body=request.body,
+        callback=NO_CALLBACK,
         **kwargs,
     )
 
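Review note, not part of the patch: on Scrapy >= 2.8, `NO_CALLBACK` is a sentinel that marks a request as not meant to be handled by any spider callback, while the `ImportError` fallback leaves it as `None`, which older Scrapy versions treat as "use `Spider.parse`". A minimal sketch of what the fallback implies, assuming `is_min_scrapy_version` (already exported by `scrapy_poet.utils` and used in the tests below) accepts a version string:

```python
# Sketch only: illustrates what the try/except fallback above provides.
from scrapy_poet.utils import NO_CALLBACK, is_min_scrapy_version

if is_min_scrapy_version("2.8"):
    # The real sentinel imported from scrapy.http.request.
    assert NO_CALLBACK is not None
else:
    # The fallback defined by this patch: plain None, i.e. the legacy
    # "let Scrapy pick Spider.parse" behavior for callback-less requests.
    assert NO_CALLBACK is None
```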
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 86641929..ed3fd2ec 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -1,7 +1,7 @@
 import sys
 import warnings
 from functools import partial
-from typing import Callable, List, Optional
+from typing import Any, Callable, List, Optional, Sequence, Set
 from unittest import mock
 
 import attr
@@ -11,14 +11,21 @@
 import web_poet
 from pytest_twisted import ensureDeferred, inlineCallbacks
 from scrapy import Request, Spider
+from scrapy.crawler import Crawler
 from scrapy.exceptions import IgnoreRequest
-from web_poet import HttpClient
+from scrapy.http import Response
+from scrapy.utils.defer import maybe_deferred_to_future
+from web_poet import BrowserResponse, HttpClient
 from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError
 from web_poet.pages import WebPage
 
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, PageObjectInputProvider
 from scrapy_poet.downloader import create_scrapy_downloader
-from scrapy_poet.utils import http_request_to_scrapy_request, is_min_scrapy_version
+from scrapy_poet.utils import (
+    NO_CALLBACK,
+    http_request_to_scrapy_request,
+    is_min_scrapy_version,
+)
 from scrapy_poet.utils.mockserver import MockServer
 from scrapy_poet.utils.testing import (
     AsyncMock,
@@ -418,6 +425,79 @@ async def parse(self, response, page: ItemPage):
         assert items == [{"a": "a"}]
 
 
+@inlineCallbacks
+def test_additional_requests_no_cb_deps() -> None:
+    # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
+    # This tests that the additional request does not go through dependency
+    # resolution, as it would if it used self.parse as a callback.
+
+    items = []
+    provider_calls = 0
+
+    class BrowserResponseProvider(PageObjectInputProvider):
+        provided_classes = {BrowserResponse}
+
+        async def __call__(
+            self, to_provide: Set[Callable], request: Request, crawler: Crawler
+        ) -> Sequence[Any]:
+            nonlocal provider_calls
+            provider_calls += 1
+            custom_request = Request(
+                request.url, body=request.body, callback=NO_CALLBACK
+            )
+            scrapy_response: Response = await maybe_deferred_to_future(
+                crawler.engine.download(custom_request)
+            )
+            result = BrowserResponse(
+                url=scrapy_response.url,
+                html=scrapy_response.text,
+                status=scrapy_response.status,
+            )
+            return [result]
+
+    with MockServer(EchoResource) as server:
+
+        @attr.define
+        class ItemPage(WebPage):
+            browser_response: BrowserResponse
+            http: HttpClient
+
+            async def to_item(self):
+                additional_response = await self.http.request(
+                    server.root_url,
+                    body=b"a",
+                )
+                return {
+                    "main": str(self.browser_response.html),
+                    "additional": additional_response.body.decode(),
+                }
+
+        class TestSpider(Spider):
+            name = "test_spider"
+
+            custom_settings = {
+                "DOWNLOADER_MIDDLEWARES": {
+                    "scrapy_poet.InjectionMiddleware": 543,
+                },
+                "SCRAPY_POET_PROVIDERS": {
+                    BrowserResponseProvider: 1100,
+                },
+            }
+
+            def start_requests(self):
+                yield Request(server.root_url, callback=self.parse)
+
+            async def parse(self, response: DummyResponse, page: ItemPage):
+                item = await page.to_item()
+                items.append(item)
+
+        crawler = make_crawler(TestSpider, {})
+        yield crawler.crawl()
+
+    assert provider_calls == 1
+    assert items == [{"main": "", "additional": "a"}]
+
+
 @attr.define
 class BasicPage(WebPage):
     def to_item(self):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 69b52a87..27fa4222 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,6 +8,7 @@
 from web_poet import HttpRequest, HttpResponse
 
 from scrapy_poet.utils import (
+    NO_CALLBACK,
     create_registry_instance,
     get_scrapy_data_path,
     http_request_to_scrapy_request,
@@ -39,32 +40,32 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
         (
             HttpRequest("https://example.com"),
             {},
-            Request("https://example.com"),
+            Request("https://example.com", callback=NO_CALLBACK),
         ),
         (
             HttpRequest("https://example.com"),
             {"dont_filter": True},
-            Request("https://example.com", dont_filter=True),
+            Request("https://example.com", callback=NO_CALLBACK, dont_filter=True),
         ),
         (
             HttpRequest("https://example.com", method="POST"),
             {},
-            Request("https://example.com", method="POST"),
+            Request("https://example.com", callback=NO_CALLBACK, method="POST"),
         ),
         (
             HttpRequest("https://example.com", headers={"a": "b"}),
             {},
-            Request("https://example.com", headers={"a": "b"}),
+            Request("https://example.com", callback=NO_CALLBACK, headers={"a": "b"}),
         ),
         (
             HttpRequest("https://example.com", headers={"a": "b"}),
             {},
-            Request("https://example.com", headers=(("a", "b"),)),
+            Request("https://example.com", callback=NO_CALLBACK, headers=(("a", "b"),)),
         ),
         (
             HttpRequest("https://example.com", headers=(("a", "b"),)),
             {},
-            Request("https://example.com", headers=(("a", "b"),)),
+            Request("https://example.com", callback=NO_CALLBACK, headers=(("a", "b"),)),
         ),
         (
             HttpRequest(
@@ -74,13 +75,14 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
             {},
             Request(
                 "https://example.com",
+                callback=NO_CALLBACK,
                 headers=(("a", "b"), ("a", "c")),
             ),
         ),
         (
             HttpRequest("https://example.com", body=b"a"),
             {},
-            Request("https://example.com", body=b"a"),
+            Request("https://example.com", callback=NO_CALLBACK, body=b"a"),
         ),
     ),
 )
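Review note, not part of the patch: after this change, every `scrapy.Request` built from a `web_poet.HttpRequest` carries `callback=NO_CALLBACK`, which is exactly what the updated `test_utils.py` expectations encode. A minimal usage sketch; the example URL is arbitrary:

```python
# Sketch only: additional (page-object) requests are now marked with
# NO_CALLBACK so they skip callback-based dependency resolution.
from web_poet import HttpRequest

from scrapy_poet.utils import NO_CALLBACK, http_request_to_scrapy_request

scrapy_request = http_request_to_scrapy_request(HttpRequest("https://example.com"))
# Holds on any Scrapy version: NO_CALLBACK is the sentinel on >= 2.8
# and None (the fallback) on older versions.
assert scrapy_request.callback is NO_CALLBACK
```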