Skip to content

Commit

Permalink
Merge pull request #170 from scrapinghub/additional-requests-cb
Browse files Browse the repository at this point in the history
Set NO_CALLBACK for HttpClientProvider requests.
  • Loading branch information
wRAR authored Nov 2, 2023
2 parents 7b03d71 + 7be392e commit 47e44f6
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 11 deletions.
7 changes: 7 additions & 0 deletions scrapy_poet/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
default_registry,
)

# ``NO_CALLBACK`` is a sentinel that tells Scrapy a request is not meant to
# be handled by any spider callback (used for provider-issued requests so
# they skip dependency injection / callback resolution).
try:
    from scrapy.http.request import NO_CALLBACK  # available on Scrapy >= 2.8
except ImportError:
    # Older Scrapy has no sentinel; ``None`` keeps the pre-2.8 behavior
    # (the request is routed to the default callback machinery).
    NO_CALLBACK = None


def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
"""Return a path to a folder where Scrapy is storing data.
Expand All @@ -37,6 +43,7 @@ def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> Request:
method=request.method,
headers=request.headers,
body=request.body,
callback=NO_CALLBACK,
**kwargs,
)

Expand Down
88 changes: 84 additions & 4 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
import warnings
from functools import partial
from typing import Callable, List, Optional
from typing import Any, Callable, List, Optional, Sequence, Set
from unittest import mock

import attr
Expand All @@ -11,14 +11,21 @@
import web_poet
from pytest_twisted import ensureDeferred, inlineCallbacks
from scrapy import Request, Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest
from web_poet import HttpClient
from scrapy.http import Response
from scrapy.utils.defer import maybe_deferred_to_future
from web_poet import BrowserResponse, HttpClient
from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError
from web_poet.pages import WebPage

from scrapy_poet import DummyResponse
from scrapy_poet import DummyResponse, PageObjectInputProvider
from scrapy_poet.downloader import create_scrapy_downloader
from scrapy_poet.utils import http_request_to_scrapy_request, is_min_scrapy_version
from scrapy_poet.utils import (
NO_CALLBACK,
http_request_to_scrapy_request,
is_min_scrapy_version,
)
from scrapy_poet.utils.mockserver import MockServer
from scrapy_poet.utils.testing import (
AsyncMock,
Expand Down Expand Up @@ -418,6 +425,79 @@ async def parse(self, response, page: ItemPage):
assert items == [{"a": "a"}]


@inlineCallbacks
def test_additional_requests_no_cb_deps() -> None:
    # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
    # This tests that the additional request doesn't go through dep resolving
    # like if it used self.parse as a callback.

    # Collected items and a call counter, mutated from the nested classes
    # below via closure / ``nonlocal``.
    items = []
    provider_calls = 0

    class BrowserResponseProvider(PageObjectInputProvider):
        # Provider that satisfies ``BrowserResponse`` dependencies by
        # re-downloading the request through the crawler engine.
        provided_classes = {BrowserResponse}

        async def __call__(
            self, to_provide: Set[Callable], request: Request, crawler: Crawler
        ) -> Sequence[Any]:
            nonlocal provider_calls
            provider_calls += 1
            # ``callback=NO_CALLBACK`` keeps this internal download from
            # being treated as a spider-callback request (no dep resolving).
            custom_request = Request(
                request.url, body=request.body, callback=NO_CALLBACK
            )
            scrapy_response: Response = await maybe_deferred_to_future(
                crawler.engine.download(custom_request)
            )
            result = BrowserResponse(
                url=scrapy_response.url,
                html=scrapy_response.text,
                status=scrapy_response.status,
            )
            return [result]

    with MockServer(EchoResource) as server:

        @attr.define
        class ItemPage(WebPage):
            # Depends on both the provider above and HttpClient, so building
            # it exercises the additional-request path.
            browser_response: BrowserResponse
            http: HttpClient

            async def to_item(self):
                # Additional request to the echo server; its body ("a") is
                # echoed back and must NOT trigger another provider call.
                additional_response = await self.http.request(
                    server.root_url,
                    body=b"a",
                )
                return {
                    "main": str(self.browser_response.html),
                    "additional": additional_response.body.decode(),
                }

        class TestSpider(Spider):
            name = "test_spider"

            custom_settings = {
                "DOWNLOADER_MIDDLEWARES": {
                    "scrapy_poet.InjectionMiddleware": 543,
                },
                "SCRAPY_POET_PROVIDERS": {
                    BrowserResponseProvider: 1100,
                },
            }

            def start_requests(self):
                yield Request(server.root_url, callback=self.parse)

            async def parse(self, response: DummyResponse, page: ItemPage):
                item = await page.to_item()
                items.append(item)

        crawler = make_crawler(TestSpider, {})
        yield crawler.crawl()

        # Exactly one provider call: the additional request made inside
        # ``to_item`` must not have gone through dependency resolution.
        assert provider_calls == 1
        # The initial request has an empty body (echoed as ""); the
        # additional request's body "a" is echoed back.
        assert items == [{"main": "", "additional": "a"}]


@attr.define
class BasicPage(WebPage):
def to_item(self):
Expand Down
16 changes: 9 additions & 7 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from web_poet import HttpRequest, HttpResponse

from scrapy_poet.utils import (
NO_CALLBACK,
create_registry_instance,
get_scrapy_data_path,
http_request_to_scrapy_request,
Expand Down Expand Up @@ -39,32 +40,32 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
(
HttpRequest("https://example.com"),
{},
Request("https://example.com"),
Request("https://example.com", callback=NO_CALLBACK),
),
(
HttpRequest("https://example.com"),
{"dont_filter": True},
Request("https://example.com", dont_filter=True),
Request("https://example.com", callback=NO_CALLBACK, dont_filter=True),
),
(
HttpRequest("https://example.com", method="POST"),
{},
Request("https://example.com", method="POST"),
Request("https://example.com", callback=NO_CALLBACK, method="POST"),
),
(
HttpRequest("https://example.com", headers={"a": "b"}),
{},
Request("https://example.com", headers={"a": "b"}),
Request("https://example.com", callback=NO_CALLBACK, headers={"a": "b"}),
),
(
HttpRequest("https://example.com", headers={"a": "b"}),
{},
Request("https://example.com", headers=(("a", "b"),)),
Request("https://example.com", callback=NO_CALLBACK, headers=(("a", "b"),)),
),
(
HttpRequest("https://example.com", headers=(("a", "b"),)),
{},
Request("https://example.com", headers=(("a", "b"),)),
Request("https://example.com", callback=NO_CALLBACK, headers=(("a", "b"),)),
),
(
HttpRequest(
Expand All @@ -74,13 +75,14 @@ def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
{},
Request(
"https://example.com",
callback=NO_CALLBACK,
headers=(("a", "b"), ("a", "c")),
),
),
(
HttpRequest("https://example.com", body=b"a"),
{},
Request("https://example.com", body=b"a"),
Request("https://example.com", callback=NO_CALLBACK, body=b"a"),
),
),
)
Expand Down

0 comments on commit 47e44f6

Please sign in to comment.