
Handle item overrides #164

Merged · 11 commits · Sep 25, 2023
10 changes: 7 additions & 3 deletions scrapy_poet/commands.py
@@ -1,7 +1,7 @@
 import datetime
 import logging
 from pathlib import Path
-from typing import Optional, Type
+from typing import Dict, Optional, Type
 
 import andi
 import scrapy
@@ -34,10 +34,14 @@
 class SavingInjector(Injector):
     @inlineCallbacks
     def build_instances_from_providers(
-        self, request: Request, response: Response, plan: andi.Plan
+        self,
+        request: Request,
+        response: Response,
+        plan: andi.Plan,
+        prev_instances: Optional[Dict] = None,
     ):
         instances = yield super().build_instances_from_providers(
-            request, response, plan
+            request, response, plan, prev_instances
         )
         if request.meta.get("savefixture", False):
             saved_dependencies.extend(instances.values())
2 changes: 1 addition & 1 deletion scrapy_poet/downloadermiddlewares.py
@@ -34,7 +34,7 @@
     PageParamsProvider: 700,
     RequestUrlProvider: 800,
     ResponseUrlProvider: 900,
-    ItemProvider: 1000,
+    ItemProvider: 2000,
Contributor Author:

Increasing this value allows ZyteApiProvider (as well as any other item provider in the future) to run before scrapy-poet's ItemProvider. This ensures that any item dependencies in page objects are available.
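
For illustration, a minimal sketch of how a project could rely on this ordering in its settings. The ZyteApiProvider import path is assumed from scrapy-zyte-api, and the 1100 priority is an arbitrary value sitting between the built-in providers and the new ItemProvider default of 2000:

# settings.py: lower values run earlier, so this provider is invoked
# before scrapy-poet's ItemProvider (now registered at 2000 by default).
SCRAPY_POET_PROVIDERS = {
    "scrapy_zyte_api.providers.ZyteApiProvider": 1100,
}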

 }
 
 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
38 changes: 33 additions & 5 deletions scrapy_poet/injection.py
@@ -151,11 +151,20 @@ def build_plan(self, request: Request) -> andi.Plan:
         )
 
     @inlineCallbacks
-    def build_instances(self, request: Request, response: Response, plan: andi.Plan):
+    def build_instances(
+        self,
+        request: Request,
+        response: Response,
+        plan: andi.Plan,
+        prev_instances: Optional[Dict] = None,
+    ):
         """Build the instances dict from a plan including external dependencies."""
         # First we build the external dependencies using the providers
         instances = yield from self.build_instances_from_providers(
-            request, response, plan
+            request,
+            response,
+            plan,
+            prev_instances,
         )
         # All the remaining dependencies are internal so they can be built just
         # following the andi plan.
@@ -169,10 +178,14 @@ def build_instances(self, request: Request, response: Response, plan: andi.Plan)

     @inlineCallbacks
     def build_instances_from_providers(
-        self, request: Request, response: Response, plan: andi.Plan
+        self,
+        request: Request,
+        response: Response,
+        plan: andi.Plan,
+        prev_instances: Optional[Dict] = None,
     ):
         """Build dependencies handled by registered providers"""
-        instances: Dict[Callable, Any] = {}
+        instances: Dict[Callable, Any] = prev_instances or {}
Contributor Author (@BurnzZ, Sep 19, 2023):

Passing prev_instances allows the page object (PO) to gain access to the item produced by a provider (as long as that provider was called earlier).

@attrs.define
class ProductPage(WebPage[Product]):
    product: Product

In the above example, before the ProductPage instance can be made, the product: Product instance must first be created. This is where passing the new prev_instances parameter is useful.
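
To make the mechanism concrete, here is a hedged sketch of a custom provider that opts into receiving previously built instances. The Product item and EnrichingProductProvider are hypothetical names, but the allow_prev_instances flag and the prev_instances parameter follow the hooks added in this PR:

from typing import Any, Callable, Dict, List, Optional, Set

import attrs
from scrapy import Request

from scrapy_poet.page_input_providers import PageObjectInputProvider


@attrs.define
class Product:  # hypothetical item class, a stand-in for a real item type
    name: Optional[str] = None


class EnrichingProductProvider(PageObjectInputProvider):
    """Hypothetical provider that reuses an item built by an earlier provider."""

    provided_classes = {Product}
    allow_prev_instances: bool = True

    def __call__(
        self,
        to_provide: Set[Callable],
        request: Request,
        prev_instances: Dict[Callable, Any],
    ) -> List[Any]:
        # Reuse the Product built by a previously invoked provider, if any,
        # instead of building a fresh one and diverging from it.
        product = prev_instances.get(Product) or Product()
        product.name = product.name or "unknown"
        return [product]

Because allow_prev_instances is True, the Injector does not skip this provider when a Product already exists; it passes the built instances in so the provider can modify them before returning.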

         scrapy_provided_dependencies = self.available_dependencies_for_providers(
             request, response
         )
@@ -182,10 +195,22 @@ def build_instances_from_providers(
             provided_classes = {
                 cls for cls in dependencies_set if provider.is_provided(cls)
             }
-            provided_classes -= instances.keys()  # ignore already provided types
+
+            # ignore already provided types if provider doesn't need to use them
+            if not provider.allow_prev_instances:
+                provided_classes -= instances.keys()
+
             if not provided_classes:
                 continue
+
+            # If dependency instances were already made by previously invoked
+            # providers, don't try to build them again since it may result in
+            # incorrect values (e.g. PO modifying an item > 2 times).
+            required_deps = set(plan.dependencies[-1][1].values())
+            built_deps = set(instances.keys())
+            if required_deps and required_deps == built_deps:
+                continue
+
             objs, fingerprint = [], None
             cache_hit = False
             if self.cache:
@@ -221,6 +246,8 @@ def build_instances_from_providers(
                 externally_provided=scrapy_provided_dependencies,
                 full_final_kwargs=False,
             ).final_kwargs(scrapy_provided_dependencies)
+            if provider.allow_prev_instances:
+                kwargs.update({"prev_instances": instances})
             try:
                 # Invoke the provider to get the data
                 objs = yield maybeDeferred_coro(
@@ -414,6 +441,7 @@ class MySpider(Spider):
     spider = MySpider()
     spider.settings = settings
     crawler.spider = spider
+    crawler.stats = load_object(crawler.settings["STATS_CLASS"])(crawler)
Contributor Author:

Scrapy 2.11 changes broke this since stats was missing.

Member:

Ouch, I've already fixed it in master before looking here.

Member:

But it's good to see we did it in (kinda) the same way!
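
For context, a minimal sketch of the pattern being fixed here, assuming Scrapy's default settings: Scrapy 2.11 expects crawler.stats to exist, so a bare Crawler built for tests needs the stats collector instantiated explicitly (the MySpider stub is illustrative):

import scrapy
from scrapy.crawler import Crawler
from scrapy.utils.misc import load_object


class MySpider(scrapy.Spider):
    name = "foo"


crawler = Crawler(MySpider)
crawler.spider = MySpider()
# Instantiate the configured stats collector manually, since the crawler
# here is never fully initialized through crawl().
crawler.stats = load_object(crawler.settings["STATS_CLASS"])(crawler)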

     if not registry:
         registry = create_registry_instance(RulesRegistry, crawler)
     return Injector(crawler, registry=registry)
16 changes: 13 additions & 3 deletions scrapy_poet/page_input_providers.py
@@ -106,6 +106,12 @@ def __call__(self, to_provide, response: Response):
     provided_classes: Union[Set[Callable], Callable[[Callable], bool]]
     name: ClassVar[str] = ""  # It must be a unique name. Used by the cache mechanism
 
+    # If set to True, the Injector will not skip the Provider when the dependency has
+    # been built. Instead, the Injector will pass the previously built instances (by
+    # the other providers) to the Provider. The Provider can then choose to modify
+    # these previous instances before returning them to the Injector.
+    allow_prev_instances: bool = False

     def is_provided(self, type_: Callable) -> bool:
         """
         Return ``True`` if the given type is provided by this provider based
@@ -230,6 +236,8 @@ class ItemProvider(PageObjectInputProvider):
"trying to resolve this plan: {plan}"
)

allow_prev_instances: bool = True

def __init__(self, injector):
super().__init__(injector)
self.registry = self.injector.registry
@@ -280,11 +288,11 @@ async def __call__(
         to_provide: Set[Callable],
         request: Request,
         response: Response,
+        prev_instances: Dict,
     ) -> List[Any]:
         results = []
         for cls in to_provide:
-            item = self.get_from_cache(request, cls)
-            if item:
+            if item := self.get_from_cache(request, cls):
                 results.append(item)
                 continue
 
@@ -308,7 +316,9 @@

             try:
                 deferred_or_future = maybe_deferred_to_future(
-                    self.injector.build_instances(request, response, plan)
+                    self.injector.build_instances(
+                        request, response, plan, prev_instances
+                    )
                 )
             # RecursionError NOT raised when ``AsyncioSelectorReactor`` is used.
             # Could be related: https://github.com/python/cpython/issues/93837
10 changes: 2 additions & 8 deletions tests/test_response_required_logic.py
@@ -8,6 +8,7 @@
 from scrapy.crawler import Crawler
 from scrapy.http import HtmlResponse, Request, TextResponse
 from scrapy.settings import Settings
+from scrapy.utils.misc import load_object
 from web_poet import ItemPage, WebPage
 
 from scrapy_poet import DummyResponse, callback_for
@@ -32,18 +33,15 @@

 @attr.s(auto_attribs=True)
 class DummyProductResponse:
-
     data: Dict[str, Any]
 
 
 @attr.s(auto_attribs=True)
 class FakeProductResponse:
-
     data: Dict[str, Any]
 
 
 class DummyProductProvider(PageObjectInputProvider):
-
     provided_classes = {DummyProductResponse}
 
     def __call__(self, to_provide, request: scrapy.Request):
@@ -57,7 +55,6 @@ def __call__(self, to_provide, request: scrapy.Request):


 class FakeProductProvider(PageObjectInputProvider):
-
     provided_classes = {FakeProductResponse}
 
     def __call__(self, to_provide):
@@ -71,7 +68,6 @@ def __call__(self, to_provide):


 class TextProductProvider(HttpResponseProvider):
-
     # This is wrong. You should not annotate provider dependencies with classes
     # like TextResponse or HtmlResponse, you should use Response instead.
     def __call__(self, to_provide, response: TextResponse):
@@ -85,7 +81,6 @@ def __call__(self, to_provide, response: str):

 @attr.s(auto_attribs=True)
 class DummyProductPage(ItemPage):
-
     response: DummyProductResponse
 
     @property
@@ -99,7 +94,6 @@ def to_item(self):

 @attr.s(auto_attribs=True)
 class FakeProductPage(ItemPage):
-
     response: FakeProductResponse
 
     @property
@@ -117,7 +111,6 @@ def to_item(self):


 class MySpider(scrapy.Spider):
-
     name = "foo"
     custom_settings = {
         "SCRAPY_POET_PROVIDERS": {
@@ -360,6 +353,7 @@ def test_is_response_going_to_be_used():
     crawler = Crawler(MySpider)
     spider = MySpider()
     crawler.spider = spider
+    crawler.stats = load_object(crawler.settings["STATS_CLASS"])(crawler)
 
     def response(request):
         return HtmlResponse(request.url, request=request, body=b"<html></html>")