From 8f4c4e531a23ced7b8d9c806fa2cf91323fa33b5 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 25 Aug 2022 13:23:57 +0500 Subject: [PATCH] New ItemPage (#74) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixed fields in subclasses. Fixes GH-69. * more type annotations * items support using a generic ItemPage class * fix tests to account for new behavior * tests for types * run types tests on github actions * fixed typing issues * backwards compat for Python 3.7 * more tests; skip_nonitem_fields argument * fix typing issue * extract SetItemType * update fields tutorial * rename SetItemType to Returns * typo fix * item_cls_fields -> skip_nonitem_fields * added a comment to tox.ini temporary dependency * deprecate ItemWebPage * test WebPage + fields * clean up from-fround-up tutorial * typo fix * Apply suggestions from code review Co-authored-by: Adrián Chaves Co-authored-by: Kevin Lloyd Bernal * ignore B024 flake8-bugbear warning * more tests for ItemPage validation * mention to_item earlier in docs Co-authored-by: Adrián Chaves Co-authored-by: Kevin Lloyd Bernal --- .flake8 | 5 +- .github/workflows/test.yml | 2 +- docs/advanced/additional-requests.rst | 20 +- docs/advanced/fields.rst | 184 ++++++++----------- docs/advanced/page-params.rst | 10 +- docs/advanced/retries.rst | 8 +- docs/api-reference.rst | 3 +- docs/intro/from-ground-up.rst | 222 ++++++++++++----------- docs/intro/overrides.rst | 10 +- docs/intro/tutorial.rst | 8 +- pyproject.toml | 2 +- tests/test_fields.py | 21 +-- tests/test_pages.py | 183 +++++++++++++++++-- tests_typing/test_fields.mypy-testing | 46 +++++ tests_typing/test_item_page.mypy-testing | 90 +++++++++ tox.ini | 12 +- web_poet/__init__.py | 2 +- web_poet/_typing.py | 24 +++ web_poet/fields.py | 68 +++---- web_poet/overrides.py | 4 +- web_poet/pages.py | 52 ++++-- 21 files changed, 638 insertions(+), 338 deletions(-) create mode 100644 tests_typing/test_fields.mypy-testing create mode 
100644 tests_typing/test_item_page.mypy-testing create mode 100644 web_poet/_typing.py diff --git a/.flake8 b/.flake8 index 664832fa..a326177c 100644 --- a/.flake8 +++ b/.flake8 @@ -24,7 +24,10 @@ ignore = D209, # Multi-line docstring closing quotes should be on a separate line D400, # First line should end with a period D401, # First line should be in imperative mood - D402 # First line should not be the function's "signature" + D402, # First line should not be the function's "signature" + + # see https://github.com/PyCQA/flake8-bugbear/issues/278 + B024 # abstract base class without abstract methods per-file-ignores = # F401: Ignore "imported but unused" errors in __init__ files, as those diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2500e052..8898b561 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: fail-fast: false matrix: python-version: ['3.10'] - tox-job: ["mypy", "docs", "linters"] + tox-job: ["mypy", "docs", "linters", "types"] steps: - uses: actions/checkout@v2 diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index 64634f5e..10d41862 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -295,7 +295,7 @@ Executing a HttpRequest instance @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -351,7 +351,7 @@ method on it. 
@attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -396,7 +396,7 @@ Thus, additional requests inside the Page Object are typically needed for it: @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -485,7 +485,7 @@ list of :class:`~.HttpRequest` to be executed in batch using the @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient default_pagination_limit = 10 @@ -586,7 +586,7 @@ from the previous subsection named: :ref:`httpclient-get-example`. @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -664,7 +664,7 @@ For this example, let's improve the code snippet from the previous subsection na @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient default_pagination_limit = 10 @@ -835,7 +835,7 @@ This can be set using: @attrs.define - class SomePage(web_poet.ItemWebPage): + class SomePage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -884,7 +884,7 @@ when creating an :class:`~.HttpClient` instance: @attrs.define - class SomePage(web_poet.ItemWebPage): + class SomePage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -956,7 +956,7 @@ like the ones above, then it would cause the code to look like: @attrs.define - class SomePage(web_poet.ItemWebPage): + class SomePage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): @@ -985,7 +985,7 @@ This makes the code simpler: @attrs.define - class SomePage(web_poet.ItemWebPage): + class SomePage(web_poet.WebPage): http_client: web_poet.HttpClient async def to_item(self): diff --git 
a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 49589d6b..0411571f 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -47,23 +47,15 @@ This approach has 2 main advantages: However, writing and maintaining ``to_item()`` method can get tedious, especially if there is a lot of properties. -web_poet.fields ---------------- - +@field decorator +---------------- To aid writing Page Objects in this style, ``web-poet`` provides -a few utilities: - -* :func:`@web_poet.field ` decorator, -* :func:`web_poet.item_from_fields ` - and :func:`web_poet.item_from_fields_sync ` - functions. - -We can rewrite the example like this: +the :func:`@web_poet.field ` decorator: .. code-block:: python import attrs - from web_poet import ItemPage, HttpResponse, field, item_from_fields_sync + from web_poet import ItemPage, HttpResponse, field @attrs.define @@ -78,24 +70,27 @@ We can rewrite the example like this: def price(self): return self.response.css(".price").get() - def to_item(self) -> dict: - return item_from_fields_sync(self) +:class:`~.ItemPage` has a default ``to_item()`` +implementation: it uses all the properties created with the +:func:`@field ` decorator, and returns +a dict with the result, where keys are method names, and values are +property values. In the example above, ``to_item()`` returns a +``{"name": ..., "price": ...}`` dict with the extracted data. -Methods annotated with :func:`@field ` decorator -become properties; for ``page = MyPage(...)`` instance +Methods annotated with the :func:`@field ` decorator +become properties; for a ``page = MyPage(...)`` instance you can access them as ``page.name``. -As you can guess, :func:`~.item_from_fields_sync` uses all the properties -created with :func:`@field ` decorator, and returns -a dict with the result, where keys are method names, and values are -property values. 
+It's important to note that the default +:meth:`ItemPage.to_item() ` implementation +is an ``async def`` function - make sure to await its result: +``item = await page.to_item()`` Asynchronous fields ------------------- -``async def`` fields are also supported, as well as a mix of -sync and async methods - use :func:`~.item_from_fields` in ``to_item`` -to make it work. +The reason :class:`~.ItemPage` provides an async ``to_item`` method by +default is that both regular and ``async def`` fields are supported. For example, you might need to send :ref:`advanced-requests` to extract some of the attributes: @@ -103,13 +98,13 @@ of the attributes: .. code-block:: python import attrs - from web_poet import ItemPage, HttpResponse, HttpClient, field, item_from_fields + from web_poet import ItemPage, HttpResponse, HttpClient, field @attrs.define class MyPage(ItemPage): response: HttpResponse - http_client: HttpClient + http: HttpClient @field def name(self): @@ -117,25 +112,9 @@ of the attributes: @field async def price(self): - resp = self.http_client.get("...") + resp = await self.http.get("...") return resp.json()['price'] - async def to_item(self) -> dict: - return await item_from_fields(self) - -Because :func:`~.item_from_fields` supports both sync and async fields, -it's recommended to use it over :func:`~.item_from_fields_sync`, even -if there are no async fields yet. The only reason to use -:func:`~.item_from_fields_sync` would be to avoid using -``async def to_item`` method. - -If you want to get a value of an async field, make sure to await it: - -.. code-block:: python - - page = MyPage(...) - price = await page.price - Using Page Objects with async fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -185,31 +164,33 @@ attrs instances) instead of unstructured dicts to hold the data: from web_poet import ItemPage, HttpResponse @attrs.define - class Item: + class Product: name: str price: str @attrs.define - class MyPage(ItemPage): + class ProductPage(ItemPage): # ... 
- def to_item(self) -> Item: - return Item( + def to_item(self) -> Product: + return Product( name=self.name, price=self.price ) -:mod:`web_poet.fields` supports it, by allowing to pass an item class to the -:func:`~.item_from_fields` / :func:`~.item_from_fields_sync` functions: +:mod:`web_poet.fields` supports it, by allowing to parametrize +:class:`~.ItemPage` with an item class: .. code-block:: python @attrs.define - class MyPage(ItemPage): + class ProductPage(ItemPage[Product]): # ... - async def to_item(self) -> Item: - return await item_from_fields(self, item_cls=Item) +When :class:`~.ItemPage` is parametrized with an item class, +its ``to_item()`` method starts to return item instances, instead +of ``dict`` instances. In the example above ``ProductPage.to_item`` method +returns ``Product`` instances. Defining an Item class may be an overkill if you only have a single Page Object, but item classes are of a great help when @@ -217,6 +198,9 @@ but item classes are of a great help when * you need to extract data in the same format from multiple websites, or * if you want to define the schema upfront. +Item classes can also be used to hold common attribute +pre-processing and validation logic. + Error prevention ~~~~~~~~~~~~~~~~ @@ -229,35 +213,32 @@ Consider the following badly written page object: .. code-block:: python import attrs - from web_poet import ItemPage, HttpResponse, field, item_from_fields + from web_poet import ItemPage, HttpResponse, field @attrs.define - class Item: + class Product: name: str price: str @attrs.define - class MyPage(ItemPage): + class ProductPage(ItemPage[Product]): response: HttpResponse @field def nane(self): return self.response.css(".name").get() - async def to_item(self) -> Item: - return await item_from_fields(self, item_cls=Item) - -Because Item class is used, a typo ("nane" instead of "name") is detected -at runtime: creation of Item instance would fail with a ``TypeError``, because -of unexpected keyword argument "nane". 
+Because the ``Product`` item class is used, a typo ("nane" instead of "name") +is detected at runtime: the creation of a ``Product`` instance would fail with +a ``TypeError``, because of the unexpected keyword argument "nane". After fixing it (renaming "nane" method to "name"), another error is going to be -detected: ``price`` argument is required, but there is no extraction method for -this attribute, so ``Item.__init__`` will raise another ``TypeError``, +detected: the ``price`` argument is required, but there is no extraction method for +this attribute, so ``Product.__init__`` will raise another ``TypeError``, indicating that a required argument is missing. -Without an Item class, none of these errors are detected. +Without an item class, none of these errors are detected. Changing Item type ~~~~~~~~~~~~~~~~~~ @@ -278,7 +259,7 @@ different, using the original Page Object as a dependency is a good approach: import attrs from my_library import FooPage, StandardItem - from web_poet import ItemPage, HttpResponse, field, ensure_awaitable, item_from_fields + from web_poet import ItemPage, HttpResponse, field, ensure_awaitable @attrs.define class CustomItem: @@ -286,7 +267,7 @@ different, using the original Page Object as a dependency is a good approach: new_price: str @attrs.define - class CustomFooPage(ItemPage): + class CustomFooPage(ItemPage[CustomItem]): response: HttpResponse standard: FooPage @@ -300,9 +281,6 @@ different, using the original Page Object as a dependency is a good approach: async def new_price(self): ... - async def to_item(self) -> CustomItem: - return await item_from_fields(self, item_cls=CustomItem) - However, if items are similar, and share many attributes, this approach could lead to boilerplate code. For example, you might be extending an item with a new field, and it'd be required to duplicate definitions for all @@ -314,63 +292,63 @@ to the item: .. 
code-block:: python + import attrs + from my_library import FooPage, StandardItem + from web_poet import field, Returns + @attrs.define class CustomItem(StandardItem): new_field: str @attrs.define - class CustomFooPage(FooPage): + class CustomFooPage(FooPage, Returns[CustomItem]): @field def new_field(self) -> str: # ... - async def to_item(self) -> CustomItem: - # we need to override to_item to ensure CustomItem is returned - return await item_from_fields(self, item_cls=CustomItem) +Note how :class:`~.Returns` is used as one of the base classes of +``CustomFooPage``; it allows to change the item type returned by a page object. -Removing fields (as well as renaming) is more tricky with inheritance though. +Removing fields (as well as renaming) is a bit more tricky. -The caveat is that by default :func:`item_from_fields` uses all fields +The caveat is that by default :class:`~.ItemPage` uses all fields defined as ``@field`` to produce an item, passing all these values to -``Item.__init__``. So, if you follow the previous example, and inherit from -the "base", "standard" Page Object, there could be a ``@field`` from the base -class which is not present in the ``CustomItem``. It'd be still passed -to ``CustomItem.__init__``, causing an exception. +item's ``__init__`` method. So, if you follow the previous example, and +inherit from the "base", "standard" Page Object, there could be a ``@field`` +from the base class which is not present in the ``CustomItem``. +It'd be still passed to ``CustomItem.__init__``, causing an exception. -To solve it, you can either +One way to solve it is to make the original Page Object a dependency +instead of inheriting from it, as explained in the beginning. 
-* make the orignal Page Object a dependency instead of inheriting from it - (as explained in the beginning), or -* use ``item_cls_fields=True`` argument of :func:`item_from_fields`: - when ``item_cls_fields`` parameter is True, ``@fields`` which - are not defined in the item are skipped. +Alternatively, you can use ``skip_nonitem_fields=True`` class argument - it tells +:meth:`~.ItemPage.to_item` to skip ``@fields`` which are not defined +in the item: .. code-block:: python @attrs.define - class CustomItem(Item): + class CustomItem: # let's pick only 1 attribute from StandardItem, nothing more name: str - @attrs.define - class CustomFooPage(FooPage): - # inheriting from a page object which defines all StandardItem fields + class CustomFooPage(FooPage, Returns[CustomItem], skip_nonitem_fields=True): + pass - async def to_item(self) -> CustomItem: - return await item_from_fields(self, item_cls=CustomItem, - item_cls_fields=True) Here, ``CustomFooPage.to_item`` only uses ``name`` field of the ``FooPage``, ignoring -all other fields defined in ``FooPage``, because ``item_cls_fields=True`` +all other fields defined in ``FooPage``, because ``skip_nonitem_fields=True`` is passed, and ``name`` is the only field ``CustomItem`` supports. To recap: -* Use ``item_cls_fields=False`` (default) when your Page Object corresponds - to an item exactly, or when you're only adding fields. This is a safe option, - which allows to detect typos in field names, even for optional fields. -* Use ``item_cls_fields=True`` when it's possible for the Page Object +* Use ``Returns[NewItemType]`` to change the item type in a subclass. +* Don't use ``skip_nonitem_fields=True`` when your Page Object corresponds + to an item exactly, or when you're only adding fields. This is a safe + approach, which allows to detect typos in field names, even for optional + fields. 
+* Use ``skip_nonitem_fields=True`` when it's possible for the Page Object to contain more ``@fields`` than defined in the item class, e.g. because Page Object is inherited from some other base Page Object. @@ -405,14 +383,7 @@ extracting the heavy operation to a method, and caching the results: .. code-block:: python - from web_poet import ( - ItemPage, - HttpResponse, - HttpClient, - field, - cached_method, - item_from_fields - ) + from web_poet import ItemPage, HttpResponse, HttpClient, field, cached_method class MyPage(ItemPage): response: HttpResponse @@ -437,9 +408,6 @@ extracting the heavy operation to a method, and caching the results: api_response = await self.api_response() return api_response["sku"] - async def to_item(self): - return await item_from_fields(self) - As you can see, ``web-poet`` provides :func:`~.cached_method` decorator, which allows to memoize the function results. It supports both sync and async methods, i.e. you can use it on regular methods (``def foo(self)``), @@ -517,6 +485,6 @@ returns a dictionary, where keys are field names, and values are fields_dict = get_fields_dict(MyPage) field_names = fields_dict.keys() my_field_meta = fields_dict["my_field"].meta - + print(field_names) # dict_keys(['my_field']) print(my_field_meta) # {'expensive': True} diff --git a/docs/advanced/page-params.rst b/docs/advanced/page-params.rst index 78560398..a16c6ee5 100644 --- a/docs/advanced/page-params.rst +++ b/docs/advanced/page-params.rst @@ -9,7 +9,7 @@ them. Such information can dictate the behavior of the Page Object or affect its data entirely depending on the needs of the developer. If you can recall from the previous basic tutorials, one essential requirement of -Page Objects that inherit from :class:`~.WebPage` or :class:`~.ItemWebPage` would +Page Objects that inherit from :class:`~.WebPage` would be :class:`~.HttpResponse`. This holds the HTTP response information that the Page Object is trying to represent. 
@@ -23,8 +23,8 @@ we'll need to use :class:`~.PageParams` similar on how we use import web_poet @attrs.define - class SomePage(web_poet.ItemWebPage): - # The HttpResponse attribute is inherited from ItemWebPage + class SomePage(web_poet.WebPage): + # The HttpResponse attribute is inherited from WebPage page_params: web_poet.PageParams # Assume that it's constructed with the necessary arguments taken somewhere. @@ -51,7 +51,7 @@ Controlling item values @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): page_params: web_poet.PageParams default_tax_rate = 0.10 @@ -94,7 +94,7 @@ Let's try an example wherein :class:`~.PageParams` is able to control how @attrs.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http_client: web_poet.HttpClient page_params: web_poet.PageParams diff --git a/docs/advanced/retries.rst b/docs/advanced/retries.rst index 5a806920..12c7f302 100644 --- a/docs/advanced/retries.rst +++ b/docs/advanced/retries.rst @@ -23,10 +23,10 @@ supplies to your page object, your page object must raise .. code-block:: python - from web_poet import ItemWebPage + from web_poet import WebPage from web_poet.exceptions import Retry - class MyPage(ItemWebPage): + class MyPage(WebPage): def to_item(self) -> dict: if not self.css(".expected"): @@ -62,10 +62,10 @@ times before giving up: import attrs from tenacity import retry, stop_after_attempt - from web_poet import HttpClient, HttpRequest, ItemWebPage + from web_poet import HttpClient, HttpRequest, WebPage @attrs.define - class MyPage(ItemWebPage): + class MyPage(WebPage): http_client: HttpClient @retry(stop=stop_after_attempt(3)) diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 0358c762..2f931e86 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -51,10 +51,9 @@ Pages :inherited-members: :no-special-members: -.. autoclass:: ItemWebPage +.. 
autoclass:: Returns :show-inheritance: :members: - :no-special-members: Mixins ====== diff --git a/docs/intro/from-ground-up.rst b/docs/intro/from-ground-up.rst index 34399c40..9af252d5 100644 --- a/docs/intro/from-ground-up.rst +++ b/docs/intro/from-ground-up.rst @@ -91,40 +91,46 @@ No problem, let's refactor it further. You may end up with something like that: .. code-block:: python import aiohttp + from dataclasses import dataclass import requests import parsel + @dataclass + class Response: + url: str + text: str + # === Extraction code - def extract_book(url, text): + def extract_book(response: Response) -> dict: """ Extract book information from a book page on http://books.toscrape.com website, e.g. from http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ - sel = parsel.Selector(text) + sel = parsel.Selector(response.text) return { - "url": url, + "url": response.url, "title": sel.css("h1").get(), "description": sel.css("#product_description+ p").get().strip(), # ... 
} # === Framework-specific I/O code - def download_sync(url): + def download_sync(url) -> Response: resp = requests.get(url) - return {"url": resp.url, "text": resp.text} + return Response(url=resp.url, text=resp.text) - async def download_async(url): + async def download_async(url) -> Response: async with aiohttp.ClientSession() as session: async with session.get(url) as response: text = await response.text() - return {"url": url, "text": text} + return Response(url=url, text=text) # === Usage example - # the way to get resp_data depends on an HTTP client - resp_data = download_sync("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html") + # the way to get the Response instance depends on an HTTP client + resp = download_sync("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html") - # but after we got resp_data, usage is the same - item = extract_book(url=resp_data["url"], text=resp_data["text"]) + # but after we got the response, the usage is the same + item = extract_book(resp) ``extract_book`` function now has all the desired properties: it is @@ -135,35 +141,38 @@ The same, but using web-poet ============================ ``web-poet`` asks you to organize code in a very similar way. Let's convert -``extract_book`` function to a Page Object, by defining BookPage class: +``extract_book`` function to a Page Object, by defining the BookPage class: .. code-block:: python import aiohttp import requests - from web_poet import WebPage, HttpResponse + from web_poet import ItemPage, HttpResponse # === Extraction code - class BookPage(WebPage): + class BookPage(ItemPage): """ A book page on http://books.toscrape.com website, e.g. 
http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ - def extract_book(self): + def __init__(self, response: HttpResponse): + self.response = response + + def to_item(self) -> dict: return { - "url": self.url, - "title": self.css("h1").get(), - "description": self.css("#product_description+ p").get().strip(), + "url": self.response.url, + "title": self.response.css("h1").get(), + "description": self.response.css("#product_description+ p").get().strip(), # ... } # === Framework-specific I/O code - def download_sync(url): + def download_sync(url) -> HttpResponse: resp = requests.get(url) return HttpResponse(url=resp.url, body=resp.content, headers=resp.headers) - async def download_async(url): + async def download_async(url) -> HttpResponse: async with aiohttp.ClientSession() as session: async with session.get(url) as response: body = await response.content.read() @@ -178,32 +187,41 @@ The same, but using web-poet # but after we got the response, the usage is the same book_page = BookPage(response=response) - item = book_page.extract_book() + item = book_page.to_item() Differences from a previous example: -* instead of dicts with "url" and "text" fields, :class:`~.HttpResponse` - instances are used. :class:`~.HttpResponse` is a structure - defined by web-poet acting as a generic data container for HTTP Responses. - *(check out the API reference of* :class:`~.HttpResponse` *for more info - about the fields it holds)* +* web-poet provides a standard :class:`~.HttpResponse` class, with helper + methods like :meth:`~.HttpResponse.css`. Note how headers are passed when creating :class:`~.HttpResponse` instance. This is needed to decode body (which is ``bytes``) to unicode properly, using the web browser rules. It involves checking ``Content-Encoding`` header, meta tags in HTML, BOM markers in the body, etc. 
-* instead of ``extract_book`` function we got ``BookPage`` class, - which receives response data in its ``__init__`` method - see how it - is created: ``BookPage(response=response)``. -* ``BookPage`` inherits from :class:`~.WebPage` base class. This base class - is not doing much: it +* instead of the ``extract_book`` function we've got a ``BookPage`` class, + which inherits from the :class:`~.ItemPage` base class, receives response + data in its ``__init__`` method, and returns the extracted item + in the ``to_item()`` method. ``to_item`` is a standard method name + used by ``web-poet``. - * defines ``__init__`` method which receives :class:`~.HttpResponse`, and - * provides shortcut methods like :meth:`~.WebPage.css`, which work by - creating :external:py:class:`parsel.selector.Selector` behind the scenes - (so that you don't need to create a selector in the ``extract_book`` method). +Receiving a ``response`` argument in ``__init__`` is very common for page +objects, so ``web-poet`` provides a shortcut for it: inherit from +:class:`~.WebPage`, which provides this ``__init__`` method implementation: + +.. code-block:: python + + from web_poet import WebPage + + class BookPage(WebPage): + def to_item(self) -> dict: + return { + "url": self.response.url, + "title": self.response.css("h1").get(), + "description": self.response.css("#product_description+ p").get().strip(), + # ... + } There are pros and cons for using classes vs functions for writing such extraction code, but the distinction is not that important; @@ -215,33 +233,7 @@ to_item() method It is common to have Page Objects for a web page where a single main data record needs to be extracted (e.g. book information in our example). ``web-poet`` standardizes this, by asking to name a method implementing the -extraction ``to_item``. It also provides the :class:`~.ItemWebPage` base class -and the :class:`ItemPage` mixin, which ensure the ``to_item`` method -is implemented. 
Let's change the code to follow this standard: - -.. code-block:: python - - import requests - from web_poet import ItemWebPage, HttpResponse - - - # === Extraction code - class BookPage(ItemWebPage): - """ - A book page on http://books.toscrape.com website, e.g. - http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html - """ - def to_item(self): - return { - "url": self.url, - "title": self.css("h1").get(), - "description": self.css("#product_description+ p").get().strip(), - # ... - } - - # ... get resp_data somehow - book_page = BookPage(response=response) - item = book_page.to_item() +extraction ``to_item``. As the method name is now standardized, the code which creates a Page Object instance can now work for other Page Objects like that. For example, you can @@ -249,7 +241,7 @@ have ``ToscrapeBookPage`` and ``BamazonBookPage`` classes, and .. code-block:: python - def get_item(page_cls: ItemWebPage, response: HttpResponse) -> dict: + def get_item(page_cls: WebPage, response: HttpResponse) -> dict: page = page_cls(response=response) return page.to_item() @@ -263,8 +255,8 @@ it for free: def get_item(extract_func, response: HttpResponse) -> dict: return extract_func(url=response.url, text=response.text) -No need to agree on ``to_item`` name and have a base class to check that the -method is implemented. Why bother with classes then? +No need to agree on ``to_item`` name or have a base class. +Why bother with classes then? Classes for web scraping code ============================= @@ -275,7 +267,7 @@ For example, we can extract logic for different attributes into properties: .. code-block:: python - class BookPage(ItemWebPage): + class BookPage(WebPage): """ A book page on http://books.toscrape.com website, e.g. 
http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html @@ -283,15 +275,15 @@ For example, we can extract logic for different attributes into properties: @property def title(self): - return self.css("h1").get() + return self.response.css("h1").get() @property def description(self): - return self.css("#product_description+ p").get().strip() + return self.response.css("#product_description+ p").get().strip() def to_item(self): return { - "url": self.url, + "url": self.response.url, "title": self.title, "description": self.description, # ... @@ -301,22 +293,35 @@ It might be easier to read the code written this way. Also, this style allows to extract only some of the attributes - if you don't need the complete to_item() output, you still can access individual properties. -.. note:: - web-poet provides a small framework to simplify writing Page Objects - in this style; see :ref:`web-poet-fields` . - -You may even write some base class to make it nicer - e.g. helper descriptors -to define properties from CSS selectors, and a default ``to_item`` -implementation (so, no need to define ``to_item``). -This is currently not implemented in ``web-poet``, but -nothing prevents us from having a DSL like this: +web-poet provides a small framework to simplify writing Page Objects +in this style; see :ref:`web-poet-fields`. The example above can be simplified +using web-poet fields - there is no need to write ``to_item`` boilerplate: .. code-block:: python - class BookPage(ItemWebPage): - title = Css("h1") - description = Css("#product_description+ p") | Strip() - url = TakeUrl() + from web_poet import WebPage, field + + class BookPage(WebPage): + """ + A book page on http://books.toscrape.com website, e.g. 
+ http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html + """ + + @field + def title(self): + return self.response.css("h1").get() + + @field + def description(self): + return self.response.css("#product_description+ p").get().strip() + + @field + def url(self): + return self.response.url + +.. note:: + The ``BookPage.to_item()`` method is ``async`` in the example above. + Make sure to check :ref:`web-poet-fields` if you want to use web-poet fields. Another reason to consider classes for the extraction code is that sometimes there is no a single "main" method, but you still want to group the related code. @@ -342,10 +347,10 @@ pages and pagination URLs: class BookListPage(ProductListingPage): def item_urls(self): - return self.css(".product a::attr(href)").getall() + return self.response.css(".product a::attr(href)").getall() def page_urls(self): - return self.css(".paginator a::attr(href)").getall() + return self.response.css(".paginator a::attr(href)").getall() Web Scraping Frameworks @@ -368,8 +373,8 @@ Let's recall the example we started with: sel = parsel.Selector(resp) return { "url": resp.url, - "title": sel.css("h1").get(), - "description": sel.css("#product_description+ p").get().strip(), + "title": sel.css("h1").get(), + "description": sel.css("#product_description+ p").get().strip(), # ... } @@ -380,19 +385,19 @@ And this is what we ended up with: .. code-block:: python import requests - from web_poet import ItemWebPage, HttpResponse + from web_poet import WebPage, HttpResponse # === Extraction code - class BookPage(ItemWebPage): + class BookPage(WebPage): """ A book page on http://books.toscrape.com website, e.g. 
http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ def to_item(self): return { - "url": self.url, - "title": self.css("h1").get(), - "description": self.css("#product_description+ p").get().strip(), + "url": self.response.url, + "title": self.response.css("h1").get(), + "description": self.response.css("#product_description+ p").get().strip(), # ... } @@ -410,7 +415,7 @@ And this is what we ended up with: return HttpResponse(url=resp.url, body=body, headers=headers) # === Usage example - def get_item(page_cls: ItemWebPage, resp_data: HttpResponse) -> dict: + def get_item(page_cls: WebPage, resp_data: HttpResponse) -> dict: page = page_cls(response=resp_data) return page.to_item() @@ -439,17 +444,17 @@ would only need to write the "extraction" part: .. code-block:: python - from web_poet import ItemWebPage + from web_poet import WebPage - class BookPage(ItemWebPage): + class BookPage(WebPage): """ A book page on http://books.toscrape.com website, e.g. http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ def to_item(self): return { - "url": self.url, - "title": self.css("h1").get(), - "description": self.css("#product_description+ p").get().strip(), + "url": self.response.url, + "title": self.response.css("h1").get(), + "description": self.response.css("#product_description+ p").get().strip(), # ... } @@ -460,6 +465,7 @@ to process, and that's it: item = some_framework.extract(url, BookPage) +``web-poet`` **does not** provide such a framework. The role of ``web-poet`` is to define a standard on how to write the extraction logic, and allow it to be reused in different frameworks. ``web-poet`` Page Objects should be flexible enough to be used with: @@ -470,7 +476,6 @@ extraction logic, and allow it to be reused in different frameworks. * different underlying HTTP implementations - or without HTTP support at all, etc. 
- Page Objects ============ @@ -515,10 +520,9 @@ For example, a very basic Page Object could look like this: } There is no *need* to use other base classes and mixins -defined by ``web-poet`` (:class:`~.WebPage`, :class:`~.ResponseShortcutsMixin`, -:class:`~.ItemPage`, :class:`~.ItemWebPage`, etc.), but it can be a good -idea to familiarize yourself with them, as they are taking some of -the boilerplate out. +defined by ``web-poet`` (:class:`~.WebPage`, :class:`~.ItemPage`, etc.), +but it can be a good idea to familiarize yourself with them, as they are +taking some of the boilerplate out. Page Object Inputs ================== @@ -555,7 +559,7 @@ You may define page objects for this task: .. code-block:: python class BamazonBookPage(Injectable): - def __init__(self, response: SplashResponseData): + def __init__(self, response: SplashResponse): self.response = response def to_item(self): @@ -618,7 +622,7 @@ a different type annotation should be used: .. code-block:: python class BamazonBookPage(Injectable): - def __init__(self, response: SplashResponseData): + def __init__(self, response: SplashResponse): self.response = response class ToScrapeBookPage(Injectable): @@ -628,7 +632,7 @@ a different type annotation should be used: For each possible input a separate class needs to be defined, even if the data has the same format. For example, both :class:`~.HttpResponse` and -``SplashResponseData`` may have the same ``url`` and ``text`` properties, +``SplashResponse`` may have the same ``url`` and ``text`` properties, but they can't be the same class, because they need to work as "markers" - tell frameworks if the html should be taken from HTTP response body or from Splash DOM snapshot. @@ -708,10 +712,10 @@ Then, framework's role is to: (a common case is ``to_item``). 
For example, ``web-poet`` + Scrapy integration package (scrapy-poet_) -may inspect a WebPage subclass you defined, figure out it needs -:class:`~.HttpResponse` and nothing else, fetch scrapy's ``TextResponse``, -create :class:`~.HttpResponse` instance from it, create your -Page Object instance, and pass it to a spider callback. +inspects a WebPage subclass you defined, figures out it needs +:class:`~.HttpResponse` and nothing else, fetches Scrapy's ``TextResponse``, +creates an :class:`~.HttpResponse` instance from it, creates your +Page Object instance, and passes it to a spider callback. Finally, the Developer's role is to: diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index a43b0252..0939e13b 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -54,28 +54,28 @@ Let's take a look at how the following code is structured: .. code-block:: python - from web_poet import handle_urls, ItemWebPage + from web_poet import handle_urls, WebPage - class GenericProductPage(ItemWebPage): + class GenericProductPage(WebPage): def to_item(self): return {"product-title": self.css("title::text").get()} @handle_urls("example.com", overrides=GenericProductPage) - class ExampleProductPage(ItemWebPage): + class ExampleProductPage(WebPage): def to_item(self): ... # more specific parsing @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") - class AnotherExampleProductPage(ItemWebPage): + class AnotherExampleProductPage(WebPage): def to_item(self): ... # more specific parsing @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) - class DualExampleProductPage(ItemWebPage): + class DualExampleProductPage(WebPage): def to_item(self): ... 
# more specific parsing diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 01e681b8..ab30bbbb 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -19,10 +19,10 @@ list page on `books.toscrape.com `_. .. code-block:: python - from web_poet.pages import ItemWebPage + from web_poet.pages import WebPage - class BookLinksPage(ItemWebPage): + class BookLinksPage(WebPage): @property def links(self): @@ -74,11 +74,11 @@ Our simple Python script might look like this: import requests - from web_poet.pages import ItemWebPage + from web_poet.pages import WebPage from web_poet.page_inputs import HttpResponse - class BookLinksPage(ItemWebPage): + class BookLinksPage(WebPage): @property def links(self): diff --git a/pyproject.toml b/pyproject.toml index 82912dca..892b5aab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,4 +5,4 @@ multi_line_output = 3 [tool.mypy] show_error_codes = true ignore_missing_imports = true -no_warn_no_return = true \ No newline at end of file +no_warn_no_return = true diff --git a/tests/test_fields.py b/tests/test_fields.py index 98193bc1..5df22555 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -6,6 +6,7 @@ from web_poet import ( HttpResponse, + Injectable, ItemPage, field, item_from_fields, @@ -21,7 +22,7 @@ class Item: @attrs.define -class Page(ItemPage): +class Page(ItemPage[Item]): response: HttpResponse @field @@ -33,12 +34,9 @@ async def price(self): # noqa: D102 await asyncio.sleep(0.01) return "$123" - async def to_item(self): # noqa: D102 - return await item_from_fields(self, Item) - @attrs.define -class InvalidPage(ItemPage): +class InvalidPage(ItemPage[Item]): response: HttpResponse @field @@ -49,9 +47,6 @@ def name(self): # noqa: D102 def unknown_attribute(self): # noqa: D102 return "foo" - async def to_item(self): # noqa: D102 - return await item_from_fields(self, Item) - EXAMPLE_RESPONSE = HttpResponse( "http://example.com", @@ -226,7 +221,7 @@ async def n_called(self): 
@pytest.mark.asyncio -async def test_item_cls_fields_async(): +async def test_skip_nonitem_fields_async(): class ExtendedPage(Page): @field def new_attribute(self): @@ -238,16 +233,16 @@ def new_attribute(self): class ExtendedPage2(ExtendedPage): async def to_item(self) -> Item: - return await item_from_fields(self, Item, item_cls_fields=True) + return await item_from_fields(self, Item, skip_nonitem_fields=True) page = ExtendedPage2(response=EXAMPLE_RESPONSE) item = await page.to_item() assert item == Item(name="Hello!", price="$123") -def test_item_cls_fields(): +def test_skip_nonitem_fields(): @attrs.define - class SyncPage(ItemPage): + class SyncPage(Injectable): response: HttpResponse @field @@ -272,7 +267,7 @@ def new_attribute(self): class ExtendedPage2(ExtendedPage): def to_item(self) -> Item: - return item_from_fields_sync(self, Item, item_cls_fields=True) + return item_from_fields_sync(self, Item, skip_nonitem_fields=True) page = ExtendedPage2(response=EXAMPLE_RESPONSE) item = page.to_item() diff --git a/tests/test_pages.py b/tests/test_pages.py index b546f567..7be4c482 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -1,22 +1,27 @@ -import pytest - -from web_poet.pages import ItemPage, ItemWebPage, is_injectable +from typing import Optional +import attrs +import pytest -def test_abstract_page_object(): - with pytest.raises(TypeError) as exc: - ItemPage() - assert "Can't instantiate abstract class" in str(exc.value) +from web_poet import HttpResponse, field +from web_poet.pages import ( + Injectable, + ItemPage, + ItemT, + ItemWebPage, + Returns, + WebPage, + is_injectable, +) -def test_abstract_web_page_object(): - with pytest.raises(TypeError) as exc: - ItemWebPage() - assert "Can't instantiate abstract class" in str(exc.value) +@attrs.define +class Item: + name: str def test_page_object(): - class MyItemPage(ItemPage): + class MyItemPage(Injectable): def to_item(self) -> dict: return { "foo": "bar", @@ -29,8 +34,8 @@ def to_item(self) -> 
dict: def test_web_page_object(book_list_html_response): - class MyWebPage(ItemWebPage): - def to_item(self) -> dict: + class MyWebPage(WebPage): + def to_item(self) -> dict: # type: ignore return { "url": self.url, "title": self.css("title::text").get().strip(), @@ -43,12 +48,21 @@ def to_item(self) -> dict: } +def test_item_web_page_deprecated(): + with pytest.warns( + DeprecationWarning, match="deprecated class web_poet.pages.ItemWebPage" + ): + + class MyItemWebPage(ItemWebPage): + pass + + def test_is_injectable(): class MyClass: pass class MyItemPage(ItemPage): - def to_item(self) -> dict: + def to_item(self) -> dict: # type: ignore return { "foo": "bar", } @@ -60,3 +74,142 @@ def to_item(self) -> dict: assert is_injectable(MyItemPage()) is False assert is_injectable(ItemPage) is True assert is_injectable(ItemWebPage) is True + + +@pytest.mark.asyncio +async def test_item_page_typed(): + class MyPage(ItemPage[Item]): + @field + def name(self): + return "name" + + page = MyPage() + assert page.item_cls is Item + item = await page.to_item() + assert isinstance(item, Item) + assert item == Item(name="name") + + +@pytest.mark.asyncio +async def test_web_page_fields(): + class MyPage(WebPage[Item]): + @field + def name(self): + return "name" + + page = MyPage(HttpResponse(url="http://example.com", body=b"")) + assert page.item_cls is Item + item = await page.to_item() + assert isinstance(item, Item) + assert item == Item(name="name") + + +@pytest.mark.asyncio +async def test_item_page_typed_subclass(): + class BasePage(ItemPage[ItemT]): + @field + def name(self): + return "name" + + class Subclass(BasePage[Item]): + pass + + page = BasePage() + assert page.item_cls is dict + assert (await page.to_item()) == {"name": "name"} + + page2 = Subclass() + assert page2.item_cls is Item + assert (await page2.to_item()) == Item(name="name") + + +@pytest.mark.asyncio +async def test_item_page_fields_typo(): + class MyPage(ItemPage[Item]): + @field + def nane(self): + return 
"name" + + page = MyPage() + assert page.item_cls is Item + with pytest.raises(TypeError, match="unexpected keyword argument 'nane'"): + await page.to_item() + + +@pytest.mark.asyncio +async def test_item_page_required_field_missing(): + @attrs.define + class MyItem: + name: str + price: Optional[float] + + class MyPage(ItemPage[MyItem]): + @field + def price(self): + return 100 + + page = MyPage() + assert page.item_cls is MyItem + with pytest.raises( + TypeError, match="missing 1 required positional argument: 'name'" + ): + await page.to_item() + + +@pytest.mark.asyncio +async def test_item_page_change_item_type_extra_fields() -> None: + class BasePage(ItemPage[Item]): + @field + def name(self): + return "hello" + + @attrs.define + class MyItem(Item): + price: float + + class Subclass(BasePage, Returns[MyItem]): + @field + def price(self): + return 123 + + page = Subclass() + assert page.item_cls is MyItem + item = await page.to_item() + assert isinstance(item, MyItem) + assert item == MyItem(name="hello", price=123) + + +@pytest.mark.asyncio +async def test_item_page_change_item_type_remove_fields() -> None: + @attrs.define + class MyItem: + name: str + price: float + + class BasePage(ItemPage[MyItem]): + @field + def name(self): + return "hello" + + @field + def price(self): + return 123 + + # Item only contains "name", but not "price" + class Subclass(BasePage, Returns[Item], skip_nonitem_fields=True): + pass + + page = Subclass() + assert page.item_cls is Item + item = await page.to_item() + assert isinstance(item, Item) + assert item == Item(name="hello") + + # Item only contains "name", but not "price", but "price" should be passed + class SubclassStrict(BasePage, Returns[Item]): + pass + + page2 = SubclassStrict() + assert page2.item_cls is Item + with pytest.raises(TypeError, match="unexpected keyword argument 'price'"): + await page2.to_item() diff --git a/tests_typing/test_fields.mypy-testing b/tests_typing/test_fields.mypy-testing new file mode 100644 
index 00000000..ad377651 --- /dev/null +++ b/tests_typing/test_fields.mypy-testing @@ -0,0 +1,46 @@ +import pytest +import attrs + +from web_poet import ( + ItemPage, + field, + item_from_fields, + item_from_fields_sync, +) + + +class Page(ItemPage): + @field + def name(self): + return "hello" + + +@attrs.define +class Item: + name: str + + +@pytest.mark.mypy_testing +async def test_item_from_fields() -> None: + page = Page() + item1 = await item_from_fields(page, item_cls=dict) + reveal_type(item1) # R: builtins.dict[Any, Any] + item2 = await item_from_fields(page, item_cls=Item) + reveal_type(item2) # R: __main__.Item + + +@pytest.mark.mypy_testing +def test_item_from_fields_sync() -> None: + page = Page() + item1 = item_from_fields_sync(page, item_cls=dict) + reveal_type(item1) # R: builtins.dict[Any, Any] + item2 = item_from_fields_sync(page, item_cls=Item) + reveal_type(item2) # R: __main__.Item + + +@pytest.mark.mypy_testing +@pytest.mark.xfail +async def test_item_from_fields_default_item_cls() -> None: + page = Page() + item1 = await item_from_fields(page) + reveal_type(item1) # R: builtins.dict[Any, Any] diff --git a/tests_typing/test_item_page.mypy-testing b/tests_typing/test_item_page.mypy-testing new file mode 100644 index 00000000..827d673d --- /dev/null +++ b/tests_typing/test_item_page.mypy-testing @@ -0,0 +1,90 @@ +import attr +import pytest +import attrs + +from web_poet import ItemPage, field, Returns +from web_poet.pages import ItemT + + +@attrs.define +class Item: + name: str + + +@pytest.mark.mypy_testing +@pytest.mark.xfail +async def test_item_page() -> None: + class MyPage(ItemPage): + @field + def name(self): + return "hello" + + page = MyPage() + item = await page.to_item() + reveal_type(item) # R: dict + + +@pytest.mark.mypy_testing +async def test_item_page_parametrized() -> None: + class MyPage(ItemPage[Item]): + @field + def name(self): + return "hello" + + page = MyPage() + item = await page.to_item() + reveal_type(item) # R: 
__main__.Item + + +@pytest.mark.mypy_testing +async def test_item_page_parametrized_subclass() -> None: + class BasePage(ItemPage[Item]): + @field + def name(self): + return "hello" + + + class Subclass(BasePage): + pass + + page = Subclass() + item = await page.to_item() + reveal_type(item) # R: __main__.Item + + +@pytest.mark.mypy_testing +async def test_item_page_subclass_parametrized() -> None: + class BasePage(ItemPage[ItemT]): + @field + def name(self): + return "hello" + + class Subclass(BasePage[Item]): + pass + + page = Subclass() + item = await page.to_item() + reveal_type(item) # R: __main__.Item + + +@pytest.mark.mypy_testing +@pytest.mark.xfail +async def test_item_page_change_type() -> None: + + class BasePage(ItemPage[Item]): + @field + def name(self): + return "hello" + + @attr.define + class MyItem(Item): + price: float + + class Subclass(BasePage, Returns[MyItem]): + @field + def price(self): + return 123 + + page = Subclass() + item = await page.to_item() + reveal_type(item) # R: MyItem diff --git a/tox.ini b/tox.ini index aa2015fd..9d61e124 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,mypy,docs +envlist = py37,py38,py39,py310,mypy,docs,types [pytest] asyncio_mode = strict @@ -29,6 +29,16 @@ deps = commands = mypy web_poet tests +[testenv:types] +deps = + {[testenv]deps} + {[testenv:mypy]deps} + # waiting for https://github.com/davidfritzsche/pytest-mypy-testing/pull/31 + https://github.com/kmike/pytest-mypy-testing/archive/refs/heads/async-support.zip +; pytest-mypy-testing==0.0.11 + +commands = py.test {posargs: tests_typing} + [docs] changedir = docs deps = diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 1937168e..0d7fb36e 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -13,7 +13,7 @@ RequestUrl, ResponseUrl, ) -from .pages import Injectable, ItemPage, ItemWebPage, WebPage +from .pages import Injectable, ItemPage, ItemWebPage, Returns, WebPage from .requests import 
request_downloader_var from .utils import cached_method diff --git a/web_poet/_typing.py b/web_poet/_typing.py new file mode 100644 index 00000000..a5ec9fed --- /dev/null +++ b/web_poet/_typing.py @@ -0,0 +1,24 @@ +"""Utilities for typing""" +import typing + +if hasattr(typing, "get_args"): + _get_args = typing.get_args +else: + + def _get_args(base): + return getattr(base, "__args__", ()) + + +def is_generic_alias(obj) -> bool: + for attr_name in ["GenericAlias", "_GenericAlias"]: + if hasattr(typing, attr_name): + if isinstance(obj, getattr(typing, attr_name)): + return True + return False + + +def get_generic_parameter(cls): + for base in cls.__orig_bases__: + if is_generic_alias(base): + args = _get_args(base) + return args[0] diff --git a/web_poet/fields.py b/web_poet/fields.py index e616855a..96a271ef 100644 --- a/web_poet/fields.py +++ b/web_poet/fields.py @@ -1,31 +1,9 @@ """ -``web_poet.fields`` is a module with helpers for defining Page Objects. -It allows to define Page Objects in the following way: - -.. code-block:: python - - from web_poet import ItemPage, field, item_from_fields - - - class MyPage(ItemWebPage): - @field - def name(self): - return self.response.css(".name").get() - - @field - def price(self): - return self.response.css(".price").get() - - @field - def currency(self): - return "USD" - - async def to_item(self): - return await item_from_fields(self) - +``web_poet.fields`` is a module with helpers for putting extraction logic +into separate Page Object methods / properties. 
""" from functools import update_wrapper -from typing import Dict, Optional +from typing import Dict, List, Optional, Type, TypeVar import attrs from itemadapter import ItemAdapter @@ -68,8 +46,8 @@ def __init_subclass__(cls, **kwargs): def field(method=None, *, cached: bool = False, meta: Optional[dict] = None): """ Page Object method decorated with ``@field`` decorator becomes a property, - which is used by :func:`item_from_fields` or :func:`item_from_fields_sync` - to populate item attributes. + which is then used by :class:`~.ItemPage`'s to_item() method to populate + a corresponding item attribute. By default, the value is computed on each property access. Use ``@field(cached=True)`` to cache the property value. @@ -118,33 +96,47 @@ def get_fields_dict(cls_or_instance) -> Dict[str, FieldInfo]: return getattr(cls_or_instance, _FIELDS_INFO_ATTRIBUTE_READ, {}) -async def item_from_fields(obj, item_cls=dict, *, item_cls_fields=False): +T = TypeVar("T") + + +# FIXME: type is ignored as a workaround for https://github.com/python/mypy/issues/3737 +# inference works properly if a non-default item_cls is passed; for dict +# it's not working (return type is Any) +async def item_from_fields( + obj, item_cls: Type[T] = dict, *, skip_nonitem_fields: bool = False # type: ignore[assignment] +) -> T: """Return an item of ``item_cls`` type, with its attributes populated from the ``obj`` methods decorated with :class:`field` decorator. - If ``item_cls_fields`` is True, ``@fields`` whose names don't match - any of the ``item_cls`` attributes are not passed to ``item_cls.__init__``. - When ``item_cls_fields`` is False (default), all ``@fields`` are passed - to ``item_cls.__init__``. + If ``skip_nonitem_fields`` is True, ``@fields`` whose names are not + among ``item_cls`` field names are not passed to ``item_cls.__init__``. 
+ + When ``skip_nonitem_fields`` is False (default), all ``@fields`` are passed + to ``item_cls.__init__``, possibly causing exceptions if + ``item_cls.__init__`` doesn't support them. """ - item_dict = item_from_fields_sync(obj, item_cls=dict, item_cls_fields=False) - field_names = item_dict.keys() - if item_cls_fields: + item_dict = item_from_fields_sync(obj, item_cls=dict, skip_nonitem_fields=False) + field_names = list(item_dict.keys()) + if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls( **{name: await ensure_awaitable(item_dict[name]) for name in field_names} ) -def item_from_fields_sync(obj, item_cls=dict, *, item_cls_fields=False): +def item_from_fields_sync( + obj, item_cls: Type[T] = dict, *, skip_nonitem_fields: bool = False # type: ignore[assignment] +) -> T: """Synchronous version of :func:`item_from_fields`.""" field_names = list(get_fields_dict(obj)) - if item_cls_fields: + if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls(**{name: getattr(obj, name) for name in field_names}) -def _without_unsupported_field_names(item_cls, field_names): +def _without_unsupported_field_names( + item_cls: type, field_names: List[str] +) -> List[str]: item_field_names = ItemAdapter.get_field_names_from_class(item_cls) if item_field_names is None: # item_cls doesn't define field names upfront return field_names[:] diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 9c962ffa..bf4d0e35 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -69,10 +69,10 @@ class PageObjectRegistry(dict): .. code-block:: python - from web_poet import handle_urls, default_registry, ItemWebPage + from web_poet import handle_urls, default_registry, WebPage @handle_urls("example.com", overrides=ProductPageObject) - class ExampleComProductPage(ItemWebPage): + class ExampleComProductPage(WebPage): ... 
override_rules = default_registry.get_overrides() diff --git a/web_poet/pages.py b/web_poet/pages.py index fdbf1220..5d268759 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -3,9 +3,11 @@ import attr -from web_poet.fields import FieldsMixin +from web_poet._typing import get_generic_parameter +from web_poet.fields import FieldsMixin, item_from_fields from web_poet.mixins import ResponseShortcutsMixin from web_poet.page_inputs import HttpResponse +from web_poet.utils import _create_deprecated_class class Injectable(abc.ABC, FieldsMixin): @@ -35,33 +37,47 @@ def is_injectable(cls: typing.Any) -> bool: return isinstance(cls, type) and issubclass(cls, Injectable) -class ItemPage(Injectable, abc.ABC): - """Base Page Object with a required :meth:`to_item` method. - Make sure you're creating Page Objects with ``to_item`` methods - if their main goal is to extract a single data record from a web page. +ItemT = typing.TypeVar("ItemT") + + +class Returns(typing.Generic[ItemT]): + """Inherit from this generic mixin to change the item type used by + :class:`~.ItemPage`""" + + @property + def item_cls(self) -> typing.Type[ItemT]: + """Item class""" + param = get_generic_parameter(self.__class__) + if isinstance(param, typing.TypeVar): # class is not parametrized + return dict # type: ignore[return-value] + return param + + +class ItemPage(Injectable, Returns[ItemT]): + """Base Page Object, with a default :meth:`to_item` implementation + which supports web-poet fields. 
""" - @abc.abstractmethod - def to_item(self): + _skip_nonitem_fields: bool + + def __init_subclass__(cls, skip_nonitem_fields: bool = False, **kwargs): + super().__init_subclass__(**kwargs) + cls._skip_nonitem_fields = skip_nonitem_fields + + async def to_item(self) -> ItemT: """Extract an item from a web page""" + return await item_from_fields( + self, item_cls=self.item_cls, skip_nonitem_fields=self._skip_nonitem_fields + ) @attr.s(auto_attribs=True) -class WebPage(Injectable, ResponseShortcutsMixin): +class WebPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.HttpResponse` and provides XPath / CSS shortcuts. - - Use this class as a base class for Page Objects which work on - HTML downloaded using an HTTP client directly. """ response: HttpResponse -@attr.s(auto_attribs=True) -class ItemWebPage(WebPage, ItemPage): - """:class:`WebPage` that requires the :meth:`to_item` method to - be implemented. - """ - - pass +ItemWebPage = _create_deprecated_class("ItemWebPage", WebPage, warn_once=False)