From 8b4fc8c25d2a46f86432d496b413983da2bd87eb Mon Sep 17 00:00:00 2001 From: Willi Date: Thu, 8 Aug 2024 19:27:34 +0530 Subject: [PATCH 1/9] RangePaginator: Stops pagination in case of page without data items --- dlt/sources/helpers/rest_client/client.py | 2 +- dlt/sources/helpers/rest_client/paginators.py | 62 +++++++++++-------- .../helpers/rest_client/test_client.py | 2 +- .../helpers/rest_client/test_paginators.py | 29 +++++++++ 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 73ae064299..c05dabc30c 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -225,7 +225,7 @@ def raise_for_status(response: Response, *args: Any, **kwargs: Any) -> None: if paginator is None: paginator = self.detect_paginator(response, data) - paginator.update_state(response) + paginator.update_state(response, data) paginator.update_request(request) # yield data with context diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 4c8ce70bb2..078d4b0a87 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -1,6 +1,6 @@ import warnings from abc import ABC, abstractmethod -from typing import Optional, Dict, Any +from typing import Any, Dict, List, Optional from urllib.parse import urlparse, urljoin from requests import Response, Request @@ -39,7 +39,7 @@ def init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Updates the paginator's state based on the response from the API. This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -96,6 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", + stop_after_empty_page: bool = False, ): """ Args: @@ -127,6 +128,7 @@ def __init__( self.maximum_value = maximum_value self.total_path = jsonpath.compile_path(total_path) if total_path else None self.error_message_items = error_message_items + self.stop_after_empty_page = stop_after_empty_page def init_request(self, request: Request) -> None: if request.params is None: @@ -134,26 +136,32 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response) -> None: - total = None - if self.total_path: - response_json = response.json() - values = jsonpath.find_values(self.total_path, response_json) - total = values[0] if values else None - if total is None: - self._handle_missing_total(response_json) - - try: - total = int(total) - except ValueError: - self._handle_invalid_total(total) - - self.current_value += self.value_step - - if (total is not None and self.current_value >= total + self.base_index) or ( - self.maximum_value is not None and self.current_value >= self.maximum_value - ): + def update_state(self, response: Response, data: List[Any] = None) -> None: + if self._stop_after_this_page(data): self._has_next_page = False + else: + total = None + if self.total_path: + response_json = response.json() + values = jsonpath.find_values(self.total_path, response_json) + total = values[0] if values else None + if total is None: + self._handle_missing_total(response_json) + + try: + total = int(total) + except ValueError: + self._handle_invalid_total(total) + + self.current_value += self.value_step + + if (total is not None and self.current_value >= total + self.base_index) or ( + self.maximum_value is not None and self.current_value >= self.maximum_value + ): + self._has_next_page = False + + def _stop_after_this_page(self, data: List[Any]) -> bool: + return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( @@ -229,6 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, + stop_after_empty_page: bool = False, ): """ Args: @@ -260,6 +269,7 @@ def __init__( value_step=1, maximum_value=maximum_page, error_message_items="pages", + stop_after_empty_page=stop_after_empty_page, ) def __str__(self) -> str: @@ -330,6 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, + stop_after_empty_page: bool = False, ) -> None: """ Args: @@ -356,6 +367,7 @@ def __init__( total_path=total_path, value_step=limit, maximum_value=maximum_offset, + stop_after_empty_page=stop_after_empty_page, ) self.limit_param = limit_param self.limit = limit @@ -484,7 +496,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -539,7 +551,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -618,7 +630,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: List[Any] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index f5de1ec5da..af914bf89d 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -400,7 +400,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response): + def update_state(self, response, data): self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 8a3c136e09..9e4ccada72 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -1,3 +1,4 @@ +from typing import Any, List from unittest.mock import Mock import pytest @@ -312,6 +313,19 @@ def test_client_pagination(self, rest_client): assert_pagination(pages) + def test_stop_after_empty_page(self): + paginator = OffsetPaginator( + offset=0, + limit=50, + maximum_offset=100, + total_path=None, + stop_after_empty_page=True, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + paginator.update_state(response, no_data_found) # Page 1 + assert paginator.has_next_page is False + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -372,6 +386,21 @@ def test_maximum_page(self): assert paginator.current_value == 3 assert paginator.has_next_page is False + def test_stop_after_empty_page(self): + paginator = PageNumberPaginator( + base_page=1, + page=1, + maximum_page=5, + stop_after_empty_page=True, + total_path=None, + ) + response = Mock(Response, json=lambda: {"items": []}) + no_data_found: List[Any] = [] + assert paginator.has_next_page is True + paginator.update_state(response, no_data_found) + assert paginator.current_value == 1 + assert paginator.has_next_page is False + def test_client_pagination_one_based(self, rest_client): pages_iter = rest_client.paginate( "/posts", From 8d4ffa9e49f083866c507de68c57196332e0493c Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 9 Aug 2024 15:58:55 +0530 Subject: [PATCH 2/9] Defaults RangePaginator to stop after having received an empty page --- dlt/sources/helpers/rest_client/paginators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 078d4b0a87..a96413d84e 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -237,7 +237,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ): """ Args: @@ -340,7 +340,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = False, + stop_after_empty_page: bool = True, ) -> None: """ Args: From e9ecf88a741033034b7beff4fcb0c3e8a12d12e9 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 15:31:04 +0530 Subject: [PATCH 3/9] Documents how to stop paginator, updates docs on json_link --- dlt/sources/helpers/rest_client/paginators.py | 12 +++-- .../verified-sources/rest_api.md | 8 +-- .../docs/general-usage/http/rest-client.md | 51 +++++++++++++++---- 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index a96413d84e..083b95da18 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -96,7 +96,7 @@ def __init__( maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -117,6 +117,8 @@ def __init__( If not provided, `maximum_value` must be specified. error_message_items (str): The name of the items in the error message. Defaults to 'items'. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ super().__init__() if total_path is None and maximum_value is None: @@ -237,7 +239,7 @@ def __init__( page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ): """ Args: @@ -255,6 +257,8 @@ def __init__( will stop once this page is reached or exceeded, even if more data is available. This allows you to limit the maximum number of pages for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ if total_path is None and maximum_page is None: raise ValueError("Either `total_path` or `maximum_page` must be provided.") @@ -340,7 +344,7 @@ def __init__( limit_param: str = "limit", total_path: jsonpath.TJsonPath = "total", maximum_offset: Optional[int] = None, - stop_after_empty_page: bool = True, + stop_after_empty_page: Optional[bool] = True, ) -> None: """ Args: @@ -358,6 +362,8 @@ def __init__( pagination will stop once this offset is reached or exceeded, even if more data is available. This allows you to limit the maximum range for pagination. Defaults to None. + stop_after_empty_page (bool): Whether pagination should stop when + a page contains no result items. Defaults to `True`. """ if total_path is None and maximum_offset is None: raise ValueError("Either `total_path` or `maximum_offset` must be provided.") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4b72b3276e..b4d2d08daa 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -371,7 +371,7 @@ You can configure the pagination for the `posts` resource like this: { "path": "posts", "paginator": { - "type": "json_response", + "type": "json_link", "next_url_path": "pagination.next", } } @@ -380,7 +380,7 @@ You can configure the pagination for the `posts` resource like this: Alternatively, you can use the paginator instance directly: ```py -from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator +from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator # ... @@ -402,8 +402,8 @@ These are the available paginators: | ------------ | -------------- | ----------- | | `json_link` | [JSONLinkPaginator](../../general-usage/http/rest-client.md#jsonresponsepaginator) | The link to the next page is in the body (JSON) of the response.
*Parameters:*
  • `next_url_path` (str) - the JSONPath to the next page URL
| | `header_link` | [HeaderLinkPaginator](../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `link_header` (str) - the name of the header containing the links. Default is "next".
| -| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
| -| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
| +| `offset` | [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| +| `page_number` | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| | `cursor` | [JSONResponseCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
| | `single_page` | SinglePagePaginator | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | | `auto` | `None` | Explicitly specify that the source should automatically detect the pagination method. | diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index ddd66a233b..9451ca689d 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -183,8 +183,9 @@ need to specify the paginator when the API uses a different relation type. - `offset`: The initial offset for the first request. Defaults to `0`. - `offset_param`: The name of the query parameter used to specify the offset. Defaults to `"offset"`. - `limit_param`: The name of the query parameter used to specify the limit. Defaults to `"limit"`. -- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset`. +- `total_path`: A JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`. - `maximum_offset`: Optional maximum offset value. Limits pagination even without total count. +- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`. **Example:** @@ -198,7 +199,7 @@ E.g. `https://api.example.com/items?offset=0&limit=100`, `https://api.example.co } ``` -You can paginate through responses from this API using `OffsetPaginator`: +You can paginate through responses from this API using the `OffsetPaginator`: ```py client = RESTClient( @@ -210,20 +211,34 @@ client = RESTClient( ) ``` -In a different scenario where the API does not provide the total count, you can use `maximum_offset` to limit the pagination: +Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count. +Here, the `total_path` parameter is set to `None` because the API does not provide the total count. ```py client = RESTClient( base_url="https://api.example.com", paginator=OffsetPaginator( limit=100, - maximum_offset=1000, - total_path=None + total_path=None, ) ) ``` -Note, that in this case, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count. +Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_offset` is reached before the first empty page then pagination stops: + +```py +client = RESTClient( + base_url="https://api.example.com", + paginator=OffsetPaginator( + limit=10, + maximum_offset=20, # limits response to 20 records + total_path=None, + ) +) +``` + +You can disable automatic stoppage of pagination by setting `stop_after_stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_offset` to guarantee that the paginator terminates. + #### PageNumberPaginator @@ -234,8 +249,9 @@ Note, that in this case, the `total_path` parameter is set explicitly to `None` - `base_page`: The index of the initial page from the API perspective. Normally, it's 0-based or 1-based (e.g., 1, 2, 3, ...) indexing for the pages. Defaults to 0. - `page`: The page number for the first request. If not provided, the initial value will be set to `base_page`. - `page_param`: The query parameter name for the page number. Defaults to `"page"`. -- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`. +- `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`. - `maximum_page`: Optional maximum page number. Stops pagination once this page is reached. +- `stop_after_empty_page`: Whether pagination should stop when a page contains no result items. Defaults to `True`. **Example:** @@ -248,7 +264,7 @@ Assuming an API endpoint `https://api.example.com/items` paginates by page numbe } ``` -You can paginate through responses from this API using `PageNumberPaginator`: +You can paginate through responses from this API using the `PageNumberPaginator`: ```py client = RESTClient( @@ -259,19 +275,32 @@ client = RESTClient( ) ``` -If the API does not provide the total number of pages: +Pagination stops by default when a page contains no records. This is especially useful when the API does not provide the total item count. +Here, the `total_path` parameter is set to `None` because the API does not provide the total count. ```py client = RESTClient( base_url="https://api.example.com", paginator=PageNumberPaginator( - maximum_page=5, # Stops after fetching 5 pages total_path=None ) ) ``` -Note, that in the case above, the `total_path` parameter is set explicitly to `None` to indicate that the API does not provide the total count. +Additionally, you can limit pagination with `maximum_offset`, for example during development. If `maximum_page` is reached before the first empty page then pagination stops: + +```py +client = RESTClient( + base_url="https://api.example.com", + paginator=OffsetPaginator( + maximum_page=2, # limits response to 2 pages + total_path=None, + ) +) +``` + +You can disable automatic stoppage of pagination by setting `stop_after_stop_after_empty_page = False`. In this case, you must provide either `total_path` or `maximum_page` to guarantee that the paginator terminates. + #### JSONResponseCursorPaginator From 5e78dcc45efc3400811c00dc1ce1bb7564ba3f6c Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 17:05:28 +0530 Subject: [PATCH 4/9] Either total_path or maximum_value or stop_after_empty_pages is required --- dlt/sources/helpers/rest_client/paginators.py | 18 ++++-- .../helpers/rest_client/test_paginators.py | 56 +++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 083b95da18..888539a64d 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -121,8 +121,10 @@ def __init__( a page contains no result items. Defaults to `True`. """ super().__init__() - if total_path is None and maximum_value is None: - raise ValueError("Either `total_path` or `maximum_value` must be provided.") + if total_path is None and maximum_value is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided." + ) self.param_name = param_name self.current_value = initial_value self.value_step = value_step @@ -260,8 +262,10 @@ def __init__( stop_after_empty_page (bool): Whether pagination should stop when a page contains no result items. Defaults to `True`. """ - if total_path is None and maximum_page is None: - raise ValueError("Either `total_path` or `maximum_page` must be provided.") + if total_path is None and maximum_page is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_page` or `stop_after_empty_page` must be provided." + ) page = page if page is not None else base_page @@ -365,8 +369,10 @@ def __init__( stop_after_empty_page (bool): Whether pagination should stop when a page contains no result items. Defaults to `True`. """ - if total_path is None and maximum_offset is None: - raise ValueError("Either `total_path` or `maximum_offset` must be provided.") + if total_path is None and maximum_offset is None and not stop_after_empty_page: + raise ValueError( + "Either `total_path` or `maximum_offset` or `stop_after_empty_page` must be provided." + ) super().__init__( param_name=offset_param, initial_value=offset, diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 9e4ccada72..7357169101 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -326,6 +326,36 @@ def test_stop_after_empty_page(self): paginator.update_state(response, no_data_found) # Page 1 assert paginator.has_next_page is False + def test_guarantee_termination(self): + OffsetPaginator( + limit=10, + total_path=None, + ) + + OffsetPaginator( + limit=10, + total_path=None, + maximum_offset=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + OffsetPaginator( + limit=10, + total_path=None, + stop_after_empty_page=False, + maximum_offset=None, + ) + assert e.match("`total_path` or `maximum_offset` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: @@ -431,6 +461,32 @@ def test_client_pagination_zero_based(self, rest_client): assert_pagination(pages) + def test_guarantee_termination(self): + PageNumberPaginator( + total_path=None, + ) + + PageNumberPaginator( + total_path=None, + maximum_page=1, + stop_after_empty_page=False, + ) + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + + with pytest.raises(ValueError) as e: + PageNumberPaginator( + total_path=None, + stop_after_empty_page=False, + maximum_page=None, + ) + assert e.match("`total_path` or `maximum_page` or `stop_after_empty_page` must be provided") + @pytest.mark.usefixtures("mock_api_server") class TestJSONResponseCursorPaginator: From 44b82749365592de0b12879fb564d56c05120c72 Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 12 Aug 2024 17:55:20 +0530 Subject: [PATCH 5/9] updates docs to new type signature --- dlt/sources/helpers/rest_client/paginators.py | 14 +++++++------- .../website/docs/general-usage/http/rest-client.md | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 888539a64d..993cbf7f26 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -39,7 +39,7 @@ def init_request(self, request: Request) -> None: # noqa: B027, optional overri pass @abstractmethod - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Updates the paginator's state based on the response from the API. This method should extract necessary pagination details (like next page @@ -73,7 +73,7 @@ def __str__(self) -> str: class SinglePagePaginator(BasePaginator): """A paginator for single-page API responses.""" - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: self._has_next_page = False def update_request(self, request: Request) -> None: @@ -140,7 +140,7 @@ def init_request(self, request: Request) -> None: request.params[self.param_name] = self.current_value - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: if self._stop_after_this_page(data): self._has_next_page = False else: @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: List[Any] = None) -> None: ): self._has_next_page = False - def _stop_after_this_page(self, data: List[Any]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: @@ -508,7 +508,7 @@ def __init__(self, links_next_key: str = "next") -> None: super().__init__() self.links_next_key = links_next_key - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the 'Link' header in the response.""" self._next_reference = response.links.get(self.links_next_key, {}).get("url") @@ -563,7 +563,7 @@ def __init__( super().__init__() self.next_url_path = jsonpath.compile_path(next_url_path) - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the next page URL from the JSON response.""" values = jsonpath.find_values(self.next_url_path, response.json()) self._next_reference = values[0] if values else None @@ -642,7 +642,7 @@ def __init__( self.cursor_path = jsonpath.compile_path(cursor_path) self.cursor_param = cursor_param - def update_state(self, response: Response, data: List[Any] = None) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: """Extracts the cursor value from the JSON response.""" values = jsonpath.find_values(self.cursor_path, response.json()) self._next_reference = values[0] if values else None diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 9451ca689d..40c83f8c5b 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -339,7 +339,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee - `init_request(request: Request) -> None`: This method is called before making the first API call in the `RESTClient.paginate` method. You can use this method to set up the initial request query parameters, headers, etc. For example, you can set the initial page number or cursor value. -- `update_state(response: Response) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. +- `update_state(response: Response, data: Optional[List[Any]]) -> None`: This method updates the paginator's state based on the response of the API call. Typically, you extract pagination details (like the next page reference) from the response and store them in the paginator instance. - `update_request(request: Request) -> None`: Before making the next API call in `RESTClient.paginate` method, `update_request` is used to modify the request with the necessary parameters to fetch the next page (based on the current state of the paginator). For example, you can add query parameters to the request, or modify the URL. @@ -348,6 +348,7 @@ When working with APIs that use non-standard pagination schemes, or when you nee Suppose an API uses query parameters for pagination, incrementing an page parameter for each subsequent page, without providing direct links to next pages in its responses. E.g. `https://api.example.com/posts?page=1`, `https://api.example.com/posts?page=2`, etc. Here's how you could implement a paginator for this scheme: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.requests import Response, Request @@ -361,7 +362,7 @@ class QueryParamPaginator(BasePaginator): # This will set the initial page number (e.g. page=1) self.update_request(request) - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False @@ -399,6 +400,7 @@ def get_data(): Some APIs use POST requests for pagination, where the next page is fetched by sending a POST request with a cursor or other parameters in the request body. This is frequently used in "search" API endpoints or other endpoints with big payloads. Here's how you could implement a paginator for a case like this: ```py +from typing import Any, List, Optional from dlt.sources.helpers.rest_client.paginators import BasePaginator from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.requests import Response, Request @@ -408,7 +410,7 @@ class PostBodyPaginator(BasePaginator): super().__init__() self.cursor = None - def update_state(self, response: Response) -> None: + def update_state(self, response: Response, data: Optional[List[Any]] = None) -> None: # Assuming the API returns an empty list when no more data is available if not response.json(): self._has_next_page = False From 982b448d533303cef803b681a2150d3f3a531f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Thu, 15 Aug 2024 13:59:12 +0200 Subject: [PATCH 6/9] improves formatting in error message Co-authored-by: Anton Burnashev --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 993cbf7f26..f87eaea873 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -123,7 +123,7 @@ def __init__( super().__init__() if total_path is None and maximum_value is None and not stop_after_empty_page: raise ValueError( - "Either `total_path` or `maximum_value` or stop_after_empty_page must be provided." + "Either `total_path` or `maximum_value` or `stop_after_empty_page` must be provided." ) self.param_name = param_name self.current_value = initial_value From 49b45fb4592e53e2d0d7eaf09c1c4279927b7853 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:12:21 +0530 Subject: [PATCH 7/9] sets default argument to None --- dlt/sources/helpers/rest_client/paginators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index f87eaea873..91b364c395 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -164,7 +164,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> ): self._has_next_page = False - def _stop_after_this_page(self, data: Optional[List[Any]]) -> bool: + def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: return self.stop_after_empty_page and data == [] def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: From 1f26fe74587fb13046ce0646fe97426150283b65 Mon Sep 17 00:00:00 2001 From: Willi Date: Fri, 16 Aug 2024 17:13:39 +0530 Subject: [PATCH 8/9] passes non-empty list to paginator.update_state() and interprets both None and [] as "no data" --- dlt/sources/helpers/rest_client/paginators.py | 2 +- .../helpers/rest_client/test_paginators.py | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 91b364c395..632c93d0c7 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -165,7 +165,7 @@ def update_state(self, response: Response, data: Optional[List[Any]] = None) -> self._has_next_page = False def _stop_after_this_page(self, data: Optional[List[Any]]=None) -> bool: - return self.stop_after_empty_page and data == [] + return self.stop_after_empty_page and not data def _handle_missing_total(self, response_json: Dict[str, Any]) -> None: raise ValueError( diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7357169101..7ae6aa10dc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -242,7 +242,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +253,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +261,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +281,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +295,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response) # Offset 0 to 50 + paginator.update_state(response, data=[{}]) # Offset 0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response) # Offset 50 to 100 + paginator.update_state(response, data=[{}]) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +362,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +385,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response) + paginator.update_state(response, data=[{}]) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response) + paginator.update_state(response, data=[{}]) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response) + paginator.update_state(response, data=[{}]) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response) # Page 1 + paginator.update_state(response, data=[{}]) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response) # Page 2 + paginator.update_state(response, data=[{}]) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False From 83bab151a81ad3e3beaad8b4486741bd3e28d2fa Mon Sep 17 00:00:00 2001 From: Willi Date: Mon, 19 Aug 2024 17:55:44 +0530 Subject: [PATCH 9/9] refactors magic to telling name --- .../helpers/rest_client/test_paginators.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index 7ae6aa10dc..5c9f484bbc 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -17,6 +17,8 @@ from .conftest import assert_pagination +NON_EMPTY_PAGE = [{"some": "data"}] + @pytest.mark.usefixtures("mock_api_server") class TestHeaderLinkPaginator: @@ -242,7 +244,7 @@ class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) response = Mock(Response, json=lambda: {"total": 20}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -253,7 +255,7 @@ def test_update_state(self): def test_update_state_with_string_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "20"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 10 assert paginator.has_next_page is True @@ -261,13 +263,13 @@ def test_update_state_with_invalid_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {"total": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_init_request(self): paginator = OffsetPaginator(offset=123, limit=42) @@ -281,7 +283,7 @@ def test_init_request(self): response = Mock(Response, json=lambda: {"total": 200}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) # Test for the next request next_request = Mock(spec=Request) @@ -295,11 +297,11 @@ def test_init_request(self): def test_maximum_offset(self): paginator = OffsetPaginator(offset=0, limit=50, maximum_offset=100, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Offset 0 to 50 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 0 to 50 assert paginator.current_value == 50 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Offset 50 to 100 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Offset 50 to 100 assert paginator.current_value == 100 assert paginator.has_next_page is False @@ -362,22 +364,22 @@ class TestPageNumberPaginator: def test_update_state(self): paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 3 assert paginator.has_next_page is True # Test for reaching the end - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) assert paginator.current_value == 2 assert paginator.has_next_page is True @@ -385,34 +387,34 @@ def test_update_state_with_invalid_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_state_without_total_pages(self): paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) def test_update_request(self): paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) request.params = {} paginator.update_request(request) assert request.params["page"] == 2 - paginator.update_state(response, data=[{}]) + paginator.update_state(response, data=NON_EMPTY_PAGE) paginator.update_request(request) assert request.params["page"] == 3 def test_maximum_page(self): paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) - paginator.update_state(response, data=[{}]) # Page 1 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 1 assert paginator.current_value == 2 assert paginator.has_next_page is True - paginator.update_state(response, data=[{}]) # Page 2 + paginator.update_state(response, data=NON_EMPTY_PAGE) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False