From 8e665ac4b0c4542dd106225477c0f98bbf438873 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 22 Mar 2024 10:57:21 +0100 Subject: [PATCH] fixes a couple of typechecking errors in the docs --- docs/tools/lint_setup/template.py | 4 +++- .../docs/dlt-ecosystem/destinations/destination.md | 2 +- .../dlt-ecosystem/verified-sources/airtable.md | 2 +- .../docs/dlt-ecosystem/verified-sources/chess.md | 6 ++++-- .../dlt-ecosystem/verified-sources/filesystem.md | 3 ++- .../docs/dlt-ecosystem/verified-sources/jira.md | 3 ++- .../dlt-ecosystem/verified-sources/pipedrive.md | 7 ++++--- .../docs/general-usage/credentials/config_specs.md | 14 +++++++------- .../pseudonymizing_columns.md | 6 +++--- .../customising-pipelines/renaming_columns.md | 4 ++-- .../user_agent_device_data_enrichment.md | 8 ++++---- docs/website/docs/general-usage/resource.md | 4 +++- .../website/docs/general-usage/schema-contracts.md | 2 +- .../website/docs/general-usage/schema-evolution.md | 2 +- docs/website/docs/general-usage/schema.md | 4 ++-- .../deploy-gcp-cloud-function-as-webhook.md | 6 +++--- 16 files changed, 43 insertions(+), 34 deletions(-) diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py index dcfada63f6..6b207ceb0b 100644 --- a/docs/tools/lint_setup/template.py +++ b/docs/tools/lint_setup/template.py @@ -8,8 +8,8 @@ import os import pendulum -from pendulum import DateTime from datetime import datetime # noqa: I251 +from pendulum import DateTime import dlt from dlt.common import json @@ -26,6 +26,7 @@ BaseConfiguration, ) from dlt.common.storages.configuration import FileSystemCredentials +from dlt.pipeline.exceptions import PipelineStepFailed # some universal variables pipeline: dlt.Pipeline = None # type: ignore[assignment] @@ -33,3 +34,4 @@ ex: Exception = None # type: ignore[assignment] load_info: LoadInfo = None # type: ignore[assignment] url: str = None # type: ignore[assignment] +my_resource: DltResource = None # type: ignore[assignment] \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index 60753d90b5..c9a0bff022 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -54,7 +54,7 @@ The full signature of the destination decorator plus its function is the followi loader_file_format="jsonl", name="my_custom_destination", naming_convention="direct", - max_nesting_level=0, + max_table_nesting=0, skip_dlt_columns_and_tables=True ) def my_destination(items: TDataItems, table: TTableSchema) -> None: diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index bd04dbfcf3..f6b16ef944 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -215,7 +215,7 @@ verified source. 
base_id = base_id, table_names = table_names ) - load_info = pipeline.run(airtables, write_deposition = "replace") + load_info = pipeline.run(airtables, write_disposition = "replace") ``` > You have the option to use table names or table IDs in the code above, in place of "Table1" and diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 6ae457d1e6..62776b5c53 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -127,7 +127,9 @@ def players_profiles(players: List[str]) -> Iterator[TDataItem]: @dlt.defer def _get_profile(username: str) -> TDataItem: return get_path_with_retry(f"player/{username}") - ... + + for username in players: + yield _get_profile(username) ``` `players`: Is a list of player usernames for which you want to fetch profile data. @@ -161,7 +163,7 @@ def players_games( ) -> Iterator[Callable[[], List[TDataItem]]]: # gets a list of already checked(loaded) archives. checked_archives = dlt.current.resource_state().setdefault("archives", []) - ... + yield {} # return your retrieved data here ``` `players`: Is a list of player usernames for which you want to fetch games. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index bf30da8882..9e0d46c563 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -453,7 +453,8 @@ verified source. ) # pretty print the information on data that was loaded print(load_info) - print(listing)(pipeline.last_trace.last_normalize_info) + print(listing) + print(pipeline.last_trace.last_normalize_info) ``` 1. Cleanup after loading: diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index 38dacb0541..26c4462c34 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -173,7 +173,8 @@ The resource function searches issues using JQL queries and then loads them to t ```py @dlt.resource(write_disposition="replace") def issues(jql_queries: List[str]) -> Iterable[TDataItem]: - api_path = "rest/api/3/search" + api_path = "rest/api/3/search" + return {} # return the retrieved values here ``` `jql_queries`: Accepts a list of JQL queries. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 3dc815d53b..1e570bfe7a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -213,10 +213,11 @@ create and store a mapping of custom fields for different entities in the source ```py @dlt.resource(selected=False) def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]: - def _get_pages_for_rename( - entity: str, fields_entity: str, pipedrive_api_key: str - ) -> Dict[str, Any]: + def _get_pages_for_rename( + entity: str, fields_entity: str, pipedrive_api_key: str + ) -> Dict[str, Any]: ... 
+ yield _get_pages_for_rename("", "", "") ``` It processes each entity in ENTITY_MAPPINGS, updating the custom fields mapping if a related fields diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index e93e1c466a..e66939fc39 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -94,7 +94,7 @@ credentials = ConnectionStringCredentials() credentials.drivername = "postgresql" credentials.database = "my_database" credentials.username = "my_user" -credentials.password = "my_password" +credentials.password = "my_password" # type: ignore credentials.host = "localhost" credentials.port = 5432 @@ -118,12 +118,12 @@ It also allows for the addition of scopes and provides methods for client authen Usage: ```py -credentials = OAuth2Credentials( - client_id="CLIENT_ID", - client_secret="CLIENT_SECRET", - refresh_token="REFRESH_TOKEN", - scopes=["scope1", "scope2"] -) +credentials = OAuth2Credentials({ + "client_id": "CLIENT_ID", + "client_secret": "CLIENT_SECRET", + "refresh_token": "REFRESH_TOKEN", + "scopes": ["scope1", "scope2"] +}) # Authorize the client credentials.auth() diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md index ba0b13636b..eff6f795ac 100644 --- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md @@ -51,11 +51,11 @@ for row in dummy_source().dummy_data.add_map(pseudonymize_name): # 1. Create an instance of the source so you can edit it. data_source = dummy_source() # 2. Modify this source instance's resource -data_source = data_source.dummy_data.add_map(pseudonymize_name) +data_resource = data_source.dummy_data.add_map(pseudonymize_name) # 3. Inspect your result -for row in data_source: +for row in data_resource: print(row) pipeline = dlt.pipeline(pipeline_name='example', destination='bigquery', dataset_name='normalized_data') -load_info = pipeline.run(data_source) +load_info = pipeline.run(data_resource) ``` diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md index 04e4d33b13..4cbb4d7b32 100644 --- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md @@ -44,10 +44,10 @@ def replace_umlauts_in_dict_keys(d): data_source = dummy_source() # 2. Modify this source instance's resource -data_source = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) +data_resource = data_source.dummy_data().add_map(replace_umlauts_in_dict_keys) # 3. 
Inspect your result
-for row in data_source:
+for row in data_resource:
     print(row)
 
 # {'Objekt_0': {'Groesse': 0, 'Aequivalenzpruefung': True}}
diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
index 6b07845689..3aadb2f982 100644
--- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
+++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
@@ -127,7 +127,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
 1. Create `fetch_average_price()` function as follows:
 
    ```py
-   import datetime
+   from datetime import datetime, timedelta
    import requests
 
    # Uncomment transformer function if it is to be used as a transformer,
@@ -160,7 +160,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
        device_info = dlt.current.resource_state().setdefault("devices", {})
 
        # Current timestamp for checking the last update
-       current_timestamp = datetime.datetime.now()
+       current_timestamp = datetime.now()
 
        # Print the current device information
        # print(device_info) # if you need to check state
@@ -172,10 +172,10 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the
            # Calculate the time since the last update
            last_updated = (
                current_timestamp -
-               device_data.get('timestamp', datetime.datetime.min)
+               device_data.get('timestamp', datetime.min)
            )
            # Check if the device is not in state or data is older than 180 days
-           if device not in device_info or last_updated > datetime.timedelta(days=180):
+           if device not in device_info or last_updated > timedelta(days=180):
                try:
                    # Make an API request to fetch device prices
                    response = requests.get("https://serpapi.com/search", params={
diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md
index e2e95d937f..67609b8989 100644
--- a/docs/website/docs/general-usage/resource.md
+++ b/docs/website/docs/general-usage/resource.md
@@ -63,6 +63,7 @@ accepts following arguments:
     ...
 
 # the `table_schema` method gets table schema generated by a resource
+# TODO: needs fixing
 print(get_users().table_schema())
 ```
@@ -154,6 +155,7 @@ def repo_events() -> Iterator[TDataItems]:
 
 # the `table_schema` method gets table schema generated by a resource and takes optional
 # data item to evaluate dynamic hints
+# TODO: needs fixing
 print(repo_events().table_schema({"type": "WatchEvent", id:...}))
 ```
@@ -283,7 +285,7 @@ def get_orders():
         yield o
 
 # users and orders will be iterated in parallel in two separate threads
-pipeline.run(get_users(), get_orders())
+pipeline.run([get_users(), get_orders()])
 ```
 
 Async generators are automatically extracted concurrently with other resources:
diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md
index 1b5e67357a..c79d240520 100644
--- a/docs/website/docs/general-usage/schema-contracts.md
+++ b/docs/website/docs/general-usage/schema-contracts.md
@@ -124,7 +124,7 @@ As with any other exception coming from pipeline run, it will be re-raised via `
 ```py
 try:
     pipeline.run()
-except Exception as pip_ex:
+except PipelineStepFailed as pip_ex:
     if pip_ex.step == "normalize":
         if isinstance(pip_ex.__context__.__context__, DataValidationError):
             ...
diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md index 377df0e47f..dd3aa0bf8a 100644 --- a/docs/website/docs/general-usage/schema-evolution.md +++ b/docs/website/docs/general-usage/schema-evolution.md @@ -163,7 +163,7 @@ data = [{ pipeline = dlt.pipeline("organizations_pipeline", destination="duckdb") # Adding not null constraint -pipeline.run(data, table_name="org", columns={"room": {"data_type": "integer", "nullable": False}}) +pipeline.run(data, table_name="org", columns={"room": {"data_type": "bigint", "nullable": False}}) ``` During pipeline execution a data validation error indicates that a removed column is being passed as null. diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 164814010d..9b0d8ec622 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -317,7 +317,7 @@ def textual(nesting_level: int): schema.remove_type_detection("iso_timestamp") # convert UNIX timestamp (float, withing a year from NOW) into timestamp schema.add_type_detection("timestamp") - schema.compile_settings() + schema._compile_settings() - return dlt.resource(...) + return dlt.resource([]) ``` diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md index fc32aa2c30..29a0ae86f8 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md @@ -17,10 +17,10 @@ You can setup GCP cloud function webhook using `dlt` as follows: ```py import dlt - import json import time from google.cloud import bigquery - + from dlt.common import json + def your_webhook(request): # Extract relevant data from the request payload data = request.get_json() @@ -40,7 +40,7 @@ You can setup GCP cloud function webhook using `dlt` as follows: 7. Set the function name as "your_webhook" in the Entry point field. 8. In the requirements.txt file, specify the necessary packages: - ```py + ```text # Function dependencies, for example: # package>=version dlt