diff --git a/Makefile b/Makefile index 5aa2b2786c..cd355a48ee 100644 --- a/Makefile +++ b/Makefile @@ -60,8 +60,8 @@ format: # poetry run isort ./ test-and-lint-snippets: - poetry run mypy --config-file mypy.ini docs/website docs/examples - poetry run flake8 --max-line-length=200 docs/website docs/examples + poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools + poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools cd docs/website/docs && poetry run pytest --ignore=node_modules lint-security: diff --git a/docs/snippets/performance_chunking.py b/docs/snippets/performance_chunking.py deleted file mode 100644 index 83c09896e4..0000000000 --- a/docs/snippets/performance_chunking.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -description: Chunking example -tags: performance -""" - -import dlt - -if __name__ == "__main__": - - def get_rows(limit): - yield from map(lambda n: {"row": n}, range(limit)) - - @dlt.resource - def database_cursor(): - # here we yield each row returned from database separately - yield from get_rows(10000) - - assert len(list(database_cursor())) == 10000 \ No newline at end of file diff --git a/docs/snippets/performance_chunking_2.py b/docs/snippets/performance_chunking_2.py deleted file mode 100644 index 1d80d8c36a..0000000000 --- a/docs/snippets/performance_chunking_2.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -description: Chunking example -tags: performance -""" - -import dlt - -if __name__ == "__main__": - - from itertools import islice - - def get_rows(limit): - yield from map(lambda n: {"row": n}, range(limit)) - - @dlt.resource - def database_cursor_chunked(): - # here we yield chunks of size 1000 - rows = get_rows(10000) - while item_slice := list(islice(rows, 1000)): - print(f"got chunk of length {len(item_slice)}") - yield item_slice - - assert len(list(database_cursor_chunked())) == 10000 diff --git a/docs/snippets/performance_parallel_awaitables.py b/docs/snippets/performance_parallel_awaitables.py deleted file mode 100644 index 4afbd334b7..0000000000 --- a/docs/snippets/performance_parallel_awaitables.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -description: Extracting awaitables in parallel -tags: performance, extract, parallelization -""" - -import dlt - -if __name__ == "__main__": - - import asyncio - from threading import current_thread - - @dlt.resource - async def a_list_items(start, limit): - # simulate a slow REST API where you wait 0.3 sec for each item - index = start - while index < start + limit: - await asyncio.sleep(0.3) - yield index - index += 1 - - @dlt.transformer - async def a_get_details(item_id): - # simulate a slow REST API where you wait 0.3 sec for each item - await asyncio.sleep(0.3) - print(f"item_id {item_id} in thread {current_thread().name}") - # just return the results, if you yield, generator will be evaluated in main thread - return {"row": item_id} - - result = list(a_list_items(0, 10) | a_get_details) - print(result) - - assert len(result) == 10 \ No newline at end of file diff --git a/docs/snippets/performance_parallel_extract_callables.py b/docs/snippets/performance_parallel_extract_callables.py deleted file mode 100644 index 883f5cda28..0000000000 --- a/docs/snippets/performance_parallel_extract_callables.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -description: Extracting callables in parallel -tags: performance, extract, parallelization -""" - -import dlt - -if __name__ == "__main__": - - import time - from threading import current_thread - - @dlt.resource(parallelized=True) - def 
list_users(n_users): - for i in range(1, 1 + n_users): - # Simulate network delay of a rest API call fetching a page of items - if i % 10 == 0: - time.sleep(0.1) - yield i - - @dlt.transformer(parallelized=True) - def get_user_details(user_id): - # Transformer that fetches details for users in a page - time.sleep(0.1) # Simulate latency of a rest API call - print(f"user_id {user_id} in thread {current_thread().name}") - return {"entity": "user", "id": user_id} - - @dlt.resource(parallelized=True) - def list_products(n_products): - for i in range(1, 1 + n_products): - if i % 10 == 0: - time.sleep(0.1) - yield i - - @dlt.transformer(parallelized=True) - def get_product_details(product_id): - time.sleep(0.1) - print(f"product_id {product_id} in thread {current_thread().name}") - return {"entity": "product", "id": product_id} - - @dlt.source - def api_data(): - return [ - list_users(24) | get_user_details, - list_products(32) | get_product_details, - ] - - # evaluate the pipeline and print all the items - # sources are iterators and they are evaluated in the same way in the pipeline.run - result = list(api_data()) - - assert len(result) == 56 \ No newline at end of file diff --git a/docs/snippets/__init__.py b/docs/tools/__init__.py similarity index 100% rename from docs/snippets/__init__.py rename to docs/tools/__init__.py diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py new file mode 100644 index 0000000000..145063d806 --- /dev/null +++ b/docs/tools/check_embedded_snippets.py @@ -0,0 +1,112 @@ +""" +Walks through all markdown files, finds all code snippets, and checks wether they are parseable. +""" +import os +from typing import TypedDict, List +import ast +from textwrap import dedent +import tomlkit +import json +import yaml + +DOCS_DIR = "../website/docs" + +SNIPPET_MARKER = "```" + + +class Snippet(TypedDict): + language: str + code: str + file: str + line: int + + +if __name__ == "__main__": + # discover all markdown files to be processed + markdown_files = [] + for path, directories, files in os.walk(DOCS_DIR): + if "api_reference" in path: + continue + for file in files: + if file.endswith(".md"): + markdown_files.append(os.path.join(path, file)) + + # extract snippets from markdown files + snippets: List[Snippet] = [] + for file in markdown_files: + print(f"Processing file {file}") + + # go line by line and find all code blocks + with open(file, "r") as f: + current_snippet: Snippet = None + lint_count = 0 + for line in f.readlines(): + lint_count += 1 + if line.strip().startswith(SNIPPET_MARKER): + if current_snippet: + # process snippet + snippets.append(current_snippet) + current_snippet["code"] = dedent(current_snippet["code"]) + current_snippet = None + else: + # start new snippet + current_snippet = { + "language": line.strip().split(SNIPPET_MARKER)[1], + "code": "", + "file": file, + "line": lint_count, + } + elif current_snippet: + current_snippet["code"] += line + assert not current_snippet + + # parse python snippets for now + count = {} + total = 0 + failed_count = {} + for snippet in snippets: + language = snippet["language"] or "unknown" + code = snippet["code"] + total += 1 + count[language] = count.get(language, 0) + 1 + print( + "Processing snippet no", + total, + "at line", + snippet["line"], + "in file", + snippet["file"], + "with language", + language, + ) + + # parse snippet by type + try: + if language in ["python", "py"]: + ast.parse(code) + elif language in ["toml"]: + tomlkit.loads(code) + elif language in ["json"]: + 
json.loads(snippet["code"]) + elif language in ["yaml"]: + yaml.safe_load(code) + # ignore text and shell scripts + elif language in ["text", "sh", "shell", "bash", "bat"]: + pass + elif language in ["sql"]: + pass + elif language in ["unknown"]: + pass + else: + assert False, ( + "Unknown language. Please add a parser or exception for this language, or choose" + " the correct language for the snippet." + ) + except Exception as e: + print(f"Failed to parse snippet: {e}") + failed_count[language] = failed_count.get(language, 0) + 1 + + assert len(snippets) > 100, "Found too few snippets. Something went wrong." # sanity check + + print(count) + print(failed_count) diff --git a/docs/website/check_embedded_snippets.py b/docs/website/check_embedded_snippets.py deleted file mode 100644 index 79f67d490c..0000000000 --- a/docs/website/check_embedded_snippets.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Walks through all markdown files, finds all code snippets, and checks wether they are parseable. -""" -import os -from typing import TypedDict, List -import ast -from textwrap import dedent -import tomlkit - -DOCS_DIR = "./docs" - -SNIPPET_MARKER = "```" - -class Snippet(TypedDict): - language: str - code: str - file: str - line: int - -if __name__ == "__main__": - - markdown_files = [] - for path, directories, files in os.walk(DOCS_DIR): - if "api_reference" in path: - continue - for file in files: - if file.endswith(".md"): - markdown_files.append(os.path.join(path, file)) - - # extract snippets from markdown files - snippets: List[Snippet] = [] - for file in markdown_files: - print(f"Processing file {file}") - with open(file, "r") as f: - current_snippet: Snippet = None - lint_count = 0 - for line in f.readlines(): - lint_count += 1 - if line.strip().startswith(SNIPPET_MARKER): - if current_snippet: - # process snippet - snippets.append(current_snippet) - current_snippet = None - else: - # start new snippet - current_snippet = { - "language": line.strip().split(SNIPPET_MARKER)[1], - "code": "", - "file": file, - "line": lint_count - } - elif current_snippet: - current_snippet["code"] += line - assert not current_snippet - - # parse python snippets for now - count = 0 - failed = 0 - for snippet in snippets: - print("Processing snippet no", count, " at line", snippet["line"], "in file", snippet["file"]) - if snippet["language"] in ["python", "py"]: - count += 1 - try: - ast.parse(dedent(snippet["code"])) - except Exception as e: - print(f"Failed to parse snippet: {e}") - failed += 1 - if snippet["language"] in ["toml"]: - ... 
- # tomlkit.loads(snippet["code"]) - - - print(count) - - - print(failed) - diff --git a/docs/website/check_snippets.py b/docs/website/check_snippets.py deleted file mode 100644 index fa233b1e5a..0000000000 --- a/docs/website/check_snippets.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Load all snippets in snippet folder, check wether they parse with ast and run them -""" -import typing as t -import os -import ast -import sys - -SNIPPET_DIR = "../snippets" - -def get_snippet_list() -> t.List[str]: - """Get list of available snippets in the snippet folder.""" - return [s.replace(".py", "") for s in os.listdir(SNIPPET_DIR) if s.endswith(".py") and s != "__init__.py"] - -def get_snippet(snippet_name: str) -> str: - """Get the content of a snippet.""" - with open(os.path.join(SNIPPET_DIR, snippet_name + ".py"), "r") as f: - return f.read() - -def parse_snippet(snippet: str) -> bool: - """Parse a snippet with ast.""" - try: - ast.parse(snippet) - print("\033[92m -> Parse ok \033[0m") - return True - except: - print("\033[91m -> Failed to parse snippet, skipping run \033[0m") - return False - -def run_snippet(snippet: str) -> bool: - """Run a snippet.""" - try: - with open(os.devnull, "w") as devnull: - old_stdout = sys.stdout - sys.stdout = devnull - exec(snippet, {"__name__": "__main__"}) - sys.stdout = old_stdout - print("\033[92m -> Run ok \033[0m") - return True - except: - sys.stdout = old_stdout - print("\033[91m -> Failed to run snippet\033[0m") - return False - - -if __name__ == "__main__": - - print("Checking snippets") - snippet_list = get_snippet_list() - failed_parsing = [] - failed_running = [] - print(f"Found {len(snippet_list)} snippets") - - # parse and run all snippets - for s in snippet_list: - print(f"Checking snippet {s}") - - snippet = get_snippet(s) - if parse_snippet(snippet) is False: - failed_parsing.append(snippet) - continue - - # snippet needs to be run in main function for some reason - if run_snippet(snippet) is False: - failed_running.append(snippet) - - print() - if failed_parsing or failed_running: - print(f"\033[91m{len(failed_parsing)} snippets failed to parse, {len(failed_running)} snippets failed to run") - exit(1) - else: - print("\033[92mAll snippets ok") - exit(0) - - diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index 90a175777f..1522761609 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -36,7 +36,7 @@ scalable extraction via micro-batching and parallelism. ## The simplest pipeline: 1 liner to load data with schema evolution -```python +```py import dlt dlt.pipeline(destination='duckdb', dataset_name='mydata').run([{'id': 1, 'name': 'John'}], table_name="users") @@ -52,7 +52,7 @@ named "three". With `dlt`, you can create a pipeline and run it with just a few 1. [Create a pipeline](walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). 1. Give this pipeline data and [run it](walkthroughs/run-a-pipeline.md). -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="country_data") @@ -84,7 +84,7 @@ In this example, we also run a dbt package and then load the outcomes of the loa This will enable us to log when schema changes occurred and match them to the loaded data for lineage, granting us both column and row level lineage. We also alert the schema change to a Slack channel where hopefully the producer and consumer are subscribed. -```python +```py import dlt # have data? 
dlt likes data @@ -105,7 +105,7 @@ load_info = pipeline.run( ) ``` Add dbt runner, optionally with venv: -```python +```py venv = dlt.dbt.get_venv(pipeline) dbt = dlt.dbt.package( pipeline, @@ -122,7 +122,7 @@ pipeline.run([models_info], table_name="transform_status", write_disposition='ap ``` Let's alert any schema changes: -```python +```py from dlt.common.runtime.slack import send_slack_message slack_hook = "https://hooks.slack.com/services/xxx/xxx/xxx" @@ -211,7 +211,7 @@ that only one instance of each event is present. You can use the merge write disposition as follows: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -260,7 +260,7 @@ into DAGs, providing cross-database compatibility and various features such as t backfills, testing, and troubleshooting. You can use the dbt runner in `dlt` to seamlessly integrate dbt into your pipeline. Here's an example of running a dbt package after loading the data: -```python +```py import dlt from pipedrive import pipedrive_source @@ -275,7 +275,7 @@ load_info = pipeline.run(pipedrive_source()) print(load_info) ``` Now transform from loaded data to dbt dataset: -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', @@ -306,7 +306,7 @@ transformations using SQL statements. You can execute SQL statements that change or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: @@ -324,7 +324,7 @@ You can fetch query results as Pandas data frames and perform transformations us functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index b376337e77..644f1bff5c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -18,7 +18,7 @@ pip install dlt[athena] ### 1. Initialize the dlt project Let's start by initializing a new `dlt` project as follows: - ```bash + ```shell dlt init chess athena ``` > šŸ’” This command will initialize your pipeline with chess as the source and AWS Athena as the destination using the filesystem staging destination. @@ -35,7 +35,7 @@ or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathe :::caution You may also install the dependencies independently. Try -```sh +```shell pip install dlt pip install s3fs pip install pyarrow @@ -122,7 +122,7 @@ If you decide to change the [filename layout](./filesystem#data-loading) from th ### Iceberg data tables You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the iceberg table format, supply the table_format argument like this: -```python +```py @dlt.resource(table_format="iceberg") def data() -> Iterable[TDataItem]: ... 
diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
index e852bfa9e5..a05820b1cd 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -166,7 +166,7 @@ Alternatively to parquet files, you can specify jsonl as the staging file format
### BigQuery/GCS Staging Example
-```python
+```py
# Create a dlt pipeline that will load
# chess player data to the BigQuery destination
# via a GCS bucket.
@@ -217,7 +217,7 @@ The adapter updates the DltResource with metadata about the destination column a
Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level:
-```python
+```py
from datetime import date, timedelta
import dlt
@@ -258,7 +258,7 @@ Some things to note with the adapter's behavior:
Note that `bigquery_adapter` updates the resource *inplace*, but returns the resource for convenience, i.e. both the following are valid:
-```python
+```py
bigquery_adapter(my_resource, partition="partition_column_name")
my_resource = bigquery_adapter(my_resource, partition="partition_column_name")
```
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index d00c603c14..013cd38d27 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -148,7 +148,7 @@ Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on co
Example to set up Databricks with S3 as a staging destination:
-```python
+```py
import dlt
# Create a dlt pipeline that will load
# chess player data to the Databricks destination
# via staging on S3
@@ -168,7 +168,7 @@ Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure
Example to set up Databricks with Azure as a staging destination:
-```python
+```py
# Create a dlt pipeline that will load
# chess player data to the Databricks destination
# via staging on Azure Blob Storage
diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md
new file mode 100644
index 0000000000..76c84b0435
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md
@@ -0,0 +1,151 @@
+---
+title: šŸ§Ŗ Destination Decorator / Reverse ETL
+description: Sink function `dlt` destination for reverse ETL
+keywords: [reverse etl, sink, function, decorator, destination]
+---
+
+# Destination decorator / Reverse ETL
+
+:::caution
+The destination decorator is currently in alpha. While we think the interface is stable at this point and everything is working pretty well, there may still be
+small changes or bug fixes in the coming weeks.
+:::
+
+The dlt destination decorator allows you to receive all data passing through your pipeline in a simple function. This can be extremely useful for
+reverse ETL, where you are pushing data back to an API. You can also use it to send data to a queue, or as a simple destination for a database that is not
+yet supported by dlt; be aware that you will have to handle your own migrations manually in this case. It also allows you to simply get a path
+to the files of your normalized data, so if you need direct access to parquet or jsonl files to copy them somewhere or push them to a database,
+you can do this here too.
+
+## Install dlt for Sink / reverse ETL
+**To install `dlt` without additional dependencies**
+```shell
+pip install dlt
+```
+
+## Setup Guide
+### 1. Initialize the dlt project
+
+Let's start by initializing a new dlt project as follows:
+
+```shell
+dlt init chess sink
+```
+> šŸ’” This command will initialize your pipeline with chess as the source and sink as the destination.
+
+The above command generates several files and directories, including `.dlt/secrets.toml`.
+
+### 2. Set up a destination function for your pipeline
+The sink destination differs from other destinations in that you do not need to provide connection credentials; rather, you provide a function which
+gets called for all items loaded during a pipeline run or load operation. For the chess example, you can add the following lines at the top of the file.
+With the `@dlt.destination` decorator you can convert any function that accepts items and a table schema into a dlt destination.
+
+A very simple dlt pipeline that pushes a list of items into a sink function might look like this:
+
+```py
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=10)
+def my_sink(items: TDataItems, table: TTableSchema) -> None:
+    print(table["name"])
+    print(items)
+
+pipe = dlt.pipeline("sink_pipeline", destination=my_sink)
+pipe.run([1, 2, 3], table_name="items")
+```
+
+To enable this destination decorator in your chess example, replace the line `destination='sink'` with `destination=sink` (without the quotes) to directly reference
+the sink from your pipeline constructor. Now you can run your pipeline and see the output of all the items coming from the chess pipeline in your console.
+
+:::tip
+1. You can also remove the typing information (TDataItems and TTableSchema) from this example; the type hints are generally useful for knowing the shape of the incoming objects, though.
+2. There are a few other ways to declare sink functions for your pipeline, described below.
+:::
+
+## Destination decorator function and signature
+
+The full signature of the destination decorator plus its function is the following:
+
+```py
+@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink", naming="direct")
+def sink(items: TDataItems, table: TTableSchema) -> None:
+    ...
+```
+
+#### Decorator
+* The `batch_size` parameter on the destination decorator defines how many items per function call are batched together and sent as an array. If you set a batch size of `0`,
+instead of receiving actual data items, you will receive one call per load job with the path of the file as the `items` argument. You can then open and process that file
+in any way you like.
+* The `loader_file_format` parameter on the destination decorator defines in which format files are stored in the load package before being sent to the sink function.
+This can be `jsonl` or `parquet`.
+* The `name` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator.
+* The `naming_convention` parameter on the destination decorator controls how table and column names are normalized. The default is `direct`, which keeps all names as they are.
+
+#### Sink function
+* The `items` parameter on the sink function contains the items being sent into the sink function.
+* The `table` parameter contains the schema table the current call belongs to, including all table hints and columns. For example, the table name can be accessed with `table["name"]`.
+Keep in mind that dlt also creates special tables prefixed with `_dlt`, which you may want to ignore when processing data.
+* You can also add config values and secrets to the function arguments; see below!
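+
+For illustration, here is a minimal sketch (not part of the chess example) of a sink function that ignores dlt's internal `_dlt` tables and simply reports what it receives; the `print` stands in for whatever you would do with each batch:
+
+```py
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=10)
+def filtering_sink(items: TDataItems, table: TTableSchema) -> None:
+    # skip the internal tables that dlt creates alongside your data
+    if table["name"].startswith("_dlt"):
+        return
+    # replace this with your own API call, queue producer, etc.
+    print(f"received {len(items)} items for table {table['name']}")
+```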
+
+## Adding config variables and secrets
+The destination decorator supports config values and secrets. If you, for example, plan to connect to a service that requires an API secret or a login, you can do the following:
+
+```py
+@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink")
+def my_sink(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None:
+    ...
+```
+
+Then add the API key to your `.dlt/secrets.toml`:
+
+```toml
+[destination.my_sink]
+api_key="some secrets"
+```
+
+## Sink destination state
+The sink destination keeps a local record of how many data items were processed. If you, for example, use the sink destination to push data items to a remote API, and this
+API becomes unavailable during the load, resulting in a failed pipeline run, you can repeat the run of your pipeline at a later stage and the sink destination will continue
+where it left off. For this reason it makes sense to choose a batch size that you can process in one transaction (say, one API request or one database transaction), so that if this
+request or transaction fails repeatedly, you can retry it on the next run without pushing duplicate data to your remote location.
+
+## Concurrency
+Calls to the sink function will by default be executed on multiple threads, so you need to make sure you are not using any non-thread-safe nonlocal or global variables from outside
+your sink function. If, for whatever reason, you need all calls to be executed from the same thread, you can set the `workers` config variable of the load step to 1. For performance
+reasons, we recommend keeping the multithreaded approach and making sure that you, for example, use thread-safe connection pools to a remote database or queue.
+
+## Referencing the sink function
+There are multiple ways to reference the sink function you want to use:
+
+```py
+# file my_pipeline.py
+
+@dlt.destination(batch_size=10)
+def local_sink_func(items: TDataItems, table: TTableSchema) -> None:
+    ...
+
+# reference the function directly
+p = dlt.pipeline(name="my_pipe", destination=local_sink_func)
+
+# fully qualified string to the function location (can be used from config.toml or env vars)
+p = dlt.pipeline(name="my_pipe", destination="sink", credentials="my_pipeline.local_sink_func")
+
+# via destination reference
+p = dlt.pipeline(name="my_pipe", destination=Destination.from_reference("sink", credentials=local_sink_func, environment="staging"))
+```
+
+## Write disposition
+
+The sink destination forwards all normalized data items encountered during a pipeline run to the sink function, so there is no notion of write dispositions for the sink.
+
+## Staging support
+
+At this time, the sink destination does not support staging files in remote locations before the sink function is called. If you need this feature, please let us know.
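+
+## Putting it together
+
+As a final illustration, the sketch below combines the pieces above: a sink function that receives an API key from the `[destination.my_sink]` section shown earlier (assumed to be present in `.dlt/secrets.toml`), wired into a small pipeline run. The `print` is a placeholder for your own API client call.
+
+```py
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink")
+def my_sink(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None:
+    # api_key is injected from the [destination.my_sink] section of secrets.toml
+    # replace the print with a call to your service, passing api_key, table["name"] and items
+    print(f"pushing {len(items)} rows to table {table['name']}")
+
+pipe = dlt.pipeline("sink_pipeline", destination=my_sink)
+pipe.run([{"id": 1}, {"id": 2}], table_name="items")
+```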
+ diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 9452a80c50..eba396ceaf 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -47,7 +47,7 @@ naming="duck_case" ``` or via the env variable `SCHEMA__NAMING` or directly in the code: -```python +```py dlt.config["schema.naming"] = "duck_case" ``` :::caution @@ -73,7 +73,7 @@ You can configure the following file formats to load data to duckdb: By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. The `duckdb` credentials do not require any secret values. You are free to pass the configuration explicitly via the `credentials` parameter to `dlt.pipeline` or `pipeline.run` methods. For example: -```python +```py # will load data to files/data.db database file p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials="files/data.db") @@ -82,7 +82,7 @@ p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='ches ``` The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use. `:memory:` databases are supported. -```python +```py import duckdb db = duckdb.connect() p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials=db) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index ba323b3d7f..d532915180 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -17,7 +17,7 @@ This installs `s3fs` and `botocore` packages. You may also install the dependencies independently. Try: -```sh +```shell pip install dlt pip install s3fs ``` @@ -29,7 +29,7 @@ so pip does not fail on backtracking. ### 1. Initialise the dlt project Let's start by initialising a new dlt project as follows: - ```bash + ```shell dlt init chess filesystem ``` > šŸ’” This command will initialise your pipeline with chess as the source and the AWS S3 filesystem as the destination. diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index 1288b9caac..01209a1174 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -83,7 +83,7 @@ If your connection is of poor quality and you get a timeout when executing a DML ### I see some exception with home_dir missing when opening `md:` connection. Some internal component (HTTPS) requires the **HOME** env variable to be present. Export such a variable to the command line. Here is what we do in our tests: -```python +```py os.environ["HOME"] = "/tmp" ``` before opening the connection. 
diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 5ed4b69707..8780fdbd56 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -62,7 +62,7 @@ destination.mssql.credentials="mssql://loader:@loader.database.windows ``` To pass credentials directly, you can use the `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. -```python +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 10b935c083..b198e854b2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -66,7 +66,7 @@ destination.postgres.credentials="postgresql://loader:@localhost/dlt_d ``` To pass credentials directly, you can use the `credentials` argument passed to the `dlt.pipeline` or `pipeline.run` methods. -```python +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="postgresql://loader:@localhost/dlt_data") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index ff37252852..3cb9a2438f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -13,7 +13,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: -```bash +```shell pip install dlt[qdrant] ``` @@ -31,7 +31,7 @@ If no configuration options are provided, the default fallback will be `http://l 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import qdrant_adapter @@ -53,7 +53,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", @@ -63,7 +63,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -74,7 +74,7 @@ info = pipeline.run( 6. Check the results: -```python +```py print(info) ``` @@ -86,7 +86,7 @@ To use vector search after the data has been loaded, you must specify which fiel The `qdrant_adapter` is a helper function that configures the resource for the Qdrant destination: -```python +```py qdrant_adapter(data, embed) ``` @@ -99,7 +99,7 @@ Returns: [DLT resource](../../general-usage/resource.md) object that you can pas Example: -```python +```py qdrant_adapter( resource, embed=["title", "description"], @@ -122,7 +122,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -137,7 +137,7 @@ info = pipeline.run( The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination. 
For the `merge` disposition, you need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -170,7 +170,7 @@ However, if you prefer to have class names without the dataset prefix, skip the For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index bc03dbbbeb..fca49732e4 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -17,13 +17,13 @@ pip install dlt[redshift] Let's start by initializing a new dlt project as follows: -```bash +```shell dlt init chess redshift ``` > šŸ’” This command will initialize your pipeline with chess as the source and Redshift as the destination. The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for Redshift. You can install the necessary dependencies specified in the requirements file by executing it as follows: -```bash +```shell pip install -r requirements.txt ``` or withĀ `pip install dlt[redshift]`,Ā which installs theĀ `dlt`Ā library and the necessary dependencies for working with Amazon Redshift as a destination. @@ -52,7 +52,7 @@ To load data into Redshift, you need to create a Redshift cluster and enable acc 2. The "host" is derived from the cluster endpoint specified in the ā€œGeneral Configuration.ā€ For example: - ```bash + ```shell # If the endpoint is: redshift-cluster-1.cv3cmsy7t4il.us-east-1.redshift.amazonaws.com:5439/your_database_name # Then the host is: @@ -108,7 +108,7 @@ staging_iam_role="arn:aws:iam::..." ### Redshift/S3 staging example code -```python +```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index a6058a255e..d5227ac16f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -167,7 +167,7 @@ stage_name=PUBLIC.my_s3_stage To run Snowflake with S3 as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on S3 @@ -196,7 +196,7 @@ stage_name=PUBLIC.my_gcs_stage To run Snowflake with GCS as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on GCS @@ -227,7 +227,7 @@ stage_name=PUBLIC.my_azure_stage To run Snowflake with Azure as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on Azure diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index bac184fd41..adb1331700 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -86,7 +86,7 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy ``` To pass credentials directly you can use the `credentials` argument of `dlt.destinations.synapse(...)`: -```python +```py pipeline = dlt.pipeline( pipeline_name='chess', destination=dlt.destinations.synapse( @@ -117,7 +117,7 @@ Data is loaded via 
`INSERT` statements by default. ## Table index type The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: -```python +```py info = pipeline.run( synapse_adapter( data=your_resource, @@ -156,7 +156,7 @@ Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.m To run Synapse with staging on Azure Blob Storage: -```python +```py # Create a dlt pipeline that will load # chess player data to the snowflake destination # via staging on Azure Blob Storage diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 6bd52acd35..7267feb055 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -13,7 +13,7 @@ This destination helps you load data into Weaviate from [dlt resources](../../ge 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: -```bash +```shell pip install dlt[weaviate] ``` @@ -41,7 +41,7 @@ The `url` will default to **http://localhost:8080** and `api_key` is not defined 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import weaviate_adapter @@ -63,7 +63,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", @@ -73,7 +73,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -84,7 +84,7 @@ info = pipeline.run( 6. Check the results: -```python +```py print(info) ``` @@ -96,7 +96,7 @@ Weaviate destination is different from other [dlt destinations](../destinations/ The `weaviate_adapter` is a helper function that configures the resource for the Weaviate destination: -```python +```py weaviate_adapter(data, vectorize, tokenization) ``` @@ -109,7 +109,7 @@ Returns: a [dlt resource](../../general-usage/resource.md) object that you can p Example: -```python +```py weaviate_adapter( resource, vectorize=["title", "description"], @@ -133,7 +133,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -148,7 +148,7 @@ info = pipeline.run( The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data in the destination. For the `merge` disposition, you would need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -203,7 +203,7 @@ However, if you prefer to have class names without the dataset prefix, skip the For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", diff --git a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md index ff73e3741e..641be9a106 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md @@ -25,6 +25,6 @@ It is also supported by: **filesystem**. 
By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline will store your data in the INSERT format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="insert_values") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md index 130464578e..7467c6f639 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md @@ -27,6 +27,6 @@ This format is used by default by: **BigQuery**, **Snowflake**, **filesystem**. By setting the `loader_file_format` argument to `jsonl` in the run command, the pipeline will store your data in the jsonl format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="jsonl") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index cc2fcfb200..287a5daf1e 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -10,7 +10,7 @@ keywords: [parquet, file formats] To use this format, you need a `pyarrow` package. You can get this package as a `dlt` extra as well: -```sh +```shell pip install dlt[parquet] ``` @@ -20,7 +20,7 @@ Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **filesystem**, **Athena* By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will store your data in the parquet format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="parquet") ``` diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index d2ed03a2a2..e3a60dfa51 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -48,7 +48,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 4. **Chain staging to destination and request `parquet` file format.** Pass the `staging` argument to `dlt.pipeline`. It works like the destination `argument`: - ```python + ```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 @@ -60,7 +60,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ) ``` `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify `parquet` file format (just to demonstrate how to do it): - ```python + ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 1cf7a91bfb..42f31d4875 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -33,7 +33,7 @@ Included below is another example where we run a `dlt` pipeline and then a dbt p > šŸ’” Docstrings are available to read in your IDE. 
-```python +```py # load all pipedrive endpoints to pipedrive_raw dataset pipeline = dlt.pipeline( pipeline_name='pipedrive', diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md index 43321aab97..a65917fd64 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md @@ -11,7 +11,7 @@ keywords: [transform, sql] The DBT Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run statuses. -```python +```py from dlt.helpers.dbt_cloud import DBTCloudClientV2 # Initialize the client @@ -36,7 +36,7 @@ They simplify the process of triggering and monitoring job runs in dbt Cloud. This function triggers a job run in dbt Cloud using the specified configuration. It supports various customization options and allows for monitoring the job's status. -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with default configuration @@ -58,7 +58,7 @@ If you have already started a job run and have a run ID, then you can use the `g This function retrieves the full information about a specific dbt Cloud job run. It also supports options for waiting until the run is complete. -```python +```py from dlt.helpers.dbt_cloud import get_dbt_cloud_run_status # Retrieve status for a specific run diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index dc2fc6d40a..5a82d8be66 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -11,7 +11,7 @@ natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to th dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and counts the reaction types. -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index 6131cac85a..72c97f5503 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -12,7 +12,7 @@ including statements that change the database schema or data in the tables. In t insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi` connection. -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") try: with pipeline.sql_client() as client: @@ -27,7 +27,7 @@ try: In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row corresponding to selected columns. 
-```python +```py try: with pipeline.sql_client() as client: res = client.execute_sql( diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index 0baf1917d1..4e3e4788d3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -45,7 +45,7 @@ Sources and resources that can be loaded using this verified source are: Upon logging into Airtable and accessing your base or table, you'll notice a URL in your browser's address bar resembling: -```bash +```shell https://airtable.com/appve10kl227BIT4GV/tblOUnZVLFWbemTP1/viw3qtF76bRQC3wKx/rec9khXgeTotgCQ62?blocks=hide ``` @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init airtable duckdb ``` @@ -116,20 +116,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python airtable_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -147,7 +147,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves tables from given Airtable base. -```python +```py @dlt.source def airtable_source( base_id: str = dlt.config.value, @@ -167,7 +167,7 @@ tables in the schema are loaded. This function retrieves data from a single Airtable table. -```python +```py def airtable_resource( api: pyairtable.Api, base_id: str, @@ -186,7 +186,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="airtable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,7 +196,7 @@ verified source. 1. To load the entire base: - ```python + ```py base_id = "Please set me up!" # The id of the base. airtables = airtable_source(base_id=base_id)) @@ -205,7 +205,7 @@ verified source. 1. To load selected tables from a base table: - ```python + ```py base_id = "Please set me up!" # The id of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. @@ -221,7 +221,7 @@ verified source. 1. To load data and apply hints to a specific column: - ```python + ```py base_id = "Please set me up!" # The id of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. resource_name = "Please set me up!" # The table name we want to apply hints. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 4118902a6c..acea8ae5dc 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```shell dlt init kinesis duckdb ``` @@ -110,16 +110,16 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python kinesis_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`. You may @@ -138,7 +138,7 @@ This resource reads a Kinesis stream and yields messages. It supports [incremental loading](../../general-usage/incremental-loading) and parses messages as json by default. -```python +```py @dlt.resource( name=lambda args: args["stream_name"], primary_key="_kinesis_msg_id", @@ -212,7 +212,7 @@ verified source. 1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kinesis_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -222,7 +222,7 @@ verified source. 1. To load messages from a stream from the last one hour: - ```python + ```py # the resource below will take its name from the stream name, # it can be used multiple times by default it assumes that Data is json and parses it, # here we disable that to just get bytes in data elements of the message @@ -237,7 +237,7 @@ verified source. 1. For incremental Kinesis streams, to fetch only new messages: - ```python + ```py #running pipeline will get only new messages info = pipeline.run(kinesis_stream_data) message_counts = pipeline.last_trace.last_normalize_info.row_counts @@ -249,7 +249,7 @@ verified source. 1. To parse json with a simple decoder: - ```python + ```py def _maybe_parse_json(item: TDataItem) -> TDataItem: try: item.update(json.loadb(item["data"])) @@ -263,7 +263,7 @@ verified source. 1. To read Kinesis messages and send them somewhere without using a pipeline: - ```python + ```py from dlt.common.configuration.container import Container from dlt.common.pipeline import StateInjectableContext diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index df968422d7..915a9d297a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -25,7 +25,7 @@ To write an Arrow source, pass any `pyarrow.Table`, `pyarrow.RecordBatch` or `pa This example loads a Pandas dataframe to a Snowflake table: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -45,7 +45,7 @@ pipeline.run(df, table_name="orders") A `pyarrow` table can be loaded in the same way: -```python +```py import pyarrow as pa # Create dataframe and pipeline same as above @@ -96,7 +96,7 @@ Usage is the same as without other dlt resources. Refer to the [incremental load Example: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -144,7 +144,7 @@ All struct types are represented as `complex` and will be loaded as JSON (if des even if they are present in the destination. 
If you want to represent nested data as separated tables, you must yield panda frames and arrow tables as records. In the examples above: -```python +```py # yield panda frame as records pipeline.run(df.to_dict(orient='records'), table_name="orders") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index 8554cdd376..7617566a35 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init asana_dlt duckdb ``` @@ -94,16 +94,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python asana_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `asana`, you may also use any @@ -127,7 +127,7 @@ it is important to note the complete list of the default endpoints given in This is a `dlt.source` function, which returns a list of DltResource objects: "workspaces", "projects", "sections","tags","tasks","stories", "teams", and "users". -```python +```py @dlt.source def asana_source(access_token: str = dlt.secrets.value) -> Any: return [ @@ -142,7 +142,7 @@ def asana_source(access_token: str = dlt.secrets.value) -> Any: This is a `dlt.resource` function, which returns collections of tasks and related information. -```python +```py @dlt.resource(write_disposition="replace") def workspaces( access_token: str = dlt.secrets.value, @@ -171,7 +171,7 @@ transformer functions transform or process data from one or more resources. The transformer function `projects` process data from the `workspaces` resource. It fetches and returns a list of projects for a given workspace from Asana. -```python +```py @dlt.transformer( data_from=workspaces, write_disposition="replace", @@ -200,7 +200,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This [incremental](../../general-usage/incremental-loading.md) resource-transformer fetches all tasks for a given project from Asana. -```python +```py @dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid") def tasks( project_array: t.List[TDataItem], @@ -235,7 +235,7 @@ these steps: 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="asana_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -248,13 +248,13 @@ these steps: 1. To load the data from all the fields, you can utilise the `asana_source` method as follows: - ```python + ```py load_data = asana_source() ``` 1. Use the method `pipeline.run()` to execute the pipeline. - ```python + ```py load_info = pipeline.run(load_data) # print the information on data that was loaded print(load_info) @@ -263,7 +263,7 @@ these steps: 1. 
To use the method `pipeline.run()` to load custom endpoints ā€œworkspacesā€ and ā€œprojectsā€, the above script may be modified as: - ```python + ```py load_info = pipeline.run(load_data.with_resources("workspaces", "projects")) # print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 7f01b83f08..05331148f6 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -36,7 +36,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init chess duckdb ``` @@ -66,20 +66,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python chess_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -98,7 +98,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This is a `dlt.source` function for the Chess.com API named "chess", which returns a sequence of DltResource objects. That we'll discuss in subsequent sections as resources. -```python +```py dlt.source(name="chess") def source( players: List[str], start_month: str = None, end_month: str = None @@ -120,7 +120,7 @@ to fetch game data (in "YYYY/MM" format). This is a `dlt.resource` function, which returns player profiles for a list of player usernames. -```python +```py @dlt.resource(write_disposition="replace") def players_profiles(players: List[str]) -> Iterator[TDataItem]: @@ -138,7 +138,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This is a `dlt.resource` function, which returns url to game archives for specified players. -```python +```py @dlt.resource(write_disposition="replace", selected=False) def players_archives(players: List[str]) -> Iterator[List[TDataItem]]: ... @@ -154,7 +154,7 @@ runs. This incremental resource takes data from players and returns games for the last month if not specified otherwise. -```python +```py @dlt.resource(write_disposition="append") def players_games( players: List[str], start_month: str = None, end_month: str = None @@ -186,7 +186,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="chess_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -199,7 +199,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. To load the data from all the resources for specific players (e.g. for November), you can utilise the `source` method as follows: - ```python + ```py # Loads games for Nov 2022 data = source( ["magnuscarlsen", "vincentkeymer", "dommarajugukesh", "rpragchess"], @@ -210,7 +210,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Use the method `pipeline.run()` to execute the pipeline. 
- ```python + ```py info = pipeline.run(data) # print the information on data that was loaded print(info) @@ -219,7 +219,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. To load data from specific resources like "players_games" and "player_profiles", modify the above code as: - ```python + ```py info = pipeline.run(data.with_resources("players_games", "players_profiles")) # print the information on data that was loaded print(info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md index dea97921b4..e788084fe3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md @@ -66,7 +66,7 @@ By default, Facebook access tokens have a short lifespan of one hour. To exchang Facebook access token for a long-lived token, update the `.dlt/secrets.toml` with client_id, and client_secret and execute the provided Python code. -```python +```py from facebook_ads import get_long_lived_token print(get_long_lived_token("your short-lived token") ``` @@ -77,7 +77,7 @@ the above code snippet. To retrieve the expiry date and the associated scopes of the token, you can use the following command: -```python +```py from facebook_ads import debug_access_token debug_access_token() ``` @@ -101,7 +101,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init facebook_ads duckdb ``` @@ -158,16 +158,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python facebook_ads_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also @@ -191,7 +191,7 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load campaigns, ad sets, ads, creatives, and ad leads data from Facebook Marketing API. -```python +```py @dlt.source(name="facebook_ads") def facebook_ads_source( account_id: str = dlt.config.value, @@ -220,7 +220,7 @@ were issued i.e. 'v17.0'. Defaults to the _facebook_business_ library default ve The ads function fetches ad data. It retrieves ads from a specified account with specific fields and states. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def ads( fields: Sequence[str] = DEFAULT_AD_FIELDS, @@ -254,7 +254,7 @@ The default fields are defined in This function returns a list of resources to load facebook_insights. -```python +```py @dlt.source(name="facebook_ads") def facebook_insights_source( account_id: str = dlt.config.value, @@ -315,7 +315,7 @@ were issued i.e. 'v17.0'. Defaults to the facebook_business library default vers This function fetches Facebook insights data incrementally from a specified start date until the current date, in day steps. 
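Before the full resource signature shown below, here is a minimal, generic sketch of the incremental mechanism it relies on: `dlt.sources.incremental` persists the highest value of a cursor field between pipeline runs and hands it back on the next run, so only newer rows are fetched and merged. The resource name, cursor field, and data in this sketch are illustrative placeholders, not the actual Facebook Ads implementation.

```py
import dlt

def fetch_rows(since: str):
    # Hypothetical stand-in for an API call returning rows newer than `since`.
    rows = [
        {"id": 1, "date_start": "2024-01-02"},
        {"id": 2, "date_start": "2024-01-03"},
    ]
    return [row for row in rows if row["date_start"] > since]

@dlt.resource(primary_key="id", write_disposition="merge")
def daily_report(
    date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
        "date_start", initial_value="2024-01-01"
    ),
):
    # dlt stores the highest "date_start" seen so far; on the next run only newer
    # rows are yielded and merged on the primary key.
    yield from fetch_rows(since=date_start.last_value)

pipeline = dlt.pipeline(
    pipeline_name="insights_sketch", destination="duckdb", dataset_name="sketch"
)
print(pipeline.run(daily_report()))
```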
-```python +```py @dlt.resource(primary_key=INSIGHTS_PRIMARY_KEY, write_disposition="merge") def facebook_insights( date_start: dlt.sources.incremental[str] = dlt.sources.incremental( @@ -337,7 +337,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="facebook_ads", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -350,7 +350,7 @@ verified source. 1. To load all the data from, campaigns, ad sets, ads, ad creatives and leads. - ```python + ```py load_data = facebook_ads_source() load_info = pipeline.run(load_data) print(load_info) @@ -359,7 +359,7 @@ verified source. 1. To merge the Facebook Ads with the state ā€œDISAPPROVEDā€ and with ads state ā€œPAUSEDā€ you can do the following: - ```python + ```py load_data = facebook_ads_source() # It is recommended to enable root key propagation on a source that is not a merge one by default. this is not required if you always use merge but below we start with replace load_data.root_key = True @@ -382,7 +382,7 @@ verified source. 1. To load data with a custom field, for example, to load only ā€œidā€ from Facebook ads, you can do the following: - ```python + ```py load_data = facebook_ads_source() # Only loads add ids, works the same for campaigns, leads etc. load_data.ads.bind(fields=("id",)) @@ -395,7 +395,7 @@ verified source. demonstrates how to enrich objects by adding an enrichment transformation that includes additional fields. - ```python + ```py # You can reduce the chunk size for smaller requests load_data = facebook_ads_source(chunk_size=2) @@ -429,7 +429,7 @@ verified source. breakdowns, etc. As defined in the `facebook_insights_source`. This function generates daily reports for a specified number of past days. - ```python + ```py load_data = facebook_insights_source( initial_load_past_days=30, attribution_window_days_lag= 7, diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index aed19838ef..5c57cf889e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -81,7 +81,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init filesystem duckdb ``` @@ -150,32 +150,32 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. Install optional modules: - For AWS S3: - ```bash + ```shell pip install s3fs ``` - For Azure blob: - ```bash + ```shell pip install adlfs>=2023.9.0 ``` - GCS storage: No separate module needed. 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python filesystem_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -197,7 +197,7 @@ This source offers chunked file readers as resources, which can be optionally cu - `read_jsonl()` - `read_parquet()` -```python +```py @dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource) def readers( bucket_url: str = dlt.secrets.value, @@ -225,7 +225,7 @@ This resource lists files in `bucket_url` based on the `file_glob` pattern, retu [FileItem](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/storages/fsspec_filesystem.py#L22) with data access methods. These can be paired with transformers for enhanced processing. -```python +```py @dlt.resource( primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True ) @@ -256,7 +256,7 @@ in bucket URL. To load data into a specific table (instead of the default filesystem table), see the snippet below: -```python +```py @dlt.transformer(standalone=True) def read_csv(items, chunksize: int = 15) ->: """Reads csv file with Pandas chunk by chunk.""" @@ -275,7 +275,7 @@ Use the [standalone filesystem](../../general-usage/resource#declare-a-standalone-resource) resource to list files in s3, GCS, and Azure buckets. This allows you to customize file readers or manage files using [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html). -```python +```py files = filesystem(bucket_url="s3://my_bucket/data", file_glob="csv_folder/*.csv") pipeline.run(files) ``` @@ -327,7 +327,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_filesystem", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -337,7 +337,7 @@ verified source. 1. To read and load CSV files: - ```python + ```py BUCKET_URL = "YOUR_BUCKET_PATH_HERE" # path of the bucket url or local destination met_files = readers( bucket_url=BUCKET_URL, file_glob="directory/*.csv" @@ -358,7 +358,7 @@ verified source. ::: 1. To load only new CSV files with [incremental loading](../../general-usage/incremental-loading): - ```python + ```py # This configuration will only consider new csv files new_files = filesystem(bucket_url=BUCKET_URL, file_glob="directory/*.csv") # add incremental on modification time @@ -369,7 +369,7 @@ verified source. ``` 1. To read and load Parquet and JSONL from a bucket: - ```python + ```py jsonl_reader = readers(BUCKET_URL, file_glob="**/*.jsonl").read_jsonl( chunksize=10000 ) @@ -391,7 +391,7 @@ verified source. 1. To set up a pipeline that reads from an Excel file using a standalone transformer: - ```python + ```py # Define a standalone transformer to read data from an Excel file. @dlt.transformer(standalone=True) def read_excel( @@ -427,7 +427,7 @@ verified source. 1. To copy files locally, add a step in the filesystem resource and then load the listing to the database: - ```python + ```py def _copy(item: FileItemDict) -> FileItemDict: # instantiate fsspec and copy file dest_file = os.path.join(local_folder, item["file_name"]) @@ -459,7 +459,7 @@ verified source. You can get a fsspec client from filesystem resource after it was extracted i.e. in order to delete processed files etc. 
The filesystem module contains a convenient method `fsspec_from_resource` that can be used as follows: - ```python + ```py from filesystem import filesystem, fsspec_from_resource # get filesystem source gs_resource = filesystem("gs://ci-test-bucket/") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 2fd0277500..80b5a7d37f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init github duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python github_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may @@ -137,7 +137,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This `dlt.source` function uses GraphQL to fetch DltResource objects: issues and pull requests along with associated reactions, comments, and reactions to comments. -```python +```py @dlt.source def github_reactions( owner: str, @@ -169,7 +169,7 @@ yet to be implemented. Defaults to None. The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, their associated comments, and subsequent reactions. -```python +```py dlt.resource( _get_reactions_data( "issues", @@ -193,7 +193,7 @@ on event type. It loads new events only and appends them to tables. > Note: Github allows retrieving up to 300 events for public repositories, so frequent updates are > recommended for active repos. -```python +```py @dlt.source(max_table_nesting=2) def github_repo_events( owner: str, name: str, access_token: str = None @@ -216,7 +216,7 @@ Read more about [nesting levels](../../general-usage/source#reduce-the-nesting-l This `dlt.resource` function serves as the resource for the `github_repo_events` source. It yields repository events as data items. -```python +```py dlt.resource(primary_key="id", table_name=lambda i: i["type"]) # type: ignore def repo_events( last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental( @@ -244,7 +244,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -258,7 +258,7 @@ verified source. 1. To load all the data from repo on issues, pull requests, their comments and reactions, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb") load_info = pipeline.run(load_data) print(load_info) @@ -267,7 +267,7 @@ verified source. 1. 
To load only the first 100 issues, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb", max_items=100) load_info = pipeline.run(load_data.with_resources("issues")) print(load_info) @@ -276,7 +276,7 @@ verified source. 1. You can use fetch and process repo events data incrementally. It loads all data during the first run and incrementally in subsequent runs. - ```python + ```py load_data = github_repo_events( "duckdb", "duckdb", access_token=os.getenv(ACCESS_TOKEN) ) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index b6a3a0a5a8..874f209bf7 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -93,7 +93,7 @@ follow these steps: After configuring "client_id", "client_secret", and "project_id" in "secrets.toml", to generate the refresh token, run the following script from the root folder: -```bash +```shell python google_analytics/setup_script_gcp_oauth.py ``` @@ -128,7 +128,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init google_analytics duckdb ``` @@ -214,16 +214,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python google_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -241,7 +241,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources including metadata, metrics, and dimensions data from the Google Analytics API. -```python +```py @dlt.source(max_table_nesting=2) def google_analytics( credentials: Union[ GcpOAuthCredentials, GcpServiceAccountCredential ] = dlt.secrets.value, @@ -269,7 +269,7 @@ set to 1000. This function retrieves all the metrics and dimensions for a report from a Google Analytics project. -```python +```py @dlt.resource(selected=False) def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: ``` @@ -284,7 +284,7 @@ def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: This transformer function extracts data using metadata and populates a table called "metrics" with the data from each metric. -```python +```py @dlt.transformer(data_from=get_metadata, write_disposition="replace", name="metrics") def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: for metric in metadata.metrics: @@ -304,7 +304,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_analytics", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -317,7 +317,7 @@ verified source. 1. 
To load all the data from metrics and dimensions: - ```python + ```py load_data = google_analytics() load_info = pipeline.run(load_data) print(load_info) @@ -328,7 +328,7 @@ verified source. 1. To load data from a specific start date: - ```python + ```py load_data = google_analytics(start_date='2023-01-01') load_info = pipeline.run(load_data) print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 2a5d4b03ab..f07777dcff 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -98,7 +98,7 @@ follow these steps: After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". To generate the refresh token, run the following script from the root folder: - ```bash + ```shell python google_sheets/setup_script_gcp_oauth.py ``` @@ -128,13 +128,13 @@ following: When setting up the pipeline, you can use either the browser-copied URL of your spreadsheet: -```bash +```shell https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing ``` or spreadsheet id (which is a part of the url) -```bash +```shell 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4 ``` @@ -214,7 +214,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init google_sheets duckdb ``` @@ -296,20 +296,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python google_sheets_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -328,7 +328,7 @@ Also, since recently `dlt`'s no longer recognizing date and time types, so you h Use the `apply_hints` method on the resource to achieve this. Here's how you can do it: -```python +```py for resource in resources: resource.apply_hints(columns={ "total_amount": {"data_type": "double"}, @@ -340,7 +340,7 @@ This will ensure that all values in the `total_amount` column are treated as `do And `date` column will be represented as dates, not integers. For a single resource (e.g. `Sheet1`), you can simply use: -```python +```py source.Sheet1.apply_hints(columns={ "total_amount": {"data_type": "double"}, "date": {"data_type": "timestamp"}, @@ -348,7 +348,7 @@ source.Sheet1.apply_hints(columns={ ``` To get the name of resources, you can use: -```python +```py print(source.resources.keys()) ``` @@ -371,7 +371,7 @@ or set `full_refresh=True`. This function loads data from a Google Spreadsheet. It retrieves data from all specified ranges, whether explicitly defined or named, and obtains metadata for the first two rows within each range. -```python +```py def google_spreadsheet( spreadsheet_url_or_id: str = dlt.config.value, range_names: Sequence[str] = dlt.config.value, @@ -399,7 +399,7 @@ def google_spreadsheet( This function processes each range name provided by the source function, loading its data into separate tables in the destination. 
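The verified source wraps this as shown in the snippet below. As a self-contained illustration of the same pattern, one `dlt.resource` can be created per range name so that each range lands in its own table; the range names and rows in this sketch are made up for illustration only.

```py
import dlt

def rows_for_range(range_name: str):
    # Hypothetical data; a real implementation would read the spreadsheet range here.
    yield {"range": range_name, "value": 1}

range_names = ["sheet1_a1_b10", "sheet2_a1_b10"]  # illustrative range names

# One dynamically named resource per range, so every range becomes its own table.
resources = [
    dlt.resource(rows_for_range(name), name=name, write_disposition="replace")
    for name in range_names
]

pipeline = dlt.pipeline(
    pipeline_name="sheets_sketch", destination="duckdb", dataset_name="sketch"
)
print(pipeline.run(resources))
```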
-```python +```py dlt.resource( process_range(rows_data, headers=headers, data_types=data_types), name=name, @@ -429,7 +429,7 @@ This table refreshes after each load, storing information on loaded ranges: - Range name as given to the source. - String and parsed representation of the loaded range. -```python +```py dlt.resource( metadata_table, write_disposition="merge", @@ -457,7 +457,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_sheets", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -467,7 +467,7 @@ verified source. 1. To load data from explicit range names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["range_name1", "range_name2"], # Range names @@ -483,7 +483,7 @@ verified source. 1. To load all the range_names from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=False, @@ -497,7 +497,7 @@ verified source. 1. To load all the sheets from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -511,7 +511,7 @@ verified source. 1. To load all the sheets and range_names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -525,7 +525,7 @@ verified source. 1. To load data from multiple spreadsheets: - ```python + ```py load_data1 = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -543,7 +543,7 @@ verified source. 1. To load with table rename: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -583,7 +583,7 @@ Below is the correct way to set up an Airflow DAG for this purpose: - When adding the Google Spreadsheet task to the pipeline, avoid decomposing it; run it as a single task for efficiency. -```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md index 3a623c7b49..c40831faa7 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md @@ -74,7 +74,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init hubspot duckdb ``` @@ -115,16 +115,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python hubspot_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may @@ -148,7 +148,7 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load companies, contacts, deals, tickets, products, and web analytics events data into the destination. -```python +```py @dlt.source(name="hubspot") def hubspot( api_key: str = dlt.secrets.value, @@ -166,7 +166,7 @@ specified entities. This resource function fetches data from the "companies" endpoint and loads it to the destination, replacing any existing data. -```python +```py @dlt.resource(name="companies", write_disposition="replace") def companies( api_key: str = api_key, @@ -195,7 +195,7 @@ in addition to the custom properties. Similar to this, resource functions "conta This function loads web analytics events for specific objects from Hubspot API into the destination. -```python +```py @dlt.resource def hubspot_events_for_objects( object_type: THubspotObjectType, @@ -225,7 +225,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="hubspot", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -238,7 +238,7 @@ verified source. 1. To load all the data from contacts, companies, deals, products, tickets, and quotes into the destination. - ```python + ```py load_data = hubspot() load_info = pipeline.run(load_data) print(load_info) @@ -246,7 +246,7 @@ verified source. 1. To load data from contacts and companies, with time history using "with_resources" method. - ```python + ```py load_data = hubspot(include_history=True).with_resources("companies","contacts") load_info = pipeline.run(load_data) print(load_info) @@ -256,7 +256,7 @@ verified source. 1. By default, all the custom properties of a CRM object are extracted. If you want only particular fields, set the flag `include_custom_props=False` and add a list of properties with the `props` arg. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["date_of_birth", "degree"], include_custom_props=False) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -264,7 +264,7 @@ verified source. 1. If you want to read all the custom properties of CRM objects and some additional (e.g. Hubspot driven) properties. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["hs_content_membership_email", "hs_content_membership_email_confirmed"]) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -273,7 +273,7 @@ verified source. 1. To load the web analytics events of a given object type. - ```python + ```py resource = hubspot_events_for_objects("company", ["7086461639", "7086464459"]) # Here, object type : company, and object ids : 7086461639 and 7086464459 load_info = pipeline.run([resource]) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index 75106df609..0473682aed 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```shell dlt init inbox duckdb ``` @@ -112,7 +112,7 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` @@ -128,7 +128,7 @@ For more information, read the 2. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `standard_inbox`, you may also @@ -145,7 +145,7 @@ For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs This function fetches inbox emails, saves attachments locally, and returns uids, messages, and attachments as resources. -```python +```py @dlt.source def inbox_source( host: str = dlt.secrets.value, @@ -182,7 +182,7 @@ def inbox_source( This resource collects email message UIDs (Unique IDs) from the mailbox. -```python +```py @dlt.resource(name="uids") def get_messages_uids( initial_message_num: Optional[ @@ -197,7 +197,7 @@ def get_messages_uids( This resource retrieves emails by UID (Unique IDs), yielding a dictionary with metadata like UID, ID, sender, subject, dates, content type, and body. -```python +```py @dlt.transformer(name="messages", primary_key="message_uid") def get_messages( items: TDataItems, @@ -214,7 +214,7 @@ def get_messages( Similar to the previous resources, resource `get_attachments` extracts email attachments by UID from the IMAP server. It yields file items with attachments in the file_content field and the original email in the message field. -```python +```py @dlt.transformer( name="attachments", primary_key="file_hash", @@ -236,7 +236,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_inbox", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -250,7 +250,7 @@ verified source. - Set `DEFAULT_START_DATE = pendulum.datetime(2023, 10, 1)` in `./inbox/settings.py`. - Use the following code: - ```python + ```py # Retrieve messages from the specified email address. messages = inbox_source(filter_emails=("mycreditcard@bank.com",)).messages # Configure messages to exclude body and name the result "my_inbox". @@ -263,7 +263,7 @@ verified source. > Please refer to inbox_source() docstring for email filtering options by sender, date, or mime type. 3. To load messages from multiple emails, including "community@dlthub.com": - ```python + ```py messages = inbox_source( filter_emails=("mycreditcard@bank.com", "community@dlthub.com.") ).messages @@ -272,7 +272,7 @@ verified source. 4. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. 
Using the `pdf_to_text` function to load parsed pdfs from mail to the database: - ```python + ```py filter_emails = ["mycreditcard@bank.com", "community@dlthub.com."] # Email senders attachments = inbox_source( filter_emails=filter_emails, filter_by_mime_type=["application/pdf"] diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index c796014835..4709961cd1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -51,7 +51,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init jira duckdb ``` @@ -102,16 +102,16 @@ For more information, read [General Usage: Credentials.](../../general-usage/cre 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python jira_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also @@ -134,7 +134,7 @@ it is important to note the complete list of the default endpoints given in This source function creates a list of resources to load data into the destination. -```python +```py @dlt.source def jira( subdomain: str = dlt.secrets.value, @@ -152,7 +152,7 @@ def jira( This function returns a resource for querying issues using JQL [(Jira Query Language)](https://support.atlassian.com/jira-service-management-cloud/docs/use-advanced-search-with-jira-query-language-jql/). -```python +```py @dlt.source def jira_search( subdomain: str = dlt.secrets.value, @@ -168,7 +168,7 @@ for the [jira source](jira.md#source-jira). The resource function searches issues using JQL queries and then loads them to the destination. -```python +```py @dlt.resource(write_disposition="replace") def issues(jql_queries: List[str]) -> Iterable[TDataItem]: api_path = "rest/api/3/search" @@ -186,7 +186,7 @@ above. about pipeline configuration, please refer to our documentation [here](https://dlthub.com/docs/general-usage/pipeline): - ```python + ```py pipeline = dlt.pipeline( pipeline_name="jira_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,7 +196,7 @@ above. 2. To load custom endpoints such as ā€œissuesā€ and ā€œusersā€ using the jira source function: - ```python + ```py #Run the pipeline load_info = pipeline.run(jira().with_resources("issues","users")) print(f"Load Information: {load_info}") @@ -205,7 +205,7 @@ above. 3. To load the custom issues using JQL queries, you can use custom queries. Here is an example below: - ```python + ```py # Define the JQL queries as follows queries = [ "created >= -30d order by created DESC", diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index 5bff03e357..16f53ca845 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -38,7 +38,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```shell dlt init kafka duckdb ``` @@ -80,20 +80,20 @@ sasl_password="example_secret" 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 2. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python kafka_pipeline.py ``` 3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -108,7 +108,7 @@ For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/ This function retrieves messages from the given Kafka topics. -```python +```py @dlt.resource(name="kafka_messages", table_name=lambda msg: msg["_kafka"]["topic"], standalone=True) def kafka_consumer( topics: Union[str, List[str]], @@ -151,7 +151,7 @@ this offset. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kafka", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -161,7 +161,7 @@ this offset. 2. To extract several topics: - ```python + ```py topics = ["topic1", "topic2", "topic3"] source = kafka_consumer(topics) @@ -170,7 +170,7 @@ this offset. 3. To extract messages and process them in a custom way: - ```python + ```py def custom_msg_processor(msg: confluent_kafka.Message) -> Dict[str, Any]: return { "_kafka": { @@ -187,7 +187,7 @@ this offset. 4. To extract messages, starting from a timestamp: - ```python + ```py data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15)) pipeline.run(data) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md index 45841850c6..597964a37b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md @@ -44,7 +44,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init matomo duckdb ``` @@ -102,16 +102,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python matomo_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also @@ -128,7 +128,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function executes and loads a set of reports defined in "queries" for a specific Matomo site identified by "site_id". -```python +```py @dlt.source(max_table_nesting=2) def matomo_reports( api_token: str = dlt.secrets.value, @@ -152,7 +152,7 @@ def matomo_reports( The function loads visits from current day and the past `initial_load_past_days` in first run. In subsequent runs it continues from last load and skips active visits until closed. 
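The full signature follows below. As a rough, generic sketch of this "backfill a past window on the first run, then continue from the last load" behaviour, an incremental cursor can simply be seeded with a date `initial_load_past_days` in the past; the resource and field names here are illustrative and not the Matomo implementation.

```py
import dlt
import pendulum

INITIAL_LOAD_PAST_DAYS = 7  # illustrative backfill window

@dlt.resource(name="visits_sketch", primary_key="idVisit", write_disposition="append")
def visits(
    last_date: dlt.sources.incremental[str] = dlt.sources.incremental(
        "serverDate",
        initial_value=pendulum.today()
        .subtract(days=INITIAL_LOAD_PAST_DAYS)
        .to_date_string(),
    ),
):
    # First run: everything since the seeded window. Later runs: only rows newer
    # than the cursor dlt stored from the previous load.
    for day in range(3):  # stand-in for paging through an API
        yield {
            "idVisit": day,
            "serverDate": pendulum.today().subtract(days=day).to_date_string(),
        }

pipeline = dlt.pipeline(
    pipeline_name="matomo_sketch", destination="duckdb", dataset_name="sketch"
)
print(pipeline.run(visits()))
```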
-```python +```py def matomo_visits( api_token: str = dlt.secrets.value, url: str = dlt.config.value, @@ -184,7 +184,7 @@ def matomo_visits( This function retrieves site visits within a specified timeframe. If a start date is given, it begins from that date. If not, it retrieves all visits up until now. -```python +```py @dlt.resource( name="visits", write_disposition="append", primary_key="idVisit", selected=True ) @@ -215,7 +215,7 @@ def get_last_visits( This function, retrieves unique visit information from get_last_visits. -```python +```py @dlt.transformer( data_from=get_last_visits, write_disposition="merge", @@ -242,7 +242,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="matomo", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -255,7 +255,7 @@ verified source. 1. To load the data from reports. - ```python + ```py data_reports = matomo_reports() load_info = pipeline_reports.run(data_reports) print(load_info) @@ -264,7 +264,7 @@ verified source. 1. To load custom data from reports using queries. - ```python + ```py queries = [ { "resource_name": "custom_report_name", @@ -285,7 +285,7 @@ verified source. 1. To load data from reports and visits. - ```python + ```py data_reports = matomo_reports() data_events = matomo_visits() load_info = pipeline_reports.run([data_reports, data_events]) @@ -294,7 +294,7 @@ verified source. 1. To load data on live visits and visitors, and only retrieve data from today. - ```python + ```py load_data = matomo_visits(initial_load_past_days=1, get_live_event_visitors=True) load_info = pipeline_events.run(load_data) print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index 9178d2ab6d..15894ce91b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -66,30 +66,30 @@ Here are the typical ways to configure MongoDB and their connection URLs: 1. Connect to MongoDB: - ```bash + ```shell mongo "mongodb://dbuser:passwd@your_host:27017" ``` 1. List all Databases: - ```bash + ```shell show dbs ``` 1. View Collections in a Database: 1. Switch to Database: - ```bash + ```shell use your_database_name ``` 1. Display its Collections: - ```bash + ```shell show collections ``` 1. Disconnect: - ```bash + ```shell exit ``` @@ -115,7 +115,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init mongodb duckdb ``` @@ -174,16 +174,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python mongodb_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `local_mongo`, you may also @@ -200,7 +200,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads data from a MongoDB database, yielding one or multiple collections to be retrieved. -```python +```py @dlt.source def mongodb( connection_url: str = dlt.secrets.value, @@ -226,7 +226,7 @@ def mongodb( This function fetches a single collection from a MongoDB database using PyMongo. -```python +```py def mongodb_collection( connection_url: str = dlt.secrets.value, database: Optional[str] = dlt.config.value, @@ -247,7 +247,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mongodb_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -257,7 +257,7 @@ verified source. 1. To load all the collections in a database: - ```python + ```py load_data = mongodb() load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -265,7 +265,7 @@ verified source. 1. To load a specific collections from the database: - ```python + ```py load_data = mongodb().with_resources("collection_1", "collection_2") load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -273,7 +273,7 @@ verified source. 1. To load specific collections from the source incrementally: - ```python + ```py load_data = mongodb(incremental=dlt.sources.incremental("date")).with_resources("collection_1") load_info = pipeline.run(load_data, write_disposition = "merge") print(load_info) @@ -282,7 +282,7 @@ verified source. 1. To load data from a particular collection say "movies" incrementally: - ```python + ```py load_data = mongodb_collection( collection="movies", incremental=dlt.sources.incremental( @@ -300,7 +300,7 @@ verified source. 1. To incrementally load a table with an append-only disposition using hints: - ```python + ```py # Suitable for tables where new rows are added, but existing rows aren't updated. # Load data from the 'listingsAndReviews' collection in MongoDB, using 'last_scraped' for incremental addition. airbnb = mongodb().with_resources("listingsAndReviews") @@ -317,7 +317,7 @@ verified source. 1. To load a selected collection and rename it in the destination: - ```python + ```py # Create the MongoDB source and select the "collection_1" collection source = mongodb().with_resources("collection_1") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index a713121f29..1706d9dec4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -46,7 +46,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init mux duckdb ``` @@ -88,16 +88,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! 
To get started, run the following command: - ```bash + ```shell python mux_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -115,7 +115,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function yields resources "asset_resource" and "views_resource" to load video assets and views. -```python +```py @dlt.source def mux_source() -> Iterable[DltResource]: yield assets_resource @@ -126,7 +126,7 @@ def mux_source() -> Iterable[DltResource]: The assets_resource function fetches metadata about video assets from the Mux API's "assets" endpoint. -```python +```py @dlt.resource(write_disposition="merge") def assets_resource( mux_api_access_token: str = dlt.secrets.value, @@ -145,7 +145,7 @@ def assets_resource( This function yields data about every video view from yesterday to be loaded. -```python +```py @dlt.resource(write_disposition="append") def views_resource( mux_api_access_token: str = dlt.secrets.value, @@ -165,7 +165,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mux_pipeline", # Use a custom name if desired destination="bigquery", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -175,21 +175,21 @@ verified source. 1. To load metadata about every asset to be loaded: - ```python + ```py load_info = pipeline.run(mux_source().with_resources("assets_resource") print(load_info) ``` 1. To load data for each video view from yesterday: - ```python + ```py load_info = pipeline.run(mux_source().with_resources("views_resource") print(load_info) ``` 1. To load both metadata about assets and video views from yesterday: - ```python + ```py load_info = pipeline.run(mux_source()) print(load_info) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index ffb0becfbb..88e08cba35 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init notion duckdb ``` @@ -93,16 +93,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python notion_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `notion`, you may also use any @@ -119,7 +119,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads notion databases from notion into the destination. -```python +```py @dlt.source def notion_databases( database_ids: Optional[List[Dict[str, str]]] = None, @@ -146,7 +146,7 @@ verified source. 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="notion", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -159,7 +159,7 @@ verified source. 1. To load all the integrated databases: - ```python + ```py load_data = notion_databases() load_info = pipeline.run(load_data) print(load_info) @@ -167,7 +167,7 @@ verified source. 1. To load the custom databases: - ```python + ```py selected_database_ids = [{"id": "0517dae9409845cba7d","use_name":"db_one"}, {"id": "d8ee2d159ac34cfc"}] load_data = notion_databases(database_ids=selected_database_ids) load_info = pipeline.run(load_data) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md index 6fae36d0ec..bb40758d40 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init personio duckdb ``` @@ -102,16 +102,16 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python personio_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `personio`, you may also use @@ -127,7 +127,7 @@ For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) ### Source `personio_source` This `dlt` source returns data resources like `employees`, `absences`, `absence_types`, etc. -```python +```py @dlt.source(name="personio") def personio_source( client_id: str = dlt.secrets.value, @@ -158,7 +158,7 @@ def personio_source( This resource retrieves data on all the employees in a company. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def employees( updated_at: dlt.sources.incremental[ @@ -185,7 +185,7 @@ data incrementally from the Personio API to your preferred destination. ### Resource `absence_types` Simple resource, which retrieves a list of various types of employee absences. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]: ... @@ -209,7 +209,7 @@ The transformer functions transform or process data from resources. The transformer function `employees_absences_balance` process data from the `employees` resource. It fetches and returns a list of the absence balances for each employee. -```python +```py @dlt.transformer( data_from=employees, write_disposition="merge", @@ -232,7 +232,7 @@ verified source. 1. 
Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="personio", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -242,14 +242,14 @@ verified source. 1. To load employee data: - ```python + ```py load_data = personio_source().with_resources("employees") print(pipeline.run(load_data)) ``` 1. To load data from all supported endpoints: - ```python + ```py load_data = personio_source() print(pipeline.run(load_data)) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 17907c9467..57cd46e406 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -53,7 +53,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init pipedrive duckdb ``` @@ -93,16 +93,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python pipedrive_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `pipedrive`, but you may also use @@ -138,7 +138,7 @@ Pipedrive API. This function returns a list of resources including activities, deals, custom_fields_mapping and other resources data from Pipedrive API. -```python +```py @dlt.source(name="pipedrive") def pipedrive_source( pipedrive_api_key: str = dlt.secrets.value, @@ -159,7 +159,7 @@ This code generates resources for each entity in [RECENTS_ENTITIES](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive/settings.py), stores them in endpoints_resources, and then loads data from each endpoint to the destination. -```python +```py endpoints_resources = {} for entity, resource_name in RECENTS_ENTITIES.items(): endpoints_resources[resource_name] = dlt.resource( @@ -186,7 +186,7 @@ for entity, resource_name in RECENTS_ENTITIES.items(): This function gets the participants of deals from the Pipedrive API and yields the result. -```python +```py def pipedrive_source(args): # Rest of function yield endpoints_resources["deals"] | dlt.transformer( @@ -209,7 +209,7 @@ further processing or loading. This function preserves the mapping of custom fields across different pipeline runs. It is used to create and store a mapping of custom fields for different entities in the source state. -```python +```py @dlt.resource(selected=False) def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]: def _get_pages_for_rename( @@ -238,7 +238,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="pipedrive", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -251,7 +251,7 @@ verified source. 1. 
To print source info: - ```python + ```py pipedrive_data = pipedrive_source() #print source info print(pipedrive_data) @@ -263,7 +263,7 @@ verified source. 1. To load all the data in Pipedrive: - ```python + ```py load_data = pipedrive_source() # calls the source function load_info = pipeline.run(load_info) #runs the pipeline with selected source configuration print(load_info) @@ -271,7 +271,7 @@ verified source. 1. To load data from selected resources: - ```python + ```py #To load custom fields, include custom_fields_mapping for hash to name mapping. load_data = pipedrive_source().with_resources("products", "deals", "deals_participants", "custom_fields_mapping") load_info = pipeline.run(load_data) #runs the pipeline loading selected data @@ -280,7 +280,7 @@ verified source. 1. To load data from a start date: - ```python + ```py # Configure a source for 'activities' starting from the specified date. # The 'custom_fields_mapping' is incorporated to convert custom field hashes into their respective names. activities_source = pipedrive_source( diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index aa8fbe10d4..4167df7da6 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -63,7 +63,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init salesforce duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python salesforce_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `salesforce`, you may also use @@ -137,7 +137,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load users, user_role, opportunity, opportunity_line_item, account etc. data from Salesforce API. -```python +```py @dlt.source(name="salesforce") def salesforce_source( user_name: str = dlt.secrets.value, @@ -156,7 +156,7 @@ def salesforce_source( This resource function retrieves records from the Salesforce "User" endpoint. -```python +```py @dlt.resource(write_disposition="replace") def sf_user() -> Iterator[Dict[str, Any]]: yield from get_records(client, "User") @@ -176,7 +176,7 @@ the "user_role" endpoint. This resource function retrieves records from the Salesforce "Opportunity" endpoint in incremental mode. -```python +```py @dlt.resource(write_disposition="merge") def opportunity( last_timestamp: Incremental[str] = dlt.sources.incremental( @@ -215,7 +215,7 @@ To create your data pipeline using single loading and 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="salesforce_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -228,7 +228,7 @@ To create your data pipeline using single loading and 1. To load data from all the endpoints, use the `salesforce_source` method as follows: - ```python + ```py load_data = salesforce_source() source.schema.merge_hints({"not_null": ["id"]}) # Hint for id field not null load_info = pipeline.run(load_data) @@ -241,7 +241,7 @@ To create your data pipeline using single loading and 1. To use the method `pipeline.run()` to load custom endpoints ā€œcandidatesā€ and ā€œmembersā€: - ```python + ```py load_info = pipeline.run(load_data.with_resources("opportunity", "contact")) # print the information on data that was loaded print(load_info) @@ -260,7 +260,7 @@ To create your data pipeline using single loading and 1. To load data from the ā€œcontactā€ in replace mode and ā€œtaskā€ incrementally merge mode endpoints: - ```python + ```py load_info = pipeline.run(load_data.with_resources("contact", "task")) # pretty print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index 09dc392c87..7beb43d210 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -61,7 +61,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init shopify_dlt duckdb ``` @@ -125,16 +125,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python shopify_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also @@ -152,7 +152,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load products, orders, and customers data from Shopify API. -```python +```py def shopify_source( private_app_password: str = dlt.secrets.value, api_version: str = DEFAULT_API_VERSION, @@ -188,7 +188,7 @@ incremental loading if unspecified. This resource loads products from your Shopify shop into the destination. It supports incremental loading and pagination. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def products( updated_at: dlt.sources.incremental[ @@ -212,7 +212,7 @@ support incremental loading and pagination. ### Resource `shopify_partner_query`: This resource can be used to run custom GraphQL queries to load paginated data. -```python +```py @dlt.resource def shopify_partner_query( query: str, @@ -251,7 +251,7 @@ verified source. 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="shopify", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -264,7 +264,7 @@ verified source. 1. To load data from "products", "orders" and "customers" from 1st Jan 2023. - ```python + ```py # Add your desired resources to the list... resources = ["products", "orders", "customers"] start_date="2023-01-01" @@ -278,7 +278,7 @@ verified source. minimizes potential failure during large data loads. Running chunks and incremental loads in parallel accelerates the initial load. - ```python + ```py # Load all orders from 2023-01-01 to now min_start_date = current_start_date = pendulum.datetime(2023, 1, 1) max_end_date = pendulum.now() @@ -310,7 +310,7 @@ verified source. print(load_info) ``` 1. To load the first 10 transactions via GraphQL query from the Shopify Partner API. - ```python + ```py # Construct query to load transactions 100 per page, the `$after` variable is used to paginate query = """query Transactions($after: String) { transactions(after: $after, first: 10) { diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 647e39a427..4f2b228a2e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init slack duckdb ``` @@ -107,20 +107,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python slack_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -138,7 +138,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage It retrieves data from Slack's API and fetches the Slack data such as channels, messages for selected channels, users, logs. -```python +```py @dlt.source(name="slack", max_table_nesting=2) def slack_source( page_size: int = MAX_PAGE_SIZE, @@ -163,7 +163,7 @@ def slack_source( This function yields all the channels data as a `dlt` resource. -```python +```py @dlt.resource(name="channels", primary_key="id", write_disposition="replace") def channels_resource() -> Iterable[TDataItem]: ``` @@ -172,7 +172,7 @@ def channels_resource() -> Iterable[TDataItem]: This function yields all the users data as a `dlt` resource. -```python +```py @dlt.resource(name="users", primary_key="id", write_disposition="replace") def users_resource() -> Iterable[TDataItem]: ``` @@ -181,7 +181,7 @@ def users_resource() -> Iterable[TDataItem]: This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with the channel's name. -```python +```py def get_messages_resource( channel_data: Dict[str, Any], created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental( @@ -209,7 +209,7 @@ def get_messages_resource( This method retrieves access logs from the Slack API. 
-```python +```py @dlt.resource( name="access_logs", selected=False, @@ -235,7 +235,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="slack", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -244,7 +244,7 @@ verified source. ``` 1. To load Slack resources from the specified start date: - ```python + ```py source = slack_source(page_size=1000, start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8)) # Enable below to load only 'access_logs', available for paid accounts only. @@ -258,7 +258,7 @@ verified source. 1. To load data from selected Slack channels from the specified start date: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. @@ -275,7 +275,7 @@ verified source. 1. To load only messages from selected Slack resources: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 67965863ce..7a72af48b2 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -58,7 +58,7 @@ The database above doesn't require a password. The connection URL can be broken down into: -```python +```py connection_url = "connection_string = f"{drivername}://{username}:{password}@{host}:{port}/{database}" ``` @@ -116,7 +116,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init sql_database duckdb ``` @@ -158,7 +158,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. You can also pass credentials in the pipeline script the following way: - ```python + ```py credentials = ConnectionStringCredentials( "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ) @@ -176,19 +176,19 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Install the necessary dependencies by running the following command: - ```bash + ```shell pip install -r requirements.txt ``` 1. Run the verified source by entering: - ```bash + ```shell python sql_database_pipeline.py ``` 1. Make sure that everything is loaded as expected with: - ```bash + ```shell dlt pipeline show ``` @@ -208,7 +208,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage This function loads data from an SQL database via SQLAlchemy and auto-creates resources for each table or from a specified list of tables. -```python +```py @dlt.source def sql_database( credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, @@ -244,7 +244,7 @@ remove certain columns to be selected. This function loads data from specific database tables. -```python +```py @dlt.common.configuration.with_config( sections=("sources", "sql_database"), spec=SqlTableResourceConfiguration ) @@ -284,7 +284,7 @@ certain range. 1. Consider a table with a `last_modified` timestamp column. By setting this column as your cursor and specifying an initial value, the loader generates a SQL query filtering rows with `last_modified` values greater than the specified initial value. 
- ```python + ```py from sql_database import sql_table from datetime import datetime @@ -303,7 +303,7 @@ certain range. 1. To incrementally load the "family" table using the sql_database source method: - ```python + ```py source = sql_database().with_resources("family") #using the "updated" field as an incremental field using initial value of January 1, 2022, at midnight source.family.apply_hints(incremental=dlt.sources.incremental("updated"),initial_value=pendulum.DateTime(2022, 1, 1, 0, 0, 0)) @@ -315,7 +315,7 @@ certain range. 1. To incrementally load the "family" table using the 'sql_table' resource. - ```python + ```py family = sql_table( table="family", incremental=dlt.sources.incremental( @@ -342,7 +342,7 @@ When running on Airflow ### Parallel extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. -```python +```py database = sql_database().parallelize() table = sql_table().parallelize() ``` @@ -358,7 +358,7 @@ To create your own pipeline, use source and resource methods from this verified 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="rfam", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -370,7 +370,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To load the entire database, use the `sql_database` source as: - ```python + ```py source = sql_database() info = pipeline.run(source, write_disposition="replace") print(info) @@ -378,7 +378,7 @@ To create your own pipeline, use source and resource methods from this verified 1. If you just need the "family" table, use: - ```python + ```py source = sql_database().with_resources("family") #running the pipeline info = pipeline.run(source, write_disposition="replace") @@ -389,7 +389,7 @@ To create your own pipeline, use source and resource methods from this verified [documentation](https://dlthub.com/docs/general-usage/customising-pipelines/pseudonymizing_columns). As an example, here's how to pseudonymize the "rfam_acc" column in the "family" table: - ```python + ```py import hashlib def pseudonymize_name(doc): @@ -421,7 +421,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To exclude columns, such as the "rfam_id" column from the "family" table before loading: - ```python + ```py def remove_columns(doc): del doc["rfam_id"] return doc diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index 4ddf20aa78..3e9e43f0c4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init strapi duckdb ``` @@ -73,7 +73,7 @@ For more information, read the guide on [how to add a verified source](../../wal information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py # put your secret values and credentials here. do not share this file and do not push it to github [sources.strapi] api_secret_key = "api_secret_key" # please set me up! 
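
For orientation: once `api_secret_key` is stored under `[sources.strapi]` as shown above, a pipeline script never needs to hardcode it. The snippet below is a minimal sketch, not part of the Strapi source itself; it only assumes the `secrets.toml` layout from this hunk and uses the dictionary-style `dlt.secrets` access described later in this document to read the value back as a quick sanity check (in a normal script you would simply call the source and let `dlt` inject the secret).

```py
import dlt

# dlt.secrets supports dictionary-like access using the same dotted path
# as the section/key layout in .dlt/secrets.toml
api_secret_key = dlt.secrets["sources.strapi.api_secret_key"]

# confirm the secret is available without hardcoding it in the script
print("Strapi API key configured:", bool(api_secret_key))
```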
@@ -96,13 +96,13 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python strapi_pipeline.py ``` @@ -113,7 +113,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -131,7 +131,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrives data from Strapi. -```python +```py @dlt.source def strapi_source( endpoints: List[str], @@ -155,7 +155,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="strapi", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -165,7 +165,7 @@ verified source. 1. To load the specified endpoints: - ```python + ```py endpoints = ["athletes"] load_data = strapi_source(endpoints=endpoints) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 0b172dc3be..ef98310127 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init stripe_analytics duckdb ``` @@ -96,20 +96,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python stripe_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -127,7 +127,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug You can write your own pipelines to load data to a destination using this verified source. However, it is important to note is how the `ENDPOINTS` and `INCREMENTAL_ENDPOINTS` tuples are defined in `stripe_analytics/settings.py`. -```python +```py # The most popular Stripe API's endpoints ENDPOINTS = ("Subscription", "Account", "Coupon", "Customer", "Product", "Price") # Possible incremental endpoints @@ -140,7 +140,7 @@ INCREMENTAL_ENDPOINTS = ("Event", "Invoice", "BalanceTransaction") This function retrieves data from the Stripe API for the specified endpoint: -```python +```py @dlt.source def stripe_source( endpoints: Tuple[str, ...] = ENDPOINTS, @@ -159,7 +159,7 @@ def stripe_source( This source loads data in 'append' mode from incremental endpoints. -```python +```py @dlt.source def incremental_stripe_source( endpoints: Tuple[str, ...] 
= INCREMENTAL_ENDPOINTS, @@ -183,7 +183,7 @@ For more information, read the [General Usage: Incremental loading](../../genera This function loads a dictionary with calculated metrics, including MRR and Churn rate, along with the current timestamp. -```python +```py @dlt.resource(name="Metrics", write_disposition="append", primary_key="created") def metrics_resource() -> Iterable[TDataItem]: ``` @@ -203,7 +203,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="stripe_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -213,7 +213,7 @@ verified source. 1. To load endpoints like "Plan" and "Charge" in replace mode, retrieve all data for the year 2022: - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 1, 1), @@ -225,7 +225,7 @@ verified source. 1. To load data from the "Invoice" endpoint, which has static data, using incremental loading: - ```python + ```py # Load all data on the first run that was created after start_date and before end_date source_incremental = incremental_stripe_source( endpoints=("Invoice", ), @@ -239,7 +239,7 @@ verified source. 1. To load data created after December 31, 2022, adjust the data range for stripe_source to prevent redundant loading. For incremental_stripe_source, the initial_start_date will auto-update to the last loaded date from the previous run. - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 12, 31), @@ -254,7 +254,7 @@ verified source. 1. To load important metrics and store them in database: - ```python + ```py # Event is an endpoint with uneditable data, so we can use 'incremental_stripe_source'. source_event = incremental_stripe_source(endpoints=("Event",)) # Subscription is an endpoint with editable data, use stripe_source. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 8701db7db8..458c17b339 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -65,7 +65,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```shell dlt init workable duckdb ``` @@ -117,20 +117,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python workable_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -146,7 +146,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug Note the default definitions of DEFAULT_ENDPOINTS and DEFAULT_DETAILS in "workable/settings.py". -```python +```py DEFAULT_ENDPOINTS = ("members", "recruiters", "stages", "requisitions", "jobs", "custom_attributes","events") DEFAULT_DETAILS = { @@ -164,7 +164,7 @@ endpoints allow incremental 'merge' mode loading. 
This source returns a sequence of dltResources that correspond to the endpoints. -```python +```py @dlt.source(name="workable") def workable_source( access_token: str = dlt.secrets.value, @@ -187,7 +187,7 @@ def workable_source( This function is used to retrieve "candidates" endpoints. -```python +```py @dlt.resource(name="candidates", write_disposition="merge", primary_key="id") def candidates_resource( updated_at: Optional[Any] = dlt.sources.incremental( @@ -211,7 +211,7 @@ To create your data pipeline using single loading and 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="workable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -221,7 +221,7 @@ To create your data pipeline using single loading and 1. To load all data: - ```python + ```py load_data = workable_source() load_info = pipeline.run(load_data) print(load_info) @@ -232,7 +232,7 @@ To create your data pipeline using single loading and 1. To load data from a specific date, including dependent endpoints: - ```python + ```py load_data = workable_source(start_date=datetime(2022, 1, 1), load_details=True) load_info = pipeline.run(load_data) print(load_info) @@ -244,7 +244,7 @@ To create your data pipeline using single loading and 1. To load custom endpoints ā€œcandidatesā€ and ā€œmembersā€: - ```python + ```py load_info = pipeline.run(load_data.with_resources("candidates", "members") # print the information on data that was loaded print(load_info) @@ -255,7 +255,7 @@ To create your data pipeline using single loading and 1. To load data from the ā€œjobsā€ endpoint and its dependent endpoints like "activities" and "application_form": - ```python + ```py load_data = workable_source(start_date=datetime(2022, 2, 1), load_details=True) # Set the load_details as True to load all the dependent endpoints. load_info = pipeline.run(load_data.with_resources("jobs","jobs_activities","jobs_application_form")) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index 234483dca0..ef63ee0c12 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -84,7 +84,7 @@ Here's a summarized version: 1. To get full token using the client id obtained above, you can follow the [instructions here.](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#creating-the-access-token) - ```curl + ```shell curl https://{subdomain}.zendesk.com/api/v2/oauth/tokens.json \ -X POST \ -v -u {email_address}:{password} \ @@ -129,7 +129,7 @@ To generate Zendesk chat OAuth token, please refer to this 1. Record the "CLIENT_ID" and "SUBDOMAIN". 1. Format the below URL with your own CLIENT_ID and SUBDOMAIN, paste it into a new browser tab, and press Enter. - ```bash + ```shell https://www.zopim.com/oauth2/authorizations/new?response_type=token&client_id=CLIENT_ID&scope=read%20write&subdomain=SUBDOMAIN ``` 1. The call will be made, possibly asking you to log in and select 'Allow' to generate the token. @@ -160,7 +160,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```shell dlt init zendesk duckdb ``` @@ -183,7 +183,7 @@ For more information, read the guide on [how to add a verified source.](../../wa information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py #Zendesk support credentials [sources.zendesk.credentials] subdomain = "subdomain" # Zendesk subdomain @@ -215,20 +215,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```shell pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```shell python zendesk_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```shell dlt pipeline show ``` @@ -246,7 +246,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves data from Zendesk Talk for phone calls and voicemails. -```python +```py @dlt.source(max_table_nesting=2) def zendesk_talk( credentials: TZendeskCredentials = dlt.secrets.value, @@ -266,7 +266,7 @@ run. This function loads data from Zendesk talk endpoint. -```python +```py def talk_resource( zendesk_client: ZendeskAPIClient, talk_endpoint_name: str, @@ -305,7 +305,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -315,7 +315,7 @@ verified source. 1. To load data related to support, talk and chat: - ```python + ```py #zendesk support source function data_support = zendesk_support(load_all=True) # zendesk chat source function @@ -329,7 +329,7 @@ verified source. 1. To load data related to support, chat and talk in incremental mode: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -350,7 +350,7 @@ verified source. 1. To load historical data in weekly ranges from Jan 1st, 2023, then switch to incremental loading for new tickets. - ```python + ```py # Load ranges of dates to load between January 1st 2023 and today min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") max_end_date = pendulum.today() diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index c61805423b..823c1cecad 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -12,7 +12,7 @@ To do so, run the [cli command](../../reference/command-line-interface.md#show-t below with your pipeline name. 
The pipeline name is the name of the Python file where your pipeline is defined and also displayed in your terminal when loading: -```bash +```shell dlt pipeline {pipeline_name} show ``` @@ -33,7 +33,7 @@ pipeline and hide many intricacies of correctly setting up the connection to you Execute any SQL query and get results following the Python [dbapi](https://peps.python.org/pep-0249/) spec. Below we fetch data from the customers table: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: with client.execute_query( @@ -54,7 +54,7 @@ natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to tha frames may be really fast! The example below reads GitHub reactions data from the `issues` table and counts reaction types. -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", @@ -79,7 +79,7 @@ The native connection to your destination like BigQuery `Client` or DuckDB `Duck available in case you want to do anything special. Below we take the native connection to `duckdb` to get `DuckDBPyRelation` from a query: -```python +```py import dlt import duckdb diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index d80558e745..ac305e943b 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -179,7 +179,7 @@ def load_data_with_retry(pipeline, data): :::warning To run this example you need to provide Slack incoming hook in `.dlt/secrets.toml`: -```python +```py [runtime] slack_incoming_hook="https://hooks.slack.com/services/***" ``` diff --git a/docs/website/docs/examples/qdrant_zendesk/index.md b/docs/website/docs/examples/qdrant_zendesk/index.md index 7920619b26..2bd71e3f6c 100644 --- a/docs/website/docs/examples/qdrant_zendesk/index.md +++ b/docs/website/docs/examples/qdrant_zendesk/index.md @@ -28,7 +28,7 @@ First, configure the destination credentials for [Qdrant](https://dlthub.com/doc Next, make sure you have the following dependencies installed: -```commandline +```shell pip install qdrant-client>=1.6.9 pip install fastembed>=0.1.1 ``` @@ -176,7 +176,7 @@ The query above gives stores the following results in the `response` variable: QueryResponse(id='ce2f1c5c-41c3-56c3-a31d-2399a7a9239d', embedding=None, metadata={'_dlt_id': 'ZMuFJZo0AJxV4A', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-03-14T10:52:28+00:00', 'custom_status_id': 12765028278545, 'description': 'X', 'from_messaging_channel': False, 'generated_timestamp': 1696163084, 'group_id': 12765036328465, 'has_incidents': False, 'id': 2, 'is_public': True, 'priority': 'high', 'raw_subject': 'SCRUBBED', 'requester_id': 13726460510097, 'status': 'deleted', 'subject': 'SCRUBBED', 'submitter_id': 12765072569105, 'tags': [], 'ticket_form_id': 13726337882769, 'type': 'question', 'updated_at': '2023-09-01T12:10:35+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/2.json', 'via__channel': 'web'}, document='', score=0.8467072)] ``` To get a closer look at what the Zendesk ticket was, and how dlt dealt with it, we can index into the metadata of the first `QueryResponse` object: -```json lines +```json {'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, diff --git 
a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index 860370d38a..2bc67f03b8 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -38,7 +38,7 @@ providers. ### Example -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index 07e56b3e14..3c8134855b 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -21,7 +21,7 @@ service account credentials, while `ConnectionStringCredentials` handles databas As an example, let's use `ConnectionStringCredentials` which represents a database connection string. -```python +```py from dlt.sources.credentials import ConnectionStringCredentials @dlt.source @@ -60,7 +60,7 @@ dsn.password="loader" You can explicitly provide credentials in various forms: -```python +```py query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") # or query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"...}) @@ -70,7 +70,7 @@ query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader".. We have some ready-made credentials you can reuse: -```python +```py from dlt.sources.credentials import ConnectionStringCredentials from dlt.sources.credentials import OAuth2Credentials from dlt.sources.credentials import GcpServiceAccountCredentials, GcpOAuthCredentials @@ -87,7 +87,7 @@ and additional query parameters. This class provides methods for parsing and generating connection strings. #### Usage -```python +```py credentials = ConnectionStringCredentials() # Set the necessary attributes @@ -117,7 +117,7 @@ client secret, refresh token, and access token. It also allows for the addition of scopes and provides methods for client authentication. Usage: -```python +```py credentials = OAuth2Credentials( client_id="CLIENT_ID", client_secret="CLIENT_SECRET", @@ -153,7 +153,7 @@ This class provides methods to retrieve native credentials for Google clients. - You may just pass the `service.json` as string or dictionary (in code and via config providers). - Or default credentials will be used. -```python +```py credentials = GcpServiceAccountCredentials() # Parse a native value (ServiceAccountCredentials) # Accepts a native value, which can be either an instance of ServiceAccountCredentials @@ -163,7 +163,7 @@ native_value = {"private_key": ".."} # or "path/to/services.json" credentials.parse_native_representation(native_value) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpServiceAccountCredentials @@ -204,7 +204,7 @@ serialized OAuth client secrets JSON. This class provides methods for authentication and obtaining access tokens. 
##### Usage -```python +```py oauth_credentials = GcpOAuthCredentials() # Accepts a native value, which can be either an instance of GoogleOAuth2Credentials @@ -214,7 +214,7 @@ native_value_oauth = {"client_secret": ...} oauth_credentials.parse_native_representation(native_value_oauth) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpOAuthCredentials @@ -277,7 +277,7 @@ It inherits the ability to manage default credentials and extends it with method for handling partial credentials and converting credentials to a botocore session. #### Usage -```python +```py credentials = AwsCredentials() # Set the necessary attributes credentials.aws_access_key_id = "ACCESS_KEY_ID" @@ -285,7 +285,7 @@ credentials.aws_secret_access_key = "SECRET_ACCESS_KEY" credentials.region_name = "us-east-1" ``` or -```python +```py # Imports an external boto3 session and sets the credentials properties accordingly. import botocore.session @@ -295,7 +295,7 @@ credentials.parse_native_representation(session) print(credentials.aws_access_key_id) ``` or more preferred use: -```python +```py @dlt.source def aws_readers( bucket_url: str = dlt.config.value, @@ -340,14 +340,14 @@ handling partial credentials and converting credentials to a format suitable for interacting with Azure Blob Storage using the adlfs library. #### Usage -```python +```py credentials = AzureCredentials() # Set the necessary attributes credentials.azure_storage_account_name = "ACCOUNT_NAME" credentials.azure_storage_account_key = "ACCOUNT_KEY" ``` or more preferred use: -```python +```py @dlt.source def azure_readers( bucket_url: str = dlt.config.value, @@ -388,7 +388,7 @@ decorated function. Example: -```python +```py @dlt.source def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in `credentials` argument @@ -432,7 +432,7 @@ This is used a lot in the `dlt` core and may become useful for complicated sourc In fact, for each decorated function a spec is synthesized. In case of `google_sheets` following class is created: -```python +```py from dlt.sources.config import configspec, with_config @configspec diff --git a/docs/website/docs/general-usage/credentials/configuration.md b/docs/website/docs/general-usage/credentials/configuration.md index 9b2d392883..38f054c654 100644 --- a/docs/website/docs/general-usage/credentials/configuration.md +++ b/docs/website/docs/general-usage/credentials/configuration.md @@ -25,7 +25,7 @@ When done right you'll be able to run the same pipeline script during developmen In the example below, the `google_sheets` source function is used to read selected tabs from Google Sheets. It takes several arguments that specify the spreadsheet, the tab names and the Google credentials to be used when extracting data. -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, @@ -68,14 +68,14 @@ You are free to call the function above as usual and pass all the arguments in t Instead let `dlt` to do the work and leave it to [injection mechanism](#injection-mechanism) that looks for function arguments in the config files or environment variables and adds them to your explicit arguments during a function call. Below are two most typical examples: 1. 
Pass spreadsheet id and tab names in the code, inject credentials from the secrets: - ```python + ```py data_source = google_sheets("23029402349032049", ["tab1", "tab2"]) ``` `credentials` value will be injected by the `@source` decorator (e.g. from `secrets.toml`). `spreadsheet_id` and `tab_names` take values from the call arguments. 2. Inject all the arguments from config / secrets - ```python + ```py data_source = google_sheets() ``` `credentials` value will be injected by the `@source` decorator (e.g. from **secrets.toml**). @@ -103,7 +103,7 @@ Where do the configs and secrets come from? By default, `dlt` looks in two **con ``` Note that **credentials** will be evaluated as dictionary containing **client_email**, **private_key** and **project_id** as keys. It is standard TOML behavior. - [Environment Variables](config_providers#environment-provider): - ```python + ```py CREDENTIALS= SPREADSHEET_ID=1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580 TAB_NAMES=tab1,tab2 @@ -123,7 +123,7 @@ There are many ways you can organize your configs and secrets. The example above ### Do not hardcode secrets You should never do that. Sooner or later your private key will leak. -```python +```py # WRONG!: # provide all values directly - wrong but possible. # secret values should never be present in the code! @@ -137,7 +137,7 @@ data_source = google_sheets( ### Pass secrets in code from external providers You can get the secret values from your own providers. Below we take **credentials** for our `google_sheets` source from Airflow base hook: -```python +```py from airflow.hooks.base_hook import BaseHook # get it from airflow connections or other credential store @@ -163,7 +163,7 @@ Doing so provides several benefits: 1. You can request [built-in and custom credentials](config_specs.md) (i.e. connection strings, AWS / GCP / Azure credentials). 1. You can specify a set of possible types via `Union` i.e. OAuth or API Key authorization. -```python +```py @dlt.source def google_sheets( spreadsheet_id: str = dlt.config.value, @@ -189,7 +189,7 @@ In case of `GcpServiceAccountCredentials`: ## Read configs and secrets yourself `dlt.secrets` and `dlt.config` provide dictionary-like access to configuration values and secrets, respectively. -```python +```py # use `dlt.secrets` and `dlt.config` to explicitly take # those values from providers from the explicit keys data_source = google_sheets( @@ -202,14 +202,14 @@ data_source.run(destination="bigquery") ``` `dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](#injection-mechanism) - TOML files, env variables etc. just like it does with the standard section layout. You can also use `dlt.config.get()` or `dlt.secrets.get()` to request value cast to a desired type. For example: -```python +```py credentials = dlt.secrets.get("my_section.gcp_credentials", GcpServiceAccountCredentials) ``` Creates `GcpServiceAccountCredentials` instance out of values (typically a dictionary) under **my_section.gcp_credentials** key. ### Write configs and secrets in code **dlt.config** and **dlt.secrets** can be also used as setters. 
For example: -```python +```py dlt.config["sheet_id"] = "23029402349032049" dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra ``` diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md index 3f665bd0fb..ba0b13636b 100644 --- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md @@ -11,7 +11,7 @@ consistently achieve the same mapping. If instead you wish to anonymize, you can replace it with a constant. In the example below, we create a dummy source with a PII column called "name", which we replace with deterministic hashes (i.e. replacing the German umlaut). -```python +```py import dlt import hashlib diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md index 8493ffaec5..3163062ced 100644 --- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -14,7 +14,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Create a source function that creates dummy data as follows: - ```python + ```py import dlt # This function creates a dummy data source. @@ -31,7 +31,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, create a function to filter out columns from the data before loading it into a database as follows: - ```python + ```py from typing import Dict, List, Optional def remove_columns(doc: Dict, remove_columns: Optional[List[str]] = None) -> Dict: @@ -53,7 +53,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, declare the columns to be removed from the table, and then modify the source as follows: - ```python + ```py # Example columns to remove: remove_columns_list = ["country_code"] @@ -67,7 +67,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. ``` 1. You can optionally inspect the result: - ```python + ```py for row in data_source: print(row) #{'id': 0, 'name': 'Jane Washington 0'} @@ -77,7 +77,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. At last, create a pipeline: - ```python + ```py # Integrating with a DLT pipeline pipeline = dlt.pipeline( pipeline_name='example', diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md index e58dae6d9d..04e4d33b13 100644 --- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md @@ -12,7 +12,7 @@ In the example below, we create a dummy source with special characters in the na function that we intend to apply to the resource to modify its output (i.e. replacing the German umlaut): `replace_umlauts_in_dict_keys`. 
-```python +```py import dlt # create a dummy source with umlauts (special characters) in key names (um) diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index 6b09510f68..e07be44c26 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -77,7 +77,7 @@ currency_conversion_enrichment/ 1. Here's the resource that yields the sample data as discussed above: - ```python + ```py @dlt.resource() def enriched_data_part_two(): data_enrichment_part_one = [ @@ -113,14 +113,14 @@ API token. information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #ExchangeRate-API key ``` 1. Create the `converted_amount` function as follows: - ```python + ```py # @transformer(data_from=enriched_data_part_two) def converted_amount(record): """ @@ -210,7 +210,7 @@ API token. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_two", @@ -229,7 +229,7 @@ API token. To do so, you need to add the transformer decorator at the top of the `converted_amount` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( enriched_data_part_two | converted_amount, diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index f4578d065f..b99ce9efff 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -73,7 +73,7 @@ understanding, you may explore all three enrichments sequentially in the noteboo Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```py url_parser_enrichment/ ā”œā”€ā”€ .dlt/ ā”‚ ā””ā”€ā”€ secrets.toml @@ -100,7 +100,7 @@ Let's examine a synthetic dataset created for this article. It includes: Here's the resource that yields the sample data as discussed above: -```python +```py import dlt @dlt.resource(write_disposition="append") @@ -143,7 +143,7 @@ We use a free service called [URL Parse API](https://urlparse.com/), to parse th need to register to use this service neither get an API key. 1. Create a `url_parser` function as follows: - ```python + ```py # @dlt.transformer(data_from=tracked_data) def url_parser(record): """ @@ -195,7 +195,7 @@ need to register to use this service neither get an API key. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_three", @@ -214,7 +214,7 @@ need to register to use this service neither get an API key. do so, you need to add the transformer decorator at the top of the `url_parser` function. 
For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | url_parser, diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 8b33a852a8..d4b60c9258 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -41,7 +41,7 @@ Here's the link to the notebook: ### B. Create a pipeline Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```py user_device_enrichment/ ā”œā”€ā”€ .dlt/ ā”‚ ā””ā”€ā”€ secrets.toml @@ -67,7 +67,7 @@ user_device_enrichment/ Here's the resource that yields the sample data as discussed above: - ```python + ```py import dlt @dlt.resource(write_disposition="append") @@ -118,7 +118,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #Serp Api key. ``` @@ -126,7 +126,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Replace the value of the `api_key`. 1. Create `fetch_average_price()` function as follows: - ```python + ```py import datetime import requests @@ -247,7 +247,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_one", @@ -266,7 +266,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the do so, you need to add the transformer decorator at the top of the `fetch_average_price` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | fetch_average_price, diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 4651d156f0..320d0664f5 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -13,7 +13,7 @@ that are not selected while performing a full load will not replace any data in To perform a full load on one or more of your resources, choose the `write_disposition='replace'` for this resource: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 144b176332..e0fe93df8a 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -64,7 +64,7 @@ child tables. 
Example below loads all the GitHub events and updates them in the destination using "id" as primary key, making sure that only a single copy of event is present in `github_repo_events` table: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -72,14 +72,14 @@ def github_repo_events(): You can use compound primary keys: -```python +```py @dlt.resource(primary_key=("id", "url"), write_disposition="merge") ... ``` By default, `primary_key` deduplication is arbitrary. You can pass the `dedup_sort` column hint with a value of `desc` or `asc` to influence which record remains after deduplication. Using `desc`, the records sharing the same `primary_key` are sorted in descending order before deduplication, making sure the record with the highest value for the column with the `dedup_sort` hint remains. `asc` has the opposite behavior. -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", @@ -91,7 +91,7 @@ By default, `primary_key` deduplication is arbitrary. You can pass the `dedup_so Example below merges on a column `batch_day` that holds the day for which given record is valid. Merge keys also can be compound: -```python +```py @dlt.resource(merge_key="batch_day", write_disposition="merge") def get_daily_batch(day): yield _get_batch_from_bucket(day) @@ -101,7 +101,7 @@ As with any other write disposition you can use it to load data ad hoc. Below we top reactions for `duckdb` repo. The lists have, obviously, many overlapping issues, but we want to keep just one instance of each. -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] @@ -117,7 +117,7 @@ Example below dispatches GitHub events to several tables by event type, keeps on by "id" and skips loading of past records using "last value" incremental. As you can see, all of this we can just declare in our resource. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) def github_repo_events(last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): """A resource taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. Loads incrementally by 'created_at' """ @@ -134,7 +134,7 @@ Each record in the destination table with the same `primary_key` or `merge_key` Deletes are propagated to any child table that might exist. For each record that gets deleted in the root table, all corresponding records in the child table(s) will also be deleted. Records in parent and child tables are linked through the `root key` that is explained in the next section. #### Example: with primary key and boolean delete column -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", @@ -157,7 +157,7 @@ def resource(): ``` #### Example: with merge key and non-boolean delete column -```python +```py @dlt.resource( merge_key="id", write_disposition="merge", @@ -175,7 +175,7 @@ def resource(): ``` #### Example: with primary key and "dedup_sort" hint -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", @@ -204,7 +204,7 @@ tables. This concept is similar to foreign key which references a parent table, set. We do not enable it everywhere because it takes storage space. Nevertheless, is some cases you may want to permanently enable root key propagation. 
-```python +```py pipeline = dlt.pipeline( pipeline_name='facebook_insights', destination='duckdb', @@ -243,7 +243,7 @@ Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor duplicates and managing the state with last values of cursor. Take a look at GitHub example below, where we request recently created issues. -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -280,7 +280,7 @@ In the example below we incrementally load the GitHub events, where API does not let us filter for the newest events - it always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the duplicates and past issues. -```python +```py # use naming function in table name to generate separate tables for each event @dlt.resource(primary_key="id", table_name=lambda i: i['type']) # type: ignore def repo_events( @@ -309,7 +309,7 @@ and lets you select nested and complex data (including the whole data item when Example below creates last value which is a dictionary holding a max `created_at` value for each created table name: -```python +```py def by_event_type(event): last_value = None if len(event) == 1: @@ -333,7 +333,7 @@ def get_events(last_created_at = dlt.sources.incremental("$", last_value_func=by ### Using `end_value` for backfill You can specify both initial and end dates when defining incremental loading. Let's go back to our Github example: -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -354,7 +354,7 @@ Please note that when `end_date` is specified, `dlt` **will not modify the exist To define specific ranges to load, you can simply override the incremental argument in the resource, for example: -```python +```py july_issues = repo_issues( created_at=dlt.sources.incremental( initial_value='2022-07-01T00:00:00Z', end_value='2022-08-01T00:00:00Z' @@ -399,7 +399,7 @@ The github events example is exactly such case. The results are ordered on curso In the same fashion the `row_order` can be used to **optimize backfill** so we don't continue making unnecessary API requests after the end of range is reached. For example: -```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -432,7 +432,7 @@ incremental and exit yield loop when true. The `dlt.sources.incremental` instance provides `start_out_of_range` and `end_out_of_range` attributes which are set when the resource yields an element with a higher/lower cursor value than the initial or end values. If you do not want `dlt` to stop processing automatically and instead to handle such events yourself, do not specify `row_order`: -```python +```py @dlt.transformer(primary_key="id") def tickets( zendesk_client, @@ -472,7 +472,7 @@ deduplicate and which does not become a table hint. The same setting lets you di deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to `incremental` to disable deduplication. That overrides `delta` primary_key set in the resource: -```python +```py @dlt.resource(primary_key="delta") # disable the unique value check by passing () as primary key to incremental def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())): @@ -485,7 +485,7 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) When resources are [created dynamically](source.md#create-resources-dynamically) it is possible to use `dlt.sources.incremental` definition as well. 
-```python +```py @dlt.source def stripe(): # declare a generator function @@ -521,7 +521,7 @@ result in `IncrementalUnboundError` exception. ### Using Airflow schedule for backfill and incremental loading When [running in Airflow task](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file), you can opt-in your resource to get the `initial_value`/`start_value` and `end_value` from Airflow schedule associated with your DAG. Let's assume that **Zendesk tickets** resource contains a year of data with thousands of tickets. We want to backfill the last year of data week by week and then continue incremental loading daily. -```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -540,7 +540,7 @@ We opt-in to Airflow scheduler by setting `allow_external_schedulers` to `True`: 2. In all other environments, the `incremental` behaves as usual, maintaining `dlt` state. Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the dag: -```python +```py @dag( schedule_interval='@weekly', start_date=pendulum.datetime(2023, 2, 1), @@ -577,7 +577,7 @@ When you enable the DAG in Airflow, it will generate several runs and start exec subsequent weekly intervals starting with `2023-02-12, 00:00:00 UTC` to `2023-02-19, 00:00:00 UTC`. You can repurpose the DAG above to start loading new data incrementally after (or during) the backfill: -```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), @@ -624,7 +624,7 @@ You may force a full refresh of a `merge` and `append` pipelines: Example: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="dataset_name") # do a full refresh p.run(merge_source(), write_disposition="replace") @@ -655,7 +655,7 @@ is loaded, the yielded resource data will be loaded at the same time with the up In the two examples below you see how the `dlt.sources.incremental` is working under the hood. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. If not exist, get None @@ -670,7 +670,7 @@ def tweets(): If we keep a list or a dictionary in the state, we can modify the underlying values in the objects, and thus we do not need to set the state back explicitly. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. 
If not exist, get None @@ -708,7 +708,7 @@ data twice - even if the user makes a mistake and requests the same months range In the following example, we initialize a variable with an empty list as a default: -```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, players, start_month=None, end_month=None): loaded_archives_cache = dlt.current.resource_state().setdefault("archives", []) @@ -734,7 +734,7 @@ def players_games(chess_url, players, start_month=None, end_month=None): ### Advanced state usage: tracking the last value for all search terms in Twitter API -```python +```py @dlt.resource(write_disposition="append") def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, start_time=None, end_time=None, last_value=None): headers = _headers(twitter_bearer_token) diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index 095e03e96d..53eca2e59a 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -15,7 +15,7 @@ Example: This pipeline will load a list of objects into `duckdb` table with a name "three": -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence") @@ -53,7 +53,7 @@ Arguments: Example: This pipeline will load the data the generator `generate_rows(10)` produces: -```python +```py import dlt def generate_rows(nr): @@ -110,7 +110,7 @@ pipeline run is progressing. `dlt` supports 4 progress monitors out of the box: You pass the progress monitor in `progress` argument of the pipeline. You can use a name from the list above as in the following example: -```python +```py # create a pipeline loading chess data that dumps # progress to stdout each 10 seconds (the default) pipeline = dlt.pipeline( @@ -123,7 +123,7 @@ pipeline = dlt.pipeline( You can fully configure the progress monitor. See two examples below: -```python +```py # log each minute to Airflow task logger ti = get_current_context()["ti"] pipeline = dlt.pipeline( @@ -134,7 +134,7 @@ pipeline = dlt.pipeline( ) ``` -```python +```py # set tqdm bar color to yellow pipeline = dlt.pipeline( pipeline_name="chess_pipeline", diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 9b8d45982d..f98907ef42 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -19,7 +19,7 @@ Commonly used arguments: Example: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(): for i in range(10): @@ -32,7 +32,7 @@ def source_name(): To get the data of a resource, we could do: -```python +```py for row in generate_rows(): print(row) @@ -57,7 +57,7 @@ accepts following arguments: `dlt` that column `tags` (containing a list of tags) in `user` table should have type `complex` which means that it will be loaded as JSON/struct and not as child table. - ```python + ```py @dlt.resource(name="user", columns={"tags": {"data_type": "complex"}}) def get_users(): ... @@ -82,7 +82,7 @@ You can alternatively use a [Pydantic](https://pydantic-docs.helpmanual.io/) mod For example: -```python +```py from pydantic import BaseModel @@ -119,7 +119,7 @@ Things to note: You can override this by configuring the Pydantic model -```python +```py from typing import ClassVar from dlt.common.libs.pydantic import DltConfig @@ -146,7 +146,7 @@ argument and the `table_name` string as a return value. 
For example, a resource that loads GitHub repository events wants to send `issue`, `pull request`, and `comment` events to separate tables. The type of the event is in the "type" field. -```python +```py # send item to a table with name item["type"] @dlt.resource(table_name=lambda event: event['type']) def repo_events() -> Iterator[TDataItems]: @@ -160,7 +160,7 @@ print(repo_events().table_schema({"type": "WatchEvent", id=...})) In more advanced cases, you can dispatch data to different tables directly in the code of the resource function: -```python +```py @dlt.resource def repo_events() -> Iterator[TDataItems]: # mark the "item" to be sent to table with name item["type"] @@ -172,7 +172,7 @@ def repo_events() -> Iterator[TDataItems]: You can add arguments to your resource functions like to any other. Below we parametrize our `generate_rows` resource to generate the number of rows we request: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -195,7 +195,7 @@ that returns a list of objects (i.e. users) in one endpoint and user details in with this by declaring a resource that obtains a list of users and another resource that receives items from the list and downloads the profiles. -```python +```py @dlt.resource(write_disposition="replace") def users(limit=None): for u in _get_users(limit): @@ -215,7 +215,7 @@ pipeline.run(user_details) ``` In the example above, `user_details` will receive data from default instance of `users` resource (with `limit` set to `None`). You can also use **pipe |** operator to bind resources dynamically -```python +```py # you can be more explicit and use a pipe operator. # with it you can create dynamic pipelines where the dependencies # are set at run time and resources are parametrized i.e. @@ -225,7 +225,7 @@ pipeline.run(users(limit=100) | user_details) :::tip Transformers are allowed not only to **yield** but also to **return** values and can decorate **async** functions and [**async generators**](../reference/performance.md#extract). Below we decorate an async function and request details on two pokemons. Http calls are made in parallel via httpx library. -```python +```py import dlt import httpx @@ -245,7 +245,7 @@ print(list([1,2] | pokemon())) A standalone resource is defined on a function that is top level in a module (not inner function) that accepts config and secrets values. Additionally if `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function and user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url=dlt.config.value): """list and yield files in `bucket_url`""" @@ -256,7 +256,7 @@ pipeline.run(filesystem("s3://my-bucket/reports"), table_name="reports") ``` Standalone may have dynamic name that depends on the arguments passed to the decorated function. For example:: -```python +```py @dlt.resource(standalone=True, name=lambda args: args["stream_name"]) def kinesis(stream_name: str): ... @@ -271,7 +271,7 @@ You can extract multiple resources in parallel threads or with async IO. 
To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: -```python +```py @dlt.resource(parallelized=True) def get_users(): for u in _get_users(): @@ -288,7 +288,7 @@ pipeline.run(get_users(), get_orders()) Async generators are automatically extracted concurrently with other resources: -```python +```py @dlt.resource async def get_users(): async for u in _get_users(): # Assuming _get_users is an async generator @@ -317,7 +317,7 @@ so: Here's our resource: -```python +```py import dlt @dlt.resource(write_disposition="replace") @@ -330,7 +330,7 @@ def users(): Here's our script that defines transformations and loads the data: -```python +```py from pipedrive import users def anonymize_user(user_data): @@ -351,7 +351,7 @@ example data and test your transformations etc. In order to do that, you limit h be yielded by a resource by calling `resource.add_limit` method. In the example below we load just 10 first items from and infinite counter - that would otherwise never end. -```python +```py r = dlt.resource(itertools.count(), name="infinity").add_limit(10) assert list(r) == list(range(10)) ``` @@ -375,7 +375,7 @@ that will keep just one updated record per `user_id`. It also adds ["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on `created_at` column to prevent requesting again the already loaded records: -```python +```py tables = sql_database() tables.users.apply_hints( write_disposition="merge", @@ -386,7 +386,7 @@ pipeline.run(tables) ``` To just change a name of a table to which resource will load data, do the following: -```python +```py tables = sql_database() tables.users.table_name = "other_users" ``` @@ -398,7 +398,7 @@ with the existing schema in the same way `apply_hints` method above works. There should avoid lengthy operations (ie. reflecting database tables) during creation of the DAG so it is better do do it when DAG executes. You may also emit partial hints (ie. precision and scale for decimal types) for column to help `dlt` type inference. -```python +```py @dlt.resource def sql_table(credentials, schema, table): # create sql alchemy engine @@ -432,7 +432,7 @@ You can emit columns as Pydantic model and use dynamic hints (ie. lambda for tab ### Duplicate and rename resources There are cases when you your resources are generic (ie. bucket filesystem) and you want to load several instances of it (ie. files from different folders) to separate tables. In example below we use `filesystem` source to load csvs from two different folders into separate tables: -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url): # list and yield files in bucket_url @@ -463,7 +463,7 @@ You can pass individual resources or list of resources to the `dlt.pipeline` obj loaded outside the source context, will be added to the [default schema](schema.md) of the pipeline. -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -485,6 +485,6 @@ To do a full refresh of an `append` or `merge` resources you temporarily change disposition to replace. 
You can use `apply_hints` method of a resource or just provide alternative write disposition when loading: -```python +```py p.run(merge_source(), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index 764b565beb..81d45dbfa8 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -49,7 +49,7 @@ The `schema_contract` argument accepts two forms: 2. **shorthand** a contract mode (string) that will be applied to all schema entities. For example setting `schema_contract` to *freeze* will expand to the full form: -```python +```py {"tables": "freeze", "columns": "freeze", "data_type": "freeze"} ``` @@ -65,7 +65,7 @@ You can change the contract on the **source** instance via `schema_contract` pro Pydantic models can be used to [define table schemas and validate incoming data](resource.md#define-a-schema-with-pydantic). You can use any model you already have. `dlt` will internally synthesize (if necessary) new models that conform with the **schema contract** on the resource. Just passing a model in `column` argument of the [dlt.resource](resource.md#define-a-schema-with-pydantic) sets a schema contract that conforms to default Pydantic behavior: -```python +```py { "tables": "evolve", "columns": "discard_value", @@ -121,7 +121,7 @@ Here's how `dlt` deals with column modes: When contract is violated in freeze mode, `dlt` raises `DataValidationError` exception. This exception gives access to the full context and passes the evidence to the caller. As with any other exception coming from pipeline run, it will be re-raised via `PipelineStepFailed` exception which you should catch in except: -```python +```py try: pipeline.run() except as pip_ex: diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 7ce1d959c9..006c5c9468 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -308,7 +308,7 @@ schema available via `dlt.current.source_schema()`. Example: -```python +```py @dlt.source def textual(nesting_level: int): # get the source schema from the `current` context diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 1b3d1ce0cc..bcdd137dce 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -26,7 +26,7 @@ You declare source by decorating an (optionally async) function that return or y You can create resources by using `dlt.resource` as a function. In an example below we reuse a single generator function to create a list of resources for several Hubspot endpoints. -```python +```py @dlt.source def hubspot(api_key=dlt.secrets.value): @@ -59,7 +59,7 @@ If this is impractical (for example you want to reflect a database to create res You can access resources present in a source and select which of them you want to load. 
In case of `hubspot` resource above we could select and load "companies", "deals" and "products" resources: -```python +```py from hubspot import hubspot source = hubspot() @@ -73,7 +73,7 @@ pipeline.run(source.with_resources("companies", "deals")) Resources can be individually accessed and selected: -```python +```py # resources are accessible as attributes of a source for c in source.companies: # enumerate all data in companies resource print(c) @@ -89,7 +89,7 @@ source.deals.selected = False You can modify and filter data in resources, for example if we want to keep only deals after certain date: -```python +```py source.deals.add_filter(lambda deal: deal["created_at"] > yesterday) ``` @@ -103,7 +103,7 @@ You can easily get your test dataset in a few minutes, when otherwise you'd need the full loading to complete. Below we limit the `pipedrive` source to just get 10 pages of data from each endpoint. Mind that the transformers will be evaluated fully: -```python +```py from pipedrive import pipedrive_source pipeline = dlt.pipeline(pipeline_name='pipedrive', destination='duckdb', dataset_name='pipedrive_data') @@ -121,7 +121,7 @@ declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. -```python +```py import dlt from hubspot import hubspot @@ -140,11 +140,11 @@ source.resources.add(source.deals | deal_scores) pipeline.run(source) ``` You can also set the resources in the source as follows -```python +```py source.deal_scores = source.deals | deal_scores ``` or -```python +```py source.resources["deal_scores"] = source.deals | deal_scores ``` :::note @@ -156,7 +156,7 @@ When adding resource to the source, `dlt` clones the resource so your existing i You can limit how deep `dlt` goes when generating child tables. By default, the library will descend and generate child tables for all nested lists, without limit. -```python +```py @dlt.source(max_table_nesting=1) def mongo_db(): ... @@ -172,7 +172,7 @@ tables of child tables). Typical settings: You can achieve the same effect after the source instance is created: -```python +```py from mongo_db import mongo_db source = mongo_db() @@ -202,7 +202,7 @@ You are also free to decompose a single source into several ones. For example, y down a 50 table copy job into an airflow dag with high parallelism to load the data faster. To do so, you could get the list of resources as: -```python +```py # get a list of resources' names resource_list = sql_source().resources.keys() @@ -216,12 +216,12 @@ for res in resource_list: You can temporarily change the "write disposition" to `replace` on all (or selected) resources within a source to force a full refresh: -```python +```py p.run(merge_source(), write_disposition="replace") ``` With selected resources: -```python +```py p.run(tables.with_resources("users"), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 23625db27c..7a000f04cf 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -15,7 +15,7 @@ You read and write the state in your resources. Below we use the state to create game archives which we then use to [prevent requesting duplicates](incremental-loading.md#advanced-state-usage-storing-a-list-of-processed-entities). 
-```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, player, start_month=None, end_month=None): # create or request a list of archives from resource scoped state @@ -114,7 +114,7 @@ could: You can inspect pipeline state with [`dlt pipeline` command](../reference/command-line-interface.md#dlt-pipeline): -```sh +```shell dlt pipeline -v chess_pipeline info ``` diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index cd121b0ad5..a32b3b7916 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -20,13 +20,13 @@ Let's get started! Install dlt using `pip`: -```bash +```shell pip install -U dlt ``` The command above installs (or upgrades) the library core, in the example below we use DuckDB as a destination so let's add a `duckdb` dependency: -```bash +```shell pip install "dlt[duckdb]" ``` @@ -63,13 +63,13 @@ When you look at the code above, you can see that we: Save this Python script with the name `quick_start_pipeline.py` and run the following command: -```bash +```shell python quick_start_pipeline.py ``` The output should look like: -```bash +```shell Pipeline quick_start completed in 0.59 seconds 1 load package(s) were loaded to destination duckdb and into dataset mydata The duckdb destination used duckdb:////home/user-name/quick_start/quick_start.duckdb location to store data @@ -82,13 +82,13 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): -```bash +```shell dlt pipeline quick_start show ``` **quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet do: -```bash +```shell pip install streamlit ``` diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 6df0dad82d..3521c187a9 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -14,7 +14,7 @@ import snippets from '!!raw-loader!./intro-snippets.py'; `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. To get started, install it with: -```sh +```shell pip install dlt ``` Unlike other solutions, with dlt, there's no need to use any backends or containers. Simply import `dlt` in a Python file or a Jupyter Notebook cell, and create a pipeline to load data into any of the [supported destinations](dlt-ecosystem/destinations/). You can load data from any source that produces Python data structures, including APIs, files, databases, and more. @@ -54,7 +54,7 @@ load_info = pipeline.run(data, table_name="player") Copy this example to a file or a Jupyter Notebook and run it. To make it work with the DuckDB destination, you'll need to install the **duckdb** dependency (the default `dlt` installation is really minimal): -```sh +```shell pip install "dlt[duckdb]" ``` Now **run** your Python file or Notebook cell. 
@@ -70,7 +70,7 @@ loads it into a [destination](general-usage/glossary.md#destination) (here: **du Initialize the [Slack source](dlt-ecosystem/verified-sources/slack) with the `dlt init` command: -```sh +```shell dlt init slack duckdb ``` @@ -164,7 +164,7 @@ print(load_info) Install the **pymysql** driver: -```sh +```shell pip install sqlalchemy pymysql ``` diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index b37a3a118e..c72bd790b0 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -34,7 +34,7 @@ and prints the relevant warning. ## `dlt deploy` This command prepares your pipeline for deployment and gives you step by step instructions on how to accomplish it. To enable this functionality, please first execute -```sh +```shell pip install "dlt[cli]" ``` that will add additional packages to the current environment. @@ -156,7 +156,7 @@ folder where you execute the `pipeline sync` command. ### Selectively drop tables and reset state -```sh +```shell dlt pipeline drop [resource_1] [resource_2] ``` @@ -164,7 +164,7 @@ Drops tables generated by selected resources and resets the state associated wit to force a full refresh on selected tables. In the example below we drop all tables generated by the `repo_events` resource in the github pipeline: -```sh +```shell dlt pipeline github_events drop repo_events ``` @@ -194,20 +194,20 @@ The `drop` command accepts several advanced settings: 1. You can use regexes to select resources. Prepend the `re:` string to indicate a regex pattern. The example below will select all resources starting with `repo`: -```sh +```shell dlt pipeline github_events drop "re:^repo" ``` 2. You can drop all tables in the indicated schema: -```sh +```shell dlt pipeline chess drop --drop-all ``` 3. You can indicate additional state slots to reset by passing a JsonPath to the source state. In the example below we reset the `archives` slot in the source state: -```sh +```shell dlt pipeline chess_pipeline drop --state-paths archives ``` @@ -236,7 +236,7 @@ This command lists all the pipelines executed on the local machine with their wo default pipelines folder. ### Drop pending and partially loaded packages -```sh +```shell dlt pipeline drop-pending-packages ``` Removes all extracted and normalized packages in the pipeline's working dir. @@ -247,6 +247,6 @@ were created. Use `dlt pipeline ... sync` is recommended if your destination sup ## Show stack traces If the command fails and you want to see the full stack trace, add `--debug` just after the `dlt` executable. -```sh +```shell dlt --debug pipeline github info ``` diff --git a/docs/website/docs/reference/installation.md b/docs/website/docs/reference/installation.md index a802c34597..8b5f748a6e 100644 --- a/docs/website/docs/reference/installation.md +++ b/docs/website/docs/reference/installation.md @@ -10,7 +10,7 @@ keywords: [installation, environment, pip install] ### Make sure you are using **Python 3.8-3.12** and have `pip` installed -```bash +```shell python --version pip --version ``` @@ -22,7 +22,7 @@ pip --version You can install Python 3.10 with an `apt` command. -```bash +```shell sudo apt update sudo apt install python3.10 sudo apt install python3.10-venv @@ -33,7 +33,7 @@ sudo apt install python3.10-venv Once you have installed [Homebrew](https://brew.sh), you can install Python 3.10. 
-```bash +```shell brew update brew install python@3.10 ``` @@ -44,7 +44,7 @@ brew install python@3.10 You need to install [Python 3.10 (64-bit version) for Windows](https://www.python.org/downloads/windows/). After this, you can then install `pip`. -```bash +```shell C:\> pip3 install -U pip ``` @@ -59,13 +59,13 @@ C:\> pip3 install -U pip Create a new virtual environment by making a `./env` directory to hold it. -```bash +```shell python -m venv ./env ``` Activate the virtual environment: -```bash +```shell source ./env/bin/activate ``` @@ -74,13 +74,13 @@ source ./env/bin/activate Create a new virtual environment by making a `./env` directory to hold it. -```bash +```shell python -m venv ./env ``` Activate the virtual environment: -```bash +```shell source ./env/bin/activate ``` @@ -106,6 +106,6 @@ C:\> .\env\Scripts\activate You can install `dlt` in your virtual environment by running: -```bash +```shell pip install -U dlt ``` diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 7c095b53d4..b3e67e5047 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -131,7 +131,7 @@ Please make sure that you have the `psutils` package installed (note that Airflo progress="log" ``` or when running the pipeline: -```sh +```shell PROGRESS=log python pipeline_script.py ``` @@ -514,7 +514,7 @@ next_item_mode="fifo" - Dataclasses Import the module as follows -```python +```py from dlt.common import json ``` @@ -536,18 +536,18 @@ For most use cases this is a drop in replacement for `requests`, so: :heavy_multiplication_x: **Don't** -```python +```py import requests ``` :heavy_check_mark: **Do** -```python +```py from dlt.sources.helpers import requests ``` And use it just like you would use `requests`: -```python +```py response = requests.get('https://example.com/api/contacts', headers={'Authorization': MY_API_KEY}) data = response.json() ... @@ -590,7 +590,7 @@ For more control you can create your own instance of `dlt.sources.requests.Clien This lets you customize which status codes and exceptions to retry on: -```python +```py from dlt.sources.helpers import requests http_client = requests.Client( @@ -604,7 +604,7 @@ This is sometimes needed when loading from non-standard APIs which don't use HTT For example: -```python +```py from dlt.sources.helpers import requests def retry_if_error_key(response: Optional[requests.Response], exception: Optional[BaseException]) -> bool: diff --git a/docs/website/docs/running-in-production/alerting.md b/docs/website/docs/running-in-production/alerting.md index 1364c1f988..c1fdb57ec2 100644 --- a/docs/website/docs/running-in-production/alerting.md +++ b/docs/website/docs/running-in-production/alerting.md @@ -42,7 +42,7 @@ receiving rich information on executed pipelines, including encountered errors a Alerts can be sent to a Slack channel via Slack's incoming webhook URL. The code snippet below demonstrates automated Slack notifications for database table updates using the `send_slack_message` function. 
-```python +```py # Import the send_slack_message function from the dlt library from dlt.common.runtime.slack import send_slack_message diff --git a/docs/website/docs/running-in-production/monitoring.md b/docs/website/docs/running-in-production/monitoring.md index 8532bac36b..77076686e2 100644 --- a/docs/website/docs/running-in-production/monitoring.md +++ b/docs/website/docs/running-in-production/monitoring.md @@ -83,7 +83,7 @@ Normalized data for the following tables: ``` To load these info back to the destination you can use the following: -```python +```py # Create a pipeline with the specified name, destination, and dataset # Run the pipeline @@ -111,7 +111,7 @@ dlt pipeline load-package The above information can also be obtained from the script as follows: -```python +```py info = pipeline.run(source, table_name="table_name", write_disposition='append') print(info.load_packages[0]) diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md index 2912ff7aa2..ebf4d2114f 100644 --- a/docs/website/docs/running-in-production/running.md +++ b/docs/website/docs/running-in-production/running.md @@ -9,7 +9,7 @@ keywords: [running, production, tips] When running the pipeline in production, you may consider a few additions to your script. We'll use the script below as a starting point. -```python +```py import dlt from chess import chess @@ -28,7 +28,7 @@ packages. Package information contains its state (`COMPLETED/PROCESSED`) and lis their statuses, file sizes, types and in case of failed jobs-the error messages from the destination. -```python +```py # see when load was started print(load_info.started_at) # print the information on the first load package and all jobs inside @@ -39,7 +39,7 @@ destination. `load_info` may also be loaded into the destinations as below: -```python +```py # we reuse the pipeline instance below and load to the same dataset as data pipeline.run([load_info], table_name="_load_info") ``` @@ -50,7 +50,7 @@ where they were obtained. You can display and load trace info as shown below. Us to explore `trace` object further. The `normalize` step information contains the counts of rows per table of data that was normalized and then loaded. -```python +```py # print human friendly trace information print(pipeline.last_trace) # save trace to destination, sensitive data will be removed @@ -59,7 +59,7 @@ table of data that was normalized and then loaded. You can also access the last `extract`, `normalize` and `load` infos directly: -```python +```py # print human friendly extract information print(pipeline.last_trace.last_extract_info) # print human friendly normalization information @@ -79,7 +79,7 @@ In the package information you can also see the list of all tables and columns c destination during loading of that package. The code below displays all tables and schemas. Note that those objects are Typed Dictionaries, use your code editor to explore. -```python +```py # print all the new tables/columns in for package in load_info.load_packages: for table_name, table in package.schema_update.items(): @@ -91,7 +91,7 @@ those objects are Typed Dictionaries, use your code editor to explore. You can save only the new tables and column schemas to the destination. Note that the code above that saves `load_info` saves this data as well. 
-```python +```py # save just the new tables table_updates = [p.asdict()["tables"] for p in load_info.load_packages] pipeline.run(table_updates, table_name="_new_tables") @@ -129,7 +129,7 @@ RUNTIME__SLACK_INCOMING_HOOK="https://hooks.slack.com/services/T04DHMAF13Q/B04E7 Then the configured hook is available via pipeline object, we also provide convenience method to send Slack messages: -```python +```py from dlt.common.runtime.slack import send_slack_message send_slack_message(pipeline.runtime_config.slack_incoming_hook, message) @@ -191,7 +191,7 @@ There are two different types of exceptions in `__context__`: Code below tells one exception type from another. Note that we provide retry strategy helpers that does that for you. -```python +```py from dlt.common.exceptions import TerminalException if isinstance(ex, TerminalException) or (ex.__context__ is not None and isinstance(ex.__context__, TerminalException)): @@ -204,7 +204,7 @@ If any job in the package **fail terminally** it will be moved to `failed_jobs` such status. By default **no exception is raised** and other jobs will be processed and completed. You may inspect if the failed jobs are present by checking the load info as follows: -```python +```py # returns True if there are failed jobs in any of the load packages print(load_info.has_failed_jobs) # raises terminal exception if there are any failed jobs @@ -241,7 +241,7 @@ the [tenacity](https://tenacity.readthedocs.io/en/latest/) library. Snippet belo `load` stage with the `retry_load` strategy and defined back-off or re-raise exception for any other steps (`extract`, `normalize`) and for terminal exceptions. -```python +```py from tenacity import stop_after_attempt, retry_if_exception, Retrying, retry from dlt.common.runtime.slack import send_slack_message from dlt.pipeline.helpers import retry_load @@ -264,7 +264,7 @@ if __name__ == "__main__" : You can also use `tenacity` to decorate functions. This example additionally retries on `extract`: -```python +```py if __name__ == "__main__" : pipeline = dlt.pipeline(pipeline_name="chess_pipeline", destination='duckdb', dataset_name="games_data") diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index a54ba97fe3..bed7527d48 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -117,7 +117,7 @@ print(load_info) You've noticed that there's a lot of code duplication in the `get_issues` and `get_comments` functions. We can reduce that by extracting the common fetching code into a separate function and use it in both resources. Even better, we can use `dlt.resource` as a function and pass it the `fetch_github_data()` generator function directly. Here's the refactored code: -```python +```py import dlt from dlt.sources.helpers import requests @@ -163,7 +163,7 @@ For the next step we'd want to get the [number of repository clones](https://doc Let's handle this by changing our `fetch_github_data()` first: -```python +```py def fetch_github_data(endpoint, params={}, access_token=None): """Fetch data from GitHub API based on endpoint and params.""" headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} @@ -196,7 +196,7 @@ def github_source(access_token): Here, we added `access_token` parameter and now we can use it to pass the access token to the request: -```python +```py load_info = pipeline.run(github_source(access_token="ghp_XXXXX")) ``` @@ -204,7 +204,7 @@ It's a good start. 
But we'd want to follow the best practices and not hardcode t To use it, change the `github_source()` function to: -```python +```py @dlt.source def github_source( access_token: str = dlt.secrets.value, @@ -228,7 +228,7 @@ access_token = "ghp_A...3aRY" Now we can run the script and it will load the data from the `traffic/clones` endpoint: -```python +```py import dlt from dlt.sources.helpers import requests @@ -278,7 +278,7 @@ load_info = pipeline.run(github_source()) The next step is to make our dlt GitHub source reusable so it can load data from any GitHub repo. We'll do that by changing both `github_source()` and `fetch_github_data()` functions to accept the repo name as a parameter: -```python +```py import dlt from dlt.sources.helpers import requests diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index f14566b5c0..a7949cf017 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -8,7 +8,7 @@ In this section, we will retrieve and load data from the GitHub API into [DuckDB Before we start, make sure you have installed `dlt` with the DuckDB dependency: -```bash +```shell pip install "dlt[duckdb]" ``` @@ -52,13 +52,13 @@ Here's what the code above does: Save `github_issues.py` and run the following command: -```bash +```shell python github_issues.py ``` Once the data has been loaded, you can inspect the created dataset using the Streamlit app: -```bash +```shell dlt pipeline github_issues show ``` diff --git a/docs/website/docs/walkthroughs/add-a-verified-source.md b/docs/website/docs/walkthroughs/add-a-verified-source.md index bd7bd9894e..eba92302e5 100644 --- a/docs/website/docs/walkthroughs/add-a-verified-source.md +++ b/docs/website/docs/walkthroughs/add-a-verified-source.md @@ -23,13 +23,13 @@ cd various_pipelines List available verified sources to see their names and descriptions: -```bash +```shell dlt init --list-verified-sources ``` Now pick one of the source names, for example `pipedrive` and a destination i.e. `bigquery`: -```bash +```shell dlt init pipedrive bigquery ``` @@ -100,7 +100,7 @@ You can modify an existing verified source in place. ## 5. Add more sources to your project -```bash +```shell dlt init chess duckdb ``` @@ -116,7 +116,7 @@ pipeline: To update the verified source you have to the newest online version just do the same init command in the parent folder: -```bash +```shell dlt init pipedrive bigquery ``` @@ -124,19 +124,19 @@ dlt init pipedrive bigquery To find out more info about this command, use --help: -```bash +```shell dlt init --help ``` To deploy from a branch of the `verified-sources` repo, you can use the following: -```bash +```shell dlt init source destination --branch ``` To deploy from another repo, you could fork the verified-sources repo and then provide the new repo url as below, replacing `dlt-hub` with your fork name: -```bash +```shell dlt init pipedrive bigquery --location "https://github.com/dlt-hub/verified-sources" ``` diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index fb27ab3f9a..196a876bfd 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -18,7 +18,7 @@ schema. 
Set up an import folder from which `dlt` will read your modifications by Following our example in [run a pipeline](run-a-pipeline.md): -```python +```py dlt.pipeline( import_schema_path="schemas/import", export_schema_path="schemas/export", @@ -78,7 +78,7 @@ In next steps we'll experiment a lot, you will be warned to set `full_refresh=Tr So if you have a `yaml` file, and you change it (e.g. change a data type or add a hint), then you need to **delete the dataset** or set `full_refresh=True`: -```python +```py dlt.pipeline( import_schema_path="schemas/import", export_schema_path="schemas/export", diff --git a/docs/website/docs/walkthroughs/create-a-pipeline.md b/docs/website/docs/walkthroughs/create-a-pipeline.md index 0facdfa884..f58debce65 100644 --- a/docs/website/docs/walkthroughs/create-a-pipeline.md +++ b/docs/website/docs/walkthroughs/create-a-pipeline.md @@ -18,19 +18,19 @@ steps below. Create a new empty directory for your `dlt` project by running: -```bash +```shell mkdir weatherapi_duckdb && cd weatherapi_duckdb ``` Start a `dlt` project with a pipeline template that loads data to DuckDB by running: -```bash +```shell dlt init weatherapi duckdb ``` Install the dependencies necessary for DuckDB: -```bash +```shell pip install -r requirements.txt ``` @@ -48,7 +48,7 @@ Copy the value of the API key into `.dlt/secrets.toml`: api_secret_key = '' ``` The **secret name** corresponds to the **argument name** in the source function. Below `api_secret_key` [will get its value](../general-usage/credentials/configuration.md#general-usage-and-an-example) from `secrets.toml` when `weatherapi_source()` is called. -```python +```py @dlt.source def weatherapi_source(api_secret_key=dlt.secrets.value): ... @@ -56,7 +56,7 @@ def weatherapi_source(api_secret_key=dlt.secrets.value): Run the `weatherapi.py` pipeline script to test that authentication headers look fine: -```bash +```shell python3 weatherapi.py ``` @@ -67,7 +67,7 @@ Your API key should be printed out to stdout along with some test data. Replace the definition of the `weatherapi_resource` function definition in the `weatherapi.py` pipeline script with a call to the WeatherAPI.com API: -```python +```py @dlt.resource(write_disposition="append") def weatherapi_resource(api_secret_key=dlt.secrets.value): url = "https://api.weatherapi.com/v1/current.json" @@ -82,7 +82,7 @@ def weatherapi_resource(api_secret_key=dlt.secrets.value): Run the `weatherapi.py` pipeline script to test that the API call works: -```bash +```shell python3 weatherapi.py ``` @@ -93,7 +93,7 @@ This should print out the weather in New York City right now. 
Remove the `exit()` call from the `main` function in `weatherapi.py`, so that running the `python3 weatherapi.py` command will now also run the pipeline: -```python +```py if __name__=='__main__': # configure the pipeline with your destination details @@ -118,13 +118,13 @@ if __name__=='__main__': Run the `weatherapi.py` pipeline script to load data into DuckDB: -```bash +```shell python3 weatherapi.py ``` Then this command to see that the data loaded: -```bash +```shell dlt pipeline weatherapi show ``` diff --git a/docs/website/docs/walkthroughs/create-new-destination.md b/docs/website/docs/walkthroughs/create-new-destination.md index 3e64cc55ab..ad77b7cf83 100644 --- a/docs/website/docs/walkthroughs/create-new-destination.md +++ b/docs/website/docs/walkthroughs/create-new-destination.md @@ -174,7 +174,7 @@ Add an import to your factory in [`dlt.destinations.__init__`](https://github.co ## Testing We can quickly repurpose existing github source and `secrets.toml` already present in the project to test new destination. Let's assume that the module name is `presto`, same for the destination name and config section name. Here's our testing script `github_pipeline.py` -```python +```py import dlt from github import github_repo_events diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md index 2e8cdfe7d3..8f6b5a343d 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md @@ -15,7 +15,7 @@ You can setup GCP cloud function webhook using `dlt` as follows: 5. Select "Python 3.10" as the environment. 6. Use the code provided to set up the cloud function for event ingestion: - ```python + ```py import dlt import json import time @@ -40,7 +40,7 @@ You can setup GCP cloud function webhook using `dlt` as follows: 7. Set the function name as "your_webhook" in the Entry point field. 8. In the requirements.txt file, specify the necessary packages: - ```python + ```py # Function dependencies, for example: # package>=version dlt @@ -56,7 +56,7 @@ You can setup GCP cloud function webhook using `dlt` as follows: To manually test the function you have created, you can send a manual POST request as a webhook using the following code: -```bash +```shell import requests webhook_url = 'please set me up!' # Your cloud function Trigger URL diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md index 365f6747dc..a33163c6a7 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md @@ -23,7 +23,7 @@ initialize a Git repository in your `dlt` project directory and push it to GitHu Before you can deploy, you must run your pipeline locally at least once. -```bash +```shell python3 {pipeline_name}_pipeline.py ``` @@ -31,12 +31,12 @@ This should successfully load data from the source to the destination once and a ## 3. 
Initialize deployment First you need to add additional dependencies that `deploy` command requires: -```bash +```shell pip install "dlt[cli]" ``` then: -```bash +```shell dlt deploy {pipeline_name}_pipeline.py airflow-composer ``` @@ -65,7 +65,7 @@ By default, the `dlt deploy` command shows you the deployment credentials in ENV ## Example with the pipedrive pipeline ### 1. Run the deploy command -```bash +```shell dlt deploy pipedrive_pipeline.py airflow-composer ``` where `pipedrive_pipeline.py` is the pipeline script that you just ran and `airflow-composer` is a deployment method. The command will create deployment files and provide instructions to set up the credentials. @@ -92,7 +92,7 @@ pipedrive_api_key = "c66..." > šŸ’” `deploy` command will use [Airflow variable](#4-add-credentials) called `dlt_secrets_toml` to store all the required secrets as `toml` fragment. You can also use **environment variables** by passing `--secrets-format env` option: -```bash +```shell dlt deploy pipedrive_pipeline.py airflow-composer --secrets-format env ``` which will output the environment variable names and their values. @@ -114,7 +114,7 @@ c66c.. In directory `dags/` you can find the file `dag_pipedrive.py` that you need to edit. It has the following structure: -```python +```py import dlt from airflow.decorators import dag from dlt.common import pendulum @@ -169,7 +169,7 @@ load_data() (`use_task_logger=True`) and set the retry policy as a Retrying class object with three restart attempts. - ```python + ```py from tenacity import Retrying, stop_after_attempt # Set `use_data_folder` to True to store temporary data on the `data` bucket. @@ -193,7 +193,7 @@ created DAG script. - Import your sources from your existing pipeline script - after task group is created: - ```python + ```py # Import your source from pipeline script from pipedrive import pipedrive_source ``` @@ -202,7 +202,7 @@ created DAG script. then copy it here. For example, look at the `load_from_start_date` function in `pipedrive_pipeline.py`: - ```python + ```py """Example to incrementally load activities limited to items updated after a given date""" pipeline = dlt.pipeline( @@ -231,7 +231,7 @@ created DAG script. activities_source\]), so we have to add them sequentially. See [Troubleshooting](deploy-with-airflow-composer.md#troubleshooting) section. - ```python + ```py # Create the source, # the "serialize" decompose option will convert # dlt resources into Airflow tasks. @@ -265,7 +265,7 @@ created DAG script. As a result, we will get a script of the following form: -```python +```py import dlt from airflow.decorators import dag from dlt.common import pendulum @@ -446,19 +446,19 @@ There are two ways to pass the credentials Add stage deployment files to commit. Use your Git UI or the following command: - ```bash + ```shell git add dags/dag_pipedrive.py build/cloudbuild.yaml ``` Commit the files above. Use your Git UI or the following command: - ```bash + ```shell git commit -m 'initiate pipedrive pipeline with Airflow' ``` Push changes to GitHub. Use your Git UI or the following command: - ```bash + ```shell git push origin ``` @@ -497,7 +497,7 @@ unacceptable data structure and provided `decompose = "serialize"`. For example: -```python +```py tasks.add_run( pipeline=pipeline, data=[source, activities_source], @@ -512,7 +512,7 @@ Airflow tasks. 
PipelineTasksGroup can't handle the list of sources in the ā€œserializeā€ mode, it can only decompose `DltSource`, so we have to add them sequentially: -```python +```py tasks.add_run( pipeline=pipeline, data=source, @@ -531,7 +531,7 @@ Or you should set the `decompose = "noneā€` to run it as the one Airflow task. In case of `pipedrive` pipeline we tried to load data from ā€œcustom_fields_mappingā€ twice. -```python +```py # First source configure to load everything except activities from the beginning source = pipedrive_source() source.resources["activities"].selected = False @@ -545,7 +545,7 @@ activities_source = pipedrive_source( Because of this we got the following error: -```python +```py airflow.exceptions.DuplicateTaskIdFound: Task id ā€˜pipedrive.pipedrive_custom_fields_mappingā€™ has already been added to the DAG ``` @@ -554,7 +554,7 @@ Task ids in the task group should be still unique globally, so in this case we h ā€œcustom_fields_mappingā€ from `activities_source`. ā€œcustom_fields_mappingā€ will be taken from the current state to translate custom field hashes to names: -```python +```py activities_source = pipedrive_source( since_timestamp="2023-03-01 00:00:00Z" ).with_resources("activities") diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-github-actions.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-github-actions.md index 7604e14746..4897d930df 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-github-actions.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-github-actions.md @@ -30,7 +30,7 @@ This should successfully load data from the source to the destination once. ## Initialize deployment First you need to add additional dependencies that `deploy` command requires: -```bash +```shell pip install "dlt[cli]" ``` then the command below will create a Github workflow that runs your pipeline script every 30 minutes: diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md index 897b3257b9..fd220dfd8e 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions.md @@ -32,7 +32,7 @@ To deploy the pipeline, we'll use the Google Cloud Source Repositories method. - Run the following command to initialise the verified source with Notion and create a pipeline example with BigQuery as the target. - ```bash + ```shell dlt init notion bigquery ``` @@ -43,7 +43,7 @@ To deploy the pipeline, we'll use the Google Cloud Source Repositories method. in the `dlthub` [documentation](../../dlt-ecosystem/verified-sources/notion). 1. Create a new Python file called "main.py" in the main directory. The file can be configured as follows: - ```python + ```py from notion_pipeline import load_databases def pipeline_notion(request): @@ -59,7 +59,7 @@ To deploy the pipeline, we'll use the Google Cloud Source Repositories method. In a shell editor, navigate to the main directory where the "main.py" file is located and run the following command in the terminal: -```bash +```shell gcloud functions deploy pipeline_notion --runtime python310 \ --trigger-http --allow-unauthenticated --source . 
--timeout 300 ``` diff --git a/docs/website/docs/walkthroughs/run-a-pipeline.md b/docs/website/docs/walkthroughs/run-a-pipeline.md index d17f941e94..b217ad750d 100644 --- a/docs/website/docs/walkthroughs/run-a-pipeline.md +++ b/docs/website/docs/walkthroughs/run-a-pipeline.md @@ -16,7 +16,7 @@ Once you [created a new pipeline](create-a-pipeline) or (or [customize](add-a-verified-source#3-customize-or-write-a-pipeline-script)) a pipeline script, like the one below that loads the data from [chess.com](https://www.chess.com) API: -```python +```py import dlt from chess import chess @@ -34,7 +34,7 @@ packages. The `run` method returns a `load_info` object that, when printed, disp with pipeline and dataset names, ids of the load packages, optionally with the information on failed jobs. Add the following line to your script: -```python +```py print(load_info) ``` @@ -54,20 +54,20 @@ progress bars or console logging to observe what pipeline is doing. We support m progress bar libraries, Python loggers or just a text console. To demonstrate, let's modify the script to get a year of chess games data: -```python +```py data = chess(['magnuscarlsen', 'rpragchess'], start_month="2021/11", end_month="2022/12") ``` Install [enlighten](https://github.com/Rockhopper-Technologies/enlighten). Enlighten displays progress bars that can be mixed with log messages: -```sh +```shell pip install enlighten ``` Run your script setting the `PROGRESS` environment variable to the library name: -```sh +```shell PROGRESS=enlighten python chess_pipeline.py ``` @@ -75,7 +75,7 @@ Other libraries that you can use are [tqdm](https://github.com/tqdm/tqdm), [alive_progress](https://github.com/rsalmei/alive-progress). Set the name to `log` to dump progress to console periodically: -```sh +```shell PROGRESS=log python chess_pipeline.py ``` @@ -86,7 +86,7 @@ PROGRESS=log python chess_pipeline.py You can quickly inspect the generated tables, the data, see how many rows were loaded to which table, do SQL queries etc. by executing the following command from the same folder as your script: -```sh +```shell dlt pipeline chess_pipeline show ``` @@ -110,29 +110,29 @@ Collecting usage statistics. To deactivate, set browser.gatherUsageStats to Fals particular tables. The packages are identified by **load_id**, that you can see in the printout above or get by running the following command: -```sh +```shell dlt pipeline chess_pipeline info ``` You can inspect the package, get list of jobs and in case of failed ones, get the associated error messages. - See the most recent load package info: - ```sh + ```shell dlt pipeline chess_pipeline load-package ``` - See package info with given load id: - ```sh + ```shell dlt pipeline chess_pipeline load-package 1679931001.985323 ``` - Also see the schema changes introduced in the package: - ```sh + ```shell dlt pipeline -v chess_pipeline load-package ``` `dlt` stores the trace of the most recent data load. The trace contains information on the pipeline processing steps: `extract`, `normalize` and `load`. It also shows the last `load_info`: -```sh +```shell dlt pipeline chess_pipeline trace ``` @@ -153,7 +153,7 @@ to solving your problem. Let us know if you come across one that is not clear to The most common exception that you will encounter looks like this. Here we modify our `chess_pipeline.py` script to load data into postgres, but we are not providing the password. -```bash +```shell CREDENTIALS="postgres://loader@localhost:5432/dlt_data" python chess_pipeline.py ... 
dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['password'] in configuration with spec PostgresCredentials @@ -203,7 +203,7 @@ credentials.password="loader" `dlt` will raise `PipelineStepFailed` exception to inform you of a problem encountered during execution of particular step. You can catch those in code: -```python +```py from dlt.pipeline.exceptions import PipelineStepFailed try: @@ -215,7 +215,7 @@ except PipelineStepFailed as step_failed: Or use `trace` command to review the last exception. Here we provided a wrong postgres password: -```sh +```shell dlt pipeline chess_pipeline trace ``` @@ -245,7 +245,7 @@ What now? Investigate further with following command: -```sh +```shell dlt pipeline chess_pipeline failed-jobs ``` diff --git a/docs/website/docs/walkthroughs/share-a-dataset.md b/docs/website/docs/walkthroughs/share-a-dataset.md index e91177110e..9ed103f23c 100644 --- a/docs/website/docs/walkthroughs/share-a-dataset.md +++ b/docs/website/docs/walkthroughs/share-a-dataset.md @@ -15,7 +15,7 @@ BigQuery: ## 1. Replace the "destination" argument with "bigquery" -```python +```py import dlt from chess import chess diff --git a/docs/website/docs/walkthroughs/zendesk-weaviate.md b/docs/website/docs/walkthroughs/zendesk-weaviate.md index df0812191d..ad30f88a8c 100644 --- a/docs/website/docs/walkthroughs/zendesk-weaviate.md +++ b/docs/website/docs/walkthroughs/zendesk-weaviate.md @@ -23,7 +23,7 @@ We're going to use some ready-made components from the [dlt ecosystem](https://d 1. Create a new folder for your project, navigate to it, and create a virtual environment: - ```bash + ```shell mkdir zendesk-weaviate cd zendesk-weaviate python -m venv venv @@ -31,13 +31,13 @@ We're going to use some ready-made components from the [dlt ecosystem](https://d ``` 2. Install dlt with Weaviate support - ```bash + ```shell pip install "dlt[weaviate]" ``` 3. Install dlt Zendesk verified source - ```bash + ```shell dlt init zendesk weaviate ``` @@ -77,7 +77,7 @@ X-OpenAI-Api-Key = "sk-..." When you run `dlt init zendesk weaviate`, dlt creates a file called `zendesk_pipeline.py` in the current directory. This file contains an example pipeline that you can use to load data from Zendesk source. Let's edit this file to make it work for our use case: -```python +```py import dlt from dlt.destinations.adapters import weaviate_adapter @@ -122,7 +122,7 @@ Let's go through the code above step by step: Now that we have the pipeline configured, we can run the Python script: -```bash +```shell python zendesk_pipeline.py ``` @@ -132,7 +132,7 @@ We have successfully loaded the data from Zendesk to Weaviate. Let's check it ou We can now run a vector search query on the data we loaded into Weaviate. Create a new Python file called `query.py` and add the following code: -```python +```py import weaviate client = weaviate.Client( url='YOUR_WEAVIATE_URL', diff --git a/docs/website/process_docs.py b/docs/website/process_docs.py deleted file mode 100644 index 0aef5c9fcb..0000000000 --- a/docs/website/process_docs.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Parse all markdown files, insert snippets, add tuba links and export to final directory -""" \ No newline at end of file