Docs: Add preprocessor (#1114)
* add docs preprocessor script poc

* fix api reference sidebar

* fix deploy scripts

* remove python dependency from running docs locally

* fix edit link for process docs

* pin databind.json python package

* pin databind core

* use concurrently to run watcher for local dev

* extend preprocess script to insert tuba links

* remove tuba links from md files

* update tuba markers

* update script to insert snippets

* update package.json

* update preprocess script

* remove snippets code from markdown files and update markers

* update examples

* small change to contributing md

* fix preprocess script to use new snippets marker

* update preprocess script and npm run scripts

* fix custom destination example to match new format
sh-rp authored Mar 22, 2024
1 parent f1ec901 commit b626c9b
Showing 74 changed files with 680 additions and 1,718 deletions.
2 changes: 1 addition & 1 deletion docs/examples/CONTRIBUTING.md
@@ -15,7 +15,7 @@ Note: All paths in this guide are relative to the `dlt` repository directory.
- In the section `<Header info=` add the tl;dr for your example, it should be short but informative.
- Set `slug="<example-name>" run_file="<snippet-name>" />`.
- List what users will learn from this example. Use bullet points and link corresponding documentation pages.
- Use tags `<!--@@@DLT_SNIPPET_START ./code/<snippet-name>-snippets.py::smal_part_of_code-->` to insert example code snippets. Do not write them manually!
- Use tags `<!--@@@DLT_SNIPPET ./code/<snippet-name>-snippets.py::smal_part_of_code-->` to insert example code snippets. Do not write them manually!

## Add tests

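Note on the `DLT_SNIPPET` marker changed in the diff above: the docs preprocessor this commit introduces expands `<!--@@@DLT_SNIPPET <file>::<tag>-->` comments into the tagged region of the referenced snippets file before the site is built, writing the result into the new git-ignored `docs_processed` folder (see the `.gitignore` change later in this diff). The preprocessor itself is not shown here, so the Python below is only a hedged sketch of the idea; the `@@@DLT_SNIPPET_START` / `@@@DLT_SNIPPET_END` delimiters assumed inside the snippet files, and all function names, are illustrative rather than taken from the actual script.

```py
import re
from pathlib import Path

# Illustrative sketch only: not the preprocessor shipped in this commit.
# The marker format matches the tags used in the docs; the START/END
# delimiters inside the snippet files are an assumption.
SNIPPET_MARKER = re.compile(r"<!--@@@DLT_SNIPPET\s+(?P<path>\S+?)::(?P<tag>\S+?)-->")


def extract_snippet(snippet_file: Path, tag: str) -> str:
    """Return the code between the START/END comments for `tag`."""
    lines = snippet_file.read_text().splitlines()
    start = next(i for i, line in enumerate(lines) if f"@@@DLT_SNIPPET_START {tag}" in line)
    end = next(i for i, line in enumerate(lines) if f"@@@DLT_SNIPPET_END {tag}" in line)
    return "\n".join(lines[start + 1 : end])


def preprocess_markdown(md_file: Path, out_dir: Path) -> None:
    """Copy `md_file` into `out_dir` with every snippet marker expanded."""

    def expand(match: re.Match) -> str:
        code = extract_snippet(md_file.parent / match.group("path"), match.group("tag"))
        fence = "`" * 3  # build the code fence dynamically to keep this example readable
        return f"{fence}py\n{code}\n{fence}"

    (out_dir / md_file.name).write_text(SNIPPET_MARKER.sub(expand, md_file.read_text()))
```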
12 changes: 7 additions & 5 deletions docs/examples/chess_production/chess.py
@@ -6,7 +6,6 @@
from dlt.common.typing import StrAny, TDataItems
from dlt.sources.helpers.requests import client


@dlt.source
def chess(
chess_url: str = dlt.config.value,
@@ -57,7 +56,6 @@ def players_games(username: Any) -> Iterator[TDataItems]:

MAX_PLAYERS = 5


def load_data_with_retry(pipeline, data):
try:
for attempt in Retrying(
@@ -67,7 +65,9 @@ def load_data_with_retry(pipeline, data):
reraise=True,
):
with attempt:
logger.info(f"Running the pipeline, attempt={attempt.retry_state.attempt_number}")
logger.info(
f"Running the pipeline, attempt={attempt.retry_state.attempt_number}"
)
load_info = pipeline.run(data)
logger.info(str(load_info))

@@ -89,7 +89,9 @@ def load_data_with_retry(pipeline, data):
# print the information on the first load package and all jobs inside
logger.info(f"First load package info: {load_info.load_packages[0]}")
# print the information on the first completed job in first load package
logger.info(f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}")
logger.info(
f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}"
)

# check for schema updates:
schema_updates = [p.schema_update for p in load_info.load_packages]
@@ -147,4 +149,4 @@ def load_data_with_retry(pipeline, data):
)
# get data for a few famous players
data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS)
load_data_with_retry(pipeline, data)
load_data_with_retry(pipeline, data)
2 changes: 0 additions & 2 deletions docs/examples/connector_x_arrow/load_arrow.py
@@ -3,7 +3,6 @@
import dlt
from dlt.sources.credentials import ConnectionStringCredentials


def read_sql_x(
conn_str: ConnectionStringCredentials = dlt.secrets.value,
query: str = dlt.config.value,
@@ -15,7 +14,6 @@ def read_sql_x(
protocol="binary",
)


def genome_resource():
# create genome resource with merge on `upid` primary key
genome = dlt.resource(
@@ -1,4 +1,3 @@
# you can just paste services.json as credentials
[destination.bigquery.credentials]
client_email = ""
private_key = ""
@@ -15,7 +15,6 @@
# format: "your-project.your_dataset.your_table"
BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters"


# dlt sources
@dlt.resource(name="natural_disasters")
def resource(url: str):
@@ -39,7 +38,6 @@ def resource(url: str):
)
yield table


# dlt bigquery custom destination
# we can use the dlt provided credentials class
# to retrieve the gcp credentials from the secrets
@@ -60,7 +58,6 @@ def bigquery_insert(
load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config)
load_job.result() # Waits for the job to complete.


if __name__ == "__main__":
# run the pipeline and print load results
pipeline = dlt.pipeline(
@@ -71,4 +68,4 @@ def bigquery_insert(
)
load_info = pipeline.run(resource(url=OWID_DISASTERS_URL))

print(load_info)
print(load_info)
5 changes: 1 addition & 4 deletions docs/examples/google_sheets/google_sheets.py
@@ -9,15 +9,13 @@
)
from dlt.common.typing import DictStrAny, StrAny


def _initialize_sheets(
credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]
) -> Any:
# Build the service object.
service = build("sheets", "v4", credentials=credentials.to_native_credentials())
return service


@dlt.source
def google_spreadsheet(
spreadsheet_id: str,
@@ -57,7 +55,6 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
for name in sheet_names
]


if __name__ == "__main__":
pipeline = dlt.pipeline(destination="duckdb")
# see example.secrets.toml to where to put credentials
@@ -70,4 +67,4 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
sheet_names=range_names,
)
)
print(info)
print(info)
8 changes: 4 additions & 4 deletions docs/examples/incremental_loading/zendesk.py
@@ -6,11 +6,12 @@
from dlt.common.typing import TAnyDateTime
from dlt.sources.helpers.requests import client


@dlt.source(max_table_nesting=2)
def zendesk_support(
credentials: Dict[str, str] = dlt.secrets.value,
start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008
start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008
year=2000, month=1, day=1
),
end_date: Optional[TAnyDateTime] = None,
):
"""
@@ -112,12 +113,11 @@ def get_pages(
if not response_json["end_of_stream"]:
get_url = response_json["next_page"]


if __name__ == "__main__":
# create dlt pipeline
pipeline = dlt.pipeline(
pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data"
)

load_info = pipeline.run(zendesk_support())
print(load_info)
print(load_info)
2 changes: 0 additions & 2 deletions docs/examples/nested_data/nested_data.py
@@ -13,7 +13,6 @@

CHUNK_SIZE = 10000


# You can limit how deep dlt goes when generating child tables.
# By default, the library will descend and generate child tables
# for all nested lists, without a limit.
@@ -82,7 +81,6 @@ def load_documents(self) -> Iterator[TDataItem]:
while docs_slice := list(islice(cursor, CHUNK_SIZE)):
yield map_nested_in_place(convert_mongo_objs, docs_slice)


def convert_mongo_objs(value: Any) -> Any:
if isinstance(value, (ObjectId, Decimal128)):
return str(value)
5 changes: 1 addition & 4 deletions docs/examples/pdf_to_weaviate/pdf_to_weaviate.py
@@ -4,7 +4,6 @@
from dlt.destinations.impl.weaviate import weaviate_adapter
from PyPDF2 import PdfReader


@dlt.resource(selected=False)
def list_files(folder_path: str):
folder_path = os.path.abspath(folder_path)
@@ -16,7 +15,6 @@ def list_files(folder_path: str):
"mtime": os.path.getmtime(file_path),
}


@dlt.transformer(primary_key="page_id", write_disposition="merge")
def pdf_to_text(file_item, separate_pages: bool = False):
if not separate_pages:
@@ -30,7 +28,6 @@ def pdf_to_text(file_item, separate_pages: bool = False):
page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
yield page_item


pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")

# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
@@ -54,4 +51,4 @@ def pdf_to_text(file_item, separate_pages: bool = False):

client = weaviate.Client("http://localhost:8080")
# get text of all the invoices in InvoiceText class we just created above
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
9 changes: 4 additions & 5 deletions docs/examples/qdrant_zendesk/qdrant.py
@@ -10,12 +10,13 @@

from dlt.common.configuration.inject import with_config


# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
@dlt.source(max_table_nesting=2)
def zendesk_support(
credentials: Dict[str, str] = dlt.secrets.value,
start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008
start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008
year=2000, month=1, day=1
),
end_date: Optional[TAnyDateTime] = None,
):
"""
@@ -79,15 +80,13 @@ def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]:
return None
return ensure_pendulum_datetime(value)


# modify dates to return datetime objects instead
def _fix_date(ticket):
ticket["updated_at"] = _parse_date_or_none(ticket["updated_at"])
ticket["created_at"] = _parse_date_or_none(ticket["created_at"])
ticket["due_at"] = _parse_date_or_none(ticket["due_at"])
return ticket


# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
def get_pages(
url: str,
@@ -128,7 +127,6 @@ def get_pages(
if not response_json["end_of_stream"]:
get_url = response_json["next_page"]


if __name__ == "__main__":
# create a pipeline with an appropriate name
pipeline = dlt.pipeline(
@@ -148,6 +146,7 @@ def get_pages(

print(load_info)


# running the Qdrant client to connect to your Qdrant database

@with_config(sections=("destination", "qdrant", "credentials"))
4 changes: 1 addition & 3 deletions docs/examples/transformers/pokemon.py
@@ -1,7 +1,6 @@
import dlt
from dlt.sources.helpers import requests


@dlt.source(max_table_nesting=2)
def source(pokemon_api_url: str):
""""""
@@ -45,7 +44,6 @@ def species(pokemon_details):

return (pokemon_list | pokemon, pokemon_list | pokemon | species)


if __name__ == "__main__":
# build duck db pipeline
pipeline = dlt.pipeline(
@@ -54,4 +52,4 @@ def species(pokemon_details):

# the pokemon_list resource does not need to be loaded
load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
print(load_info)
print(load_info)
3 changes: 3 additions & 0 deletions docs/website/.gitignore
@@ -4,6 +4,9 @@
# Production
/build

# preprocessed docs
/docs_processed

# Generated files
.docusaurus
.cache-loader
13 changes: 2 additions & 11 deletions docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -154,14 +154,5 @@ aws_data_catalog="awsdatacatalog"
You can choose the following file formats:
* [parquet](../file-formats/parquet.md) is used by default

<!--@@@DLT_SNIPPET_START tuba::athena-->
## Additional Setup guides

- [Load data from Chess.com to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-athena)
- [Load data from Notion to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-athena)
- [Load data from HubSpot to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-athena)
- [Load data from GitHub to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-athena)
- [Load data from Google Analytics to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-athena)
- [Load data from Google Sheets to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-athena)
- [Load data from Stripe to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-athena)
<!--@@@DLT_SNIPPET_END tuba::athena-->
<!--@@@DLT_TUBA athena-->
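The `<!--@@@DLT_TUBA athena-->` marker above, and its counterparts on the other destination pages in this commit, replaces the hand-written "Additional Setup guides" link lists that are being deleted; per the commit messages, the preprocess script now inserts those links at build time. Where the script reads the link data from is not visible in this diff, so the sketch below stands in with a hard-coded mapping; only the marker format and the rendered heading and bullet list mirror the removed sections.

```py
import re

# Stand-in sketch: expand <!--@@@DLT_TUBA duckdb--> style markers into the
# "Additional Setup guides" sections that used to be maintained by hand.
# The link data below is a placeholder; the real preprocessor's data source
# is not shown in this diff.
TUBA_MARKER = re.compile(r"<!--@@@DLT_TUBA\s+(?P<key>\S+)-->")

TUBA_LINKS = {
    "duckdb": [
        (
            "Load data from Chess.com to DuckDB in python with dlt",
            "https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-duckdb",
        ),
    ],
}


def expand_tuba_markers(markdown: str) -> str:
    def expand(match: re.Match) -> str:
        links = TUBA_LINKS.get(match.group("key"), [])
        if not links:
            return ""  # nothing to advertise for this destination, drop the marker
        bullets = "\n".join(f"- [{title}]({url})" for title, url in links)
        return f"## Additional Setup guides\n\n{bullets}"

    return TUBA_MARKER.sub(expand, markdown)
```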

17 changes: 4 additions & 13 deletions docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -263,16 +263,7 @@ bigquery_adapter(my_resource, partition="partition_column_name")
my_resource = bigquery_adapter(my_resource, partition="partition_column_name")
```

Refer to the [full API specification](../../../docs/api_reference/destinations/impl/bigquery/bigquery_adapter.md) for more details.

<!--@@@DLT_SNIPPET_START tuba::bigquery-->
## Additional Setup guides

- [Load data from Notion to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-bigquery)
- [Load data from Google Analytics to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-bigquery)
- [Load data from Chess.com to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-bigquery)
- [Load data from HubSpot to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-bigquery)
- [Load data from GitHub to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-bigquery)
- [Load data from Google Sheets to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-bigquery)
- [Load data from Stripe to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-bigquery)
<!--@@@DLT_SNIPPET_END tuba::bigquery-->
Refer to the [full API specification](../../api_reference/destinations/impl/bigquery/bigquery_adapter.md) for more details.

<!--@@@DLT_TUBA bigquery-->

13 changes: 2 additions & 11 deletions docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -185,14 +185,5 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d
### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).

<!--@@@DLT_SNIPPET_START tuba::databricks-->
## Additional Setup guides

- [Load data from GitHub to Databricks in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-databricks)
- [Load data from Notion to Databricks in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-databricks)
- [Load data from Stripe to Databricks in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-databricks)
- [Load data from HubSpot to Databricks in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-databricks)
- [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks)
- [Load data from Google Sheets to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-databricks)
- [Load data from Chess.com to Databricks in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-databricks)
<!--@@@DLT_SNIPPET_END tuba::databricks-->
<!--@@@DLT_TUBA databricks-->

13 changes: 2 additions & 11 deletions docs/website/docs/dlt-ecosystem/destinations/duckdb.md
@@ -114,14 +114,5 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d
### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).

<!--@@@DLT_SNIPPET_START tuba::duckdb-->
## Additional Setup guides

- [Load data from Google Analytics to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-duckdb)
- [Load data from Google Sheets to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-duckdb)
- [Load data from Stripe to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-duckdb)
- [Load data from Notion to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-duckdb)
- [Load data from Chess.com to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-duckdb)
- [Load data from HubSpot to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-duckdb)
- [Load data from GitHub to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-duckdb)
<!--@@@DLT_SNIPPET_END tuba::duckdb-->
<!--@@@DLT_TUBA duckdb-->

3 changes: 1 addition & 2 deletions docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -251,5 +251,4 @@ You can choose the following file formats:
This destination does not support restoring the `dlt` state. You can change that by requesting the [feature](https://github.com/dlt-hub/dlt/issues/new/choose) or contributing to the core library 😄
You can however easily [backup and restore the pipeline working folder](https://gist.github.com/rudolfix/ee6e16d8671f26ac4b9ffc915ad24b6e) - reusing the bucket and credentials used to store files.

<!--@@@DLT_SNIPPET_START tuba::filesystem-->
<!--@@@DLT_SNIPPET_END tuba::filesystem-->
<!--@@@DLT_TUBA filesystem-->
4 changes: 2 additions & 2 deletions docs/website/docs/dlt-ecosystem/destinations/motherduck.md
@@ -98,5 +98,5 @@ Our observation is that if you write a lot of data into the database, then close
### Invalid Input Error: Initialization function "motherduck_init" from file
Use `duckdb 0.8.1` or above.

<!--@@@DLT_SNIPPET_START tuba::motherduck-->
<!--@@@DLT_SNIPPET_END tuba::motherduck-->
<!--@@@DLT_TUBA motherduck-->

13 changes: 2 additions & 11 deletions docs/website/docs/dlt-ecosystem/destinations/mssql.md
@@ -110,14 +110,5 @@ destination.mssql.credentials="mssql://loader:<password>@loader.database.windows
### dbt support
No dbt support yet.

<!--@@@DLT_SNIPPET_START tuba::mssql-->
## Additional Setup guides

- [Load data from Stripe to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-mssql)
- [Load data from Google Analytics to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-mssql)
- [Load data from Google Sheets to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-mssql)
- [Load data from Chess.com to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-mssql)
- [Load data from GitHub to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-mssql)
- [Load data from Notion to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-mssql)
- [Load data from HubSpot to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-mssql)
<!--@@@DLT_SNIPPET_END tuba::mssql-->
<!--@@@DLT_TUBA mssql-->
