diff --git a/docs/examples/chess_production/chess_production.py b/docs/examples/chess_production/chess_production.py
index 9e9fa14cdb..7e7a069e2d 100644
--- a/docs/examples/chess_production/chess_production.py
+++ b/docs/examples/chess_production/chess_production.py
@@ -4,6 +4,17 @@
 description: Learn how run chess pipeline in production
 keywords: [incremental loading, example]
 ---
+
+In this example, you'll find a Python script that interacts with the Chess API to extract players and game data.
+
+We'll learn how to:
+
+- Inspect packages after they have been loaded.
+- Load back load information, schema updates, and traces.
+- Trigger notifications in case of schema evolution.
+- Use context managers to independently retry pipeline stages.
+- Run basic tests utilizing `sql_client` and `normalize_info`.
+
 """

 import threading
diff --git a/docs/examples/connector_x_arrow/connector_x_arrow.py b/docs/examples/connector_x_arrow/connector_x_arrow.py
index 2480c8ff1b..b4ff7cdc82 100644
--- a/docs/examples/connector_x_arrow/connector_x_arrow.py
+++ b/docs/examples/connector_x_arrow/connector_x_arrow.py
@@ -4,6 +4,21 @@
 description: Load data from sql queries fast with connector x and arrow tables
 keywords: [connector x, pyarrow, zero copy]
 ---
+
+The example script below takes genome data from a public **mysql** instance and loads it into **duckdb**. Note that your destination
+must support loading parquet files, as this is the format that `dlt` uses to save arrow tables. [Connector X](https://github.com/sfu-db/connector-x) lets you
+get data from several popular databases and creates an in-memory Arrow table, which `dlt` then saves to a load package and loads to the destination.
+:::tip
+You can yield several tables if your data is large and you need to partition your load.
+:::
+
+We'll learn:
+
+- How to get arrow tables from [Connector X](https://github.com/sfu-db/connector-x) and yield them.
+- That merge and incremental loads work with arrow tables.
+- How to enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction.
+- How to use built-in ConnectionString credentials.
+
 """

 import connectorx as cx
diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py
index df83b011cd..c3654697ac 100644
--- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py
+++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py
@@ -4,6 +4,15 @@
 description: Learn how use the custom destination to load to bigquery and use credentials
 keywords: [destination, credentials, example, bigquery, custom destination]
 ---
+
+In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination.
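+
+A custom destination is, at its core, a function decorated with `@dlt.destination` that receives batches of data items for each table. The snippet below is a minimal, illustrative sketch (the function and pipeline names are made up); the BigQuery version follows in the full source code below:
+
+```py
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=10)
+def print_sink(items: TDataItems, table: TTableSchema) -> None:
+    # dlt calls this function with batches of rows for every table in the load package
+    print(f"received {len(items)} rows for table {table['name']}")
+
+pipeline = dlt.pipeline("sketch_pipeline", destination=print_sink)
+pipeline.run([{"id": 1}, {"id": 2}], table_name="items")
+```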
+
+We'll learn how to:
+- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials);
+- use the [custom destination](../dlt-ecosystem/destinations/destination.md);
+- use pyarrow tables to create complex column types on BigQuery;
+- use BigQuery `autodetect=True` for schema inference from parquet files.
+
 """

 from tests.pipeline.utils import assert_load_info
diff --git a/docs/examples/google_sheets/google_sheets.py b/docs/examples/google_sheets/google_sheets.py
index 93d3e24f55..131cf3e31a 100644
--- a/docs/examples/google_sheets/google_sheets.py
+++ b/docs/examples/google_sheets/google_sheets.py
@@ -4,6 +4,18 @@
 description: Learn how work with Google services
 keywords: [google sheets, credentials, example]
 ---
+
+In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library.
+
+We'll learn how to:
+- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials);
+- use a [union of credentials](../general-usage/credentials/config_specs#working-with-alternatives-of-credentials-union-types);
+- create [dynamically generated resources](../general-usage/source#create-resources-dynamically).
+
+:::tip
+This example is for educational purposes. For best practices, we recommend using the [Google Sheets verified source](../dlt-ecosystem/verified-sources/google_sheets.md).
+:::
+
 """

 __source_name__ = "google_sheets"
diff --git a/docs/examples/incremental_loading/incremental_loading.py b/docs/examples/incremental_loading/incremental_loading.py
index 30d9792958..4098d6027f 100644
--- a/docs/examples/incremental_loading/incremental_loading.py
+++ b/docs/examples/incremental_loading/incremental_loading.py
@@ -4,6 +4,17 @@
 description: Learn how do incremental loading in consecutive runs
 keywords: [incremental loading, example]
 ---
+
+In this example, you'll find a Python script that interacts with the Zendesk Support API to extract ticket events data.
+
+We'll learn:
+
+- How to pass [credentials](../general-usage/credentials) as a dict and how to type the `@dlt.source` function arguments.
+- How to set [the nesting level](../general-usage/source#reduce-the-nesting-level-of-generated-tables).
+- How to enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction.
+- How to specify [the start and end dates](../general-usage/incremental-loading#using-dltsourcesincremental-for-backfill) for the data loading and how to [opt in to the Airflow scheduler](../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) by setting `allow_external_schedulers` to `True`.
+- How to work with timestamps, specifically converting them to Unix timestamps for incremental data extraction.
+- How to use the `start_time` parameter in API requests to retrieve data starting from a specific timestamp (see the short sketch below).
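+
+The core of the incremental pattern is `dlt.sources.incremental`, which keeps the last seen cursor value in the pipeline state between runs. Below is a minimal, self-contained sketch (the endpoint URL and field names are made up for illustration; the real Zendesk code follows below):
+
+```py
+import dlt
+from dlt.sources.helpers import requests
+
+@dlt.resource(primary_key="id", write_disposition="merge")
+def events(
+    # `last_value` starts at `initial_value` and is advanced to the highest `timestamp` seen
+    timestamp=dlt.sources.incremental("timestamp", initial_value=1672531200),
+):
+    response = requests.get(
+        "https://example.com/api/events",  # hypothetical endpoint
+        params={"start_time": timestamp.last_value},
+    )
+    yield response.json()
+```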
""" # because the example below uses credentials and it is copied to the module zendesk.py @@ -134,7 +145,6 @@ def get_pages( load_info = pipeline.run(zendesk_support()) print(load_info) - -# check that stuff was loaded -row_counts = pipeline.last_trace.last_normalize_info.row_counts -assert row_counts["ticket_events"] == 17 + # check that stuff was loaded + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["ticket_events"] == 17 diff --git a/docs/examples/nested_data/nested_data.py b/docs/examples/nested_data/nested_data.py index edc8109c4d..e6d10343b6 100644 --- a/docs/examples/nested_data/nested_data.py +++ b/docs/examples/nested_data/nested_data.py @@ -4,7 +4,18 @@ description: Learn how control nested data keywords: [incremental loading, example] --- + +In this example, you'll find a Python script that demonstrates how to control nested data using the `dlt` library. + +We'll learn how to: +- [Adjust maximum nesting level in three ways:](../general-usage/source#reduce-the-nesting-level-of-generated-tables) + - Limit nesting levels with dlt decorator. + - Dynamic nesting level adjustment. + - Apply data type hints. +- Work with [MongoDB](../dlt-ecosystem/verified-sources/mongodb) in Python and `dlt`. +- Enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction. """ + __source_name__ = "mongodb" from itertools import islice diff --git a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py index 03f8061865..751f35374f 100644 --- a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py +++ b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py @@ -4,6 +4,23 @@ description: Extract text from PDF and load it into a vector database keywords: [pdf, weaviate, vector store, vector database, ] --- + +We'll use PyPDF2 to extract text from PDFs. Make sure you have it installed: + +```sh +pip install PyPDF2 +``` + +We start with a simple resource that lists files in specified folder. To that we add a **filter** function that removes all files that are not pdfs. + +To parse PDFs we use [PyPDF](https://pypdf2.readthedocs.io/en/3.0.0/user/extract-text.html) and return each page from a given PDF as separate data item. + +Parsing happens in `@dlt.transformer` which receives data from `list_files` resource. It splits PDF into pages, extracts text and yields pages separately +so each PDF will correspond to many items in Weaviate `InvoiceText` class. We set the primary key and use merge disposition so if the same PDF comes twice +we'll just update the vectors, and not duplicate. + +Look how we pipe data from `list_files` resource (note that resource is deselected so we do not load raw file items to destination) into `pdf_to_text` using **|** operator. + """ from tests.pipeline.utils import assert_load_info diff --git a/docs/examples/qdrant_zendesk/qdrant_zendesk.py b/docs/examples/qdrant_zendesk/qdrant_zendesk.py index 32df342b32..97e106009f 100644 --- a/docs/examples/qdrant_zendesk/qdrant_zendesk.py +++ b/docs/examples/qdrant_zendesk/qdrant_zendesk.py @@ -4,6 +4,25 @@ description: Learn how to use the dlt source, Zendesk and dlt destination, Qdrant to conduct a similarity search on your tickets data. keywords: [similarity search, example] --- + +This article outlines a system to map vectorized ticket data from Zendesk to Qdrant, similar to our guide on the topic concerning [Weaviate](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant). 
+In this example, we will:
+- Connect to our [Zendesk source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk).
+- Extract tickets data from our Zendesk source.
+- [Create a dlt pipeline](https://dlthub.com/docs/walkthroughs/create-a-pipeline) with Qdrant as the destination.
+- Vectorize/embed the tickets data from Zendesk.
+- Pass the vectorized data to be stored in Qdrant via the dlt pipeline.
+- Query data that we stored in Qdrant.
+- Explore the similarity search results.
+
+First, configure the destination credentials for [Qdrant](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant#setup-guide) and [Zendesk](https://dlthub.com/docs/walkthroughs/zendesk-weaviate#configuration) in `.dlt/secrets.toml`.
+
+Next, make sure you have the following dependencies installed:
+
+```sh
+pip install qdrant-client>=1.6.9
+pip install fastembed>=0.1.1
+```
+
 """

 __source_name__ = "zendesk"
diff --git a/docs/examples/transformers/transformers.py b/docs/examples/transformers/transformers.py
index 08a9d0159c..ebf1f935ba 100644
--- a/docs/examples/transformers/transformers.py
+++ b/docs/examples/transformers/transformers.py
@@ -4,6 +4,17 @@
 description: Learn how to use dlt transformers and how to speed up your loads with parallelism
 keywords: [transformers, parallelism, example]
 ---
+
+For this example, we will be loading Pokemon data from the [PokeAPI](https://pokeapi.co/) with the help of transformers to load
+Pokemon details in parallel.
+
+We'll learn how to:
+- create two [transformers](../general-usage/resource.md#feeding-data-from-one-resource-into-another) and connect them to a resource with the pipe operator `|` (see the sketch after this list);
+- [load these transformers in parallel](../reference/performance.md#parallelism) using the `@dlt.defer` decorator;
+- [configure parallelism](../reference/performance.md#parallel-pipeline-config-example) in the `config.toml` file;
+- deselect the main resource, so it will not be loaded into the database;
+- import and use a pre-configured `requests` library with automatic retries (`from dlt.sources.helpers import requests`).
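+
+The sketch below shows the two core ideas in isolation: piping a resource into a transformer with `|` and parallelizing the transformer with `@dlt.defer`. It is illustrative only; the endpoint and field names are made up, and the full Pokemon example follows below:
+
+```py
+import dlt
+from dlt.sources.helpers import requests
+
+@dlt.resource
+def item_list():
+    # hypothetical listing endpoint that returns a list of {"url": ...} dicts
+    yield from requests.get("https://example.com/api/items").json()
+
+@dlt.transformer
+@dlt.defer  # evaluate each call in a worker thread so details are fetched in parallel
+def item_details(item):
+    return requests.get(item["url"]).json()
+
+pipeline = dlt.pipeline("parallel_sketch", destination="duckdb")
+# `|` feeds items from `item_list` into `item_details`
+print(pipeline.run(item_list | item_details))
+```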
+
 """

 import dlt
@@ -12,7 +23,6 @@

 @dlt.source(max_table_nesting=2)
 def source(pokemon_api_url: str):
-    # note that we deselect `pokemon_list` - we do not want it to be loaded
     @dlt.resource(write_disposition="replace", selected=False)
     def pokemon_list():
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js
index a313367908..8f68e9d5f5 100644
--- a/docs/website/sidebars.js
+++ b/docs/website/sidebars.js
@@ -11,6 +11,19 @@
 // @ts-check

 const fs = require('fs');
+const path = require('path');
+
+
+function *walkSync(dir) {
+  const files = fs.readdirSync(dir, { withFileTypes: true });
+  for (const file of files) {
+    if (file.isDirectory()) {
+      yield* walkSync(path.join(dir, file.name));
+    } else {
+      yield path.join(dir, file.name);
+    }
+  }
+}

 /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */
 const sidebars = {
@@ -270,15 +283,6 @@ const sidebars = {
         keywords: ['examples'],
       },
       items: [
-        'examples/transformers/index',
-        'examples/incremental_loading/index',
-        'examples/connector_x_arrow/index',
-        'examples/chess_production/index',
-        'examples/nested_data/index',
-        'examples/qdrant_zendesk/index',
-        'examples/google_sheets/index',
-        'examples/pdf_to_weaviate/index',
-        'examples/custom_destination_bigquery/index'
       ],
     },
     {
@@ -306,6 +310,19 @@
   ]
 };

+
+// insert examples
+for (const item of sidebars.tutorialSidebar) {
+  if (item.label === 'Code examples') {
+    for (let examplePath of walkSync("./docs_processed/examples")) {
+      examplePath = examplePath.replace("docs_processed/", "");
+      examplePath = examplePath.replace(".md", "");
+      item.items.push(examplePath);
+    }
+  }
+}
+
+
 // inject api reference if it exists
 if (fs.existsSync('./docs_processed/api_reference/sidebar.json')) {
   for (const item of sidebars.tutorialSidebar) {
diff --git a/docs/website/tools/preprocess_docs.js b/docs/website/tools/preprocess_docs.js
index 2e2749cf30..0a2e5f3ad0 100644
--- a/docs/website/tools/preprocess_docs.js
+++ b/docs/website/tools/preprocess_docs.js
@@ -15,9 +15,8 @@ const DOCS_EXTENSIONS = [".md", ".mdx"];
 const SNIPPETS_FILE_SUFFIX = "-snippets.py"

 // examples settings
-const EXAMPLES_DESTINATION_DIR = "./docs/examples/";
+const EXAMPLES_DESTINATION_DIR = `./${MD_TARGET_DIR}examples/`;
 const EXAMPLES_SOURCE_DIR = "../examples/";
-const EXAMPLES_MAIN_SNIPPET_NAME = "example";

 // markers
 const DLT_MARKER = "@@@DLT";
@@ -243,15 +242,105 @@
 }

+function trimArray(lines) {
+    if (lines.length == 0) {
+        return lines;
+    }
+    while (!lines[0].trim()) {
+        lines.shift();
+    }
+    while (!lines[lines.length-1].trim()) {
+        lines.pop();
+    }
+    return lines;
+}
+
 /**
- * Sync examples into examples folder
+ * Sync examples into docs
 */
 function syncExamples() {
+
+    let count = 0;
     for (const exampleDir of listDirsSync(EXAMPLES_SOURCE_DIR)) {
+        if (exampleDir.includes("archive")) {
+            continue;
+        }
+        const exampleName = exampleDir.split("/").slice(-1)[0];
+        const exampleFile = `${EXAMPLES_SOURCE_DIR}${exampleName}/${exampleName}.py`;
+        const targetFileName = `${EXAMPLES_DESTINATION_DIR}/${exampleName}.md`;
+        const lines = fs.readFileSync(exampleFile, 'utf8').split(/\r?\n/);
+
+        let commentCount = 0;
+        let headerCount = 0;
+
+        // separate file content
+        const header = []
+        const markdown = []
+        const code = []
+
+        for (const line of lines) {
+
+            // find file docstring boundaries
+            if (line.startsWith(`"""`)) {
+                commentCount += 1
+                if (commentCount > 2) {
+                    throw new Error();
+                }
+                continue;
+            }
+
+            // find header boundaries
+            if (line.startsWith(`---`)) {
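+                // `---` pairs delimit the docusaurus frontmatter; count them so that
+                // lines seen after the first marker (headerCount == 1) go into `header`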
+                headerCount += 1;
+                if (headerCount > 2) {
+                    throw new Error();
+                }
+                continue;
+            }
+
+            if (headerCount == 1) {
+                header.push(line);
+            }
+            else if (commentCount == 1) {
+                markdown.push(line)
+            }
+            else if (commentCount == 2) {
+                code.push(line);
+            }
+
+        }
+
+        let output = [];
+
+
+        output.push("---")
+        output = output.concat(header);
+        output.push("---")
+
+        // add info box linking to the example source
+        output.push(":::info")
+        const url = `https://github.com/dlt-hub/dlt/tree/devel/docs/examples/${exampleName}`
+        output.push(`The source code for this example can be found in our repository at: `)
+        output.push(url);
+        output.push(":::")
+
+        output.push("## About this Example")
+        output = output.concat(trimArray(markdown));
+
+        output.push("### Full source code")
+        output.push("```py");
+        output = output.concat(trimArray(code));
+        output.push("```");
+        fs.mkdirSync(path.dirname(targetFileName), { recursive: true });
+        fs.writeFileSync(targetFileName, output.join("\n"));
+
+        count += 1;
     }
+    console.log(`Synced ${count} examples`)
 }

+syncExamples();
 preprocess_docs();

 /**
@@ -266,6 +355,7 @@ if (process.argv.includes("--watch")) {
            return;
        }
        console.log('%s changed...', name);
+       syncExamples();
        preprocess_docs();
        lastUpdate = Date.now();
    });