From cf3ac9f41b71dcf5728406794c880df2fd674529 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Mon, 25 Mar 2024 12:55:14 +0100 Subject: [PATCH] docs: enabled listing for docs snippets (#1143) * start fixing blog snippets * fix parsing and linting errors * fix toml snippets --- docs/tools/utils.py | 11 ++++++ ...6-14-dlthub-gpt-accelerated learning_01.md | 19 +++++----- .../blog/2023-08-14-dlt-motherduck-blog.md | 2 +- .../blog/2023-08-21-dlt-lineage-support.md | 2 +- docs/website/blog/2023-08-24-dlt-etlt.md | 6 ++-- docs/website/blog/2023-09-05-mongo-etl.md | 22 ++++++------ .../blog/2023-09-26-verba-dlt-zendesk.md | 29 ++++++++------- docs/website/blog/2023-10-06-dlt-holistics.md | 22 ++++++------ .../blog/2023-10-09-dlt-ops-startups.md | 8 ++--- .../blog/2023-10-16-first-data-warehouse.md | 2 +- docs/website/blog/2023-10-19-dbt-runners.md | 6 ++-- docs/website/blog/2023-10-23-arrow-loading.md | 8 ++--- docs/website/blog/2023-10-25-dlt-deepnote.md | 2 +- docs/website/blog/2023-10-26-dlt-prefect.md | 13 ++++--- .../blog/2023-10-30-data-modelling-tools.md | 4 +-- docs/website/blog/2023-11-01-dlt-dagster.md | 36 +++++++++---------- ...1-22-dlt-webhooks-event-based-ingestion.md | 11 +++--- .../blog/2023-11-27-dlt-data-lineage.md | 20 +++++------ .../blog/2023-12-01-dlt-kestra-demo.md | 2 +- .../blog/2023-12-13-dlt-aws-taktile-blog.md | 12 +++---- .../2024-01-08-streaming-pubsub-json-gcp.md | 2 +- docs/website/blog/2024-01-10-dlt-mode.md | 4 +-- ...01-15-dlt-dbt-runner-on-cloud-functions.md | 9 ++--- .../blog/2024-01-16-dlt-dbt-semantic-layer.md | 16 ++++----- ...24-02-21-pipelines-single-pane-of-glass.md | 2 +- ...2024-03-07-openapi-generation-chargebee.md | 15 ++++---- .../2024-03-11-moving-away-from-segment.md | 17 +++++---- 27 files changed, 158 insertions(+), 144 deletions(-) diff --git a/docs/tools/utils.py b/docs/tools/utils.py index 074b19b8e1..b7d401b893 100644 --- a/docs/tools/utils.py +++ b/docs/tools/utils.py @@ -5,12 +5,15 @@ DOCS_DIR = "../website/docs" +BLOG_DIR = "../website/blog" def collect_markdown_files(verbose: bool) -> List[str]: """ Discovers all docs markdown files """ + + # collect docs pages markdown_files: List[str] = [] for path, _, files in os.walk(DOCS_DIR): if "api_reference" in path: @@ -23,6 +26,14 @@ def collect_markdown_files(verbose: bool) -> List[str]: if verbose: fmt.echo(f"Discovered {os.path.join(path, file)}") + # collect blog pages + for path, _, files in os.walk(BLOG_DIR): + for file in files: + if file.endswith(".md"): + markdown_files.append(os.path.join(path, file)) + if verbose: + fmt.echo(f"Discovered {os.path.join(path, file)}") + if len(markdown_files) < 50: # sanity check fmt.error("Found too few files. Something went wrong.") exit(1) diff --git a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md index 394504dc64..08180b379e 100644 --- a/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md +++ b/docs/website/blog/2023-06-14-dlthub-gpt-accelerated learning_01.md @@ -47,9 +47,11 @@ The code provided below demonstrates training a chat-oriented GPT model using th -```python -!python3 -m pip install --upgrade langchain deeplake openai tiktoken +```sh +python -m pip install --upgrade langchain deeplake openai tiktoken +``` +```py # Create accounts on platform.openai.com and deeplake.ai. After registering, retrieve the access tokens for both platforms and securely store them for use in the next step. 
Enter the access tokens grabbed in the last step and enter them when prompted import os @@ -65,7 +67,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) #### 2. Create a directory to store the code for training the model. Clone the desired repositories into that. -```python +```sh # making a new directory named dlt-repo !mkdir dlt-repo # changing the directory to dlt-repo @@ -80,7 +82,7 @@ embeddings = OpenAIEmbeddings(disallowed_special=()) ``` #### 3. Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -95,7 +97,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): pass ``` #### 4. Load the files from the directory -```python +```py import os from langchain.document_loaders import TextLoader @@ -111,7 +113,7 @@ for dirpath, dirnames, filenames in os.walk(root_dir): ``` #### 5. Splitting files to chunks -```python +```py # This code uses CharacterTextSplitter to split documents into smaller chunksbased on character count and store the resulting chunks in the texts variable. from langchain.text_splitter import CharacterTextSplitter @@ -119,7 +121,8 @@ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) texts = text_splitter.split_documents(docs) ``` #### 6. Create Deeplake dataset -```python + +```sh # Set up your deeplake dataset by replacing the username with your Deeplake account and setting the dataset name. For example if the deeplakes username is “your_name” and the dataset is “dlt-hub-dataset” username = "your_deeplake_username" # replace with your username from app.activeloop.ai @@ -138,7 +141,7 @@ retriever.search_kwargs['maximal_marginal_relevance'] = True retriever.search_kwargs['k'] = 10 ``` #### 7. Initialize the GPT model -```python +```py from langchain.chat_models import ChatOpenAI from langchain.chains import ConversationalRetrievalChain diff --git a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md index 9f48d808a5..21aa7139f3 100644 --- a/docs/website/blog/2023-08-14-dlt-motherduck-blog.md +++ b/docs/website/blog/2023-08-14-dlt-motherduck-blog.md @@ -70,7 +70,7 @@ This is a perfect problem to test out my new super simple and highly customizabl `dlt init bigquery duckdb` This creates a folder with the directory structure - ``` + ```text ├── .dlt │ ├── config.toml │ └── secrets.toml diff --git a/docs/website/blog/2023-08-21-dlt-lineage-support.md b/docs/website/blog/2023-08-21-dlt-lineage-support.md index a76f89ed6a..90f6eb58aa 100644 --- a/docs/website/blog/2023-08-21-dlt-lineage-support.md +++ b/docs/website/blog/2023-08-21-dlt-lineage-support.md @@ -63,7 +63,7 @@ By combining row and column level lineage, you can have an easy overview of wher After a pipeline run, the schema evolution info gets stored in the load info. 
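For example (a minimal sketch, assuming a completed run and its `load_info` object; the attribute names follow dlt's load info structure, as also used later in this post series), the recorded schema updates can be inspected directly in Python:

```py
# sketch: print any schema changes captured during the last run
for package in load_info.load_packages:
    for table_name, table in package.schema_update.items():
        for column_name, column in table["columns"].items():
            print(f"Table {table_name}: new column {column_name} ({column.get('data_type')})")
```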
Load it back to the database to persist the column lineage: -```python +```py load_info = pipeline.run(data, write_disposition="append", table_name="users") diff --git a/docs/website/blog/2023-08-24-dlt-etlt.md b/docs/website/blog/2023-08-24-dlt-etlt.md index 3e27a21338..fb8215c9a0 100644 --- a/docs/website/blog/2023-08-24-dlt-etlt.md +++ b/docs/website/blog/2023-08-24-dlt-etlt.md @@ -83,7 +83,7 @@ This engine is configurable in both how it works and what it does, you can read more here: [Normaliser, schema settings](https://dlthub.com/docs/general-usage/schema#data-normalizer) Here is a usage example (it's built into the pipeline): -```python +```py import dlt @@ -119,7 +119,7 @@ Besides your own customisations, `dlt` also supports injecting your transform co Here is a code example of pseudonymisation, a common case where data needs to be transformed before loading: -```python +```py import dlt import hashlib @@ -168,7 +168,7 @@ load_info = pipeline.run(data_source) Finally, once you have clean data loaded, you will probably prefer to use SQL and one of the standard tools. `dlt` offers a dbt runner to get you started easily with your transformation package. -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', diff --git a/docs/website/blog/2023-09-05-mongo-etl.md b/docs/website/blog/2023-09-05-mongo-etl.md index 19e1f18682..0e4a3d83f2 100644 --- a/docs/website/blog/2023-09-05-mongo-etl.md +++ b/docs/website/blog/2023-09-05-mongo-etl.md @@ -139,21 +139,21 @@ Here's a code explanation of how it works under the hood: example of how this nested data could look: ```json - data = { - 'id': 1, - 'name': 'Alice', - 'job': { + { + "id": 1, + "name": "Alice", + "job": { "company": "ScaleVector", - "title": "Data Scientist", + "title": "Data Scientist" }, - 'children': [ + "children": [ { - 'id': 1, - 'name': 'Eve' + "id": 1, + "name": "Eve" }, { - 'id': 2, - 'name': 'Wendy' + "id": 2, + "name": "Wendy" } ] } @@ -161,7 +161,7 @@ Here's a code explanation of how it works under the hood: 1. We can load the data to a supported destination declaratively: - ```python + ```py import dlt pipeline = dlt.pipeline( diff --git a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md index 1990a5df7f..f3825b4427 100644 --- a/docs/website/blog/2023-09-26-verba-dlt-zendesk.md +++ b/docs/website/blog/2023-09-26-verba-dlt-zendesk.md @@ -40,7 +40,7 @@ In this blog post, we'll guide you through the process of building a RAG applica Create a new folder for your project and install Verba: -```bash +```sh mkdir verba-dlt-zendesk cd verba-dlt-zendesk python -m venv venv @@ -50,7 +50,7 @@ pip install goldenverba To configure Verba, we need to set the following environment variables: -```bash +```sh VERBA_URL=https://your-cluster.weaviate.network # your Weaviate instance URL VERBA_API_KEY=F8...i4WK # the API key of your Weaviate instance OPENAI_API_KEY=sk-...R # your OpenAI API key @@ -61,13 +61,13 @@ You can put them in a `.env` file in the root of your project or export them in Let's test that Verba is installed correctly: -```bash +```sh verba start ``` You should see the following output: -```bash +```sh INFO: Uvicorn running on (Press CTRL+C to quit) ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -88,7 +88,7 @@ If you try to ask a question now, you'll get an error in return. That's because We get our data from Zendesk using dlt. 
Let's install it along with the Weaviate extra: -```bash +```sh pip install "dlt[weaviate]" ``` @@ -96,7 +96,7 @@ This also installs a handy CLI tool called `dlt`. It will help us initialize the Let's initialize the verified source: -```bash +```sh dlt init zendesk weaviate ``` @@ -104,7 +104,7 @@ dlt init zendesk weaviate To make things easier, we'll use the email address and password authentication method for Zendesk API. Let's add our credentials to `secrets.toml`: -```yaml +```toml [sources.zendesk.credentials] password = "your-password" subdomain = "your-subdomain" @@ -113,14 +113,13 @@ email = "your-email@example.com" We also need to specify the URL and the API key of our Weaviate instance. Copy the credentials for the Weaviate instance you created earlier and add them to `secrets.toml`: -```yaml +```toml [destination.weaviate.credentials] url = "https://your-cluster.weaviate.network" api_key = "F8.....i4WK" [destination.weaviate.credentials.additional_headers] X-OpenAI-Api-Key = "sk-....." - ``` All the components are now in place and configured. Let's set up a pipeline to import data from Zendesk. @@ -129,7 +128,7 @@ All the components are now in place and configured. Let's set up a pipeline to i Open your favorite text editor and create a file called `zendesk_verba.py`. Add the following code to it: -```python +```py import itertools import dlt @@ -217,13 +216,13 @@ Finally, we run the pipeline and print the load info. Let's run the pipeline: -```bash +```sh python zendesk_verba.py ``` You should see the following output: -```bash +```sh Pipeline zendesk_verba completed in 8.27 seconds 1 load package(s) were loaded to destination weaviate and into dataset None The weaviate destination used location to store data @@ -235,13 +234,13 @@ Verba is now populated with data from Zendesk Support. However there are a coupl Run the following command: -```bash +```sh verba init ``` You should see the following output: -```bash +```sh ===================== Creating Document and Chunk class ===================== ℹ Setting up client ✔ Client connected to Weaviate Cluster @@ -264,7 +263,7 @@ Document class already exists, do you want to overwrite it? (y/n): n We're almost there! Let's start Verba: -```bash +```sh verba start ``` diff --git a/docs/website/blog/2023-10-06-dlt-holistics.md b/docs/website/blog/2023-10-06-dlt-holistics.md index b2791bd2ec..c5e9b2ca46 100644 --- a/docs/website/blog/2023-10-06-dlt-holistics.md +++ b/docs/website/blog/2023-10-06-dlt-holistics.md @@ -92,7 +92,7 @@ In this section, we walk through how to set up a MongoDB data pipeline using `dl Use the command below to install `dlt`. -```bash +```sh pip3 install -U dlt ``` @@ -100,13 +100,13 @@ Consider setting up a virtual environment for your projects and installing the p Once we have `dlt` installed, we can go ahead and initialize a verified MongoDB pipeline with the destination set to Google BigQuery. First, create a project directory and then execute the command below: -```python +```sh dlt init mongodb bigquery ``` The above command will create a local ready-made pipeline that we can customize to our needs. After executing the command your project directory will look as follows: -```bash +```text . ├── .dlt │ ├── config.toml @@ -127,7 +127,7 @@ We also need to set up the GCP service account credentials to get permissions to Once all the credentials are set add them to the `secrets.toml` file. Your file should look something like this: -```bash +```toml # put your secret values and credentials here. 
do not share this file and do not push it to github [sources.mongodb] connection_url = "mongodb+srv://:@.cvanypn.mongodb.net" # please set me up! @@ -143,7 +143,7 @@ client_email = "@analytics.iam.gserviceaccount.com" # please set me up The `mongodb_pipeline.py` at the root of your project directory is the script that runs the pipeline. It contains many functions that provide different ways of loading the data. The selection of the function depends on your specific use case, but for this demo, we try to keep it simple and use the `load_entire_database` function. -```python +```py def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: """Use the mongo source to completely load all collection in a database""" if pipeline is None: @@ -165,13 +165,13 @@ def load_entire_database(pipeline: Pipeline = None) -> LoadInfo: Before we execute the pipeline script let's install the dependencies for the pipeline by executing the `requirements.txt` file. -```bash +```sh pip install -r requirements.txt ``` Finally, we are ready to execute the script. In the main function uncomment the `load_entire_database` function call and run the script. -```bash +```sh python mongodb_pipeline.py ``` @@ -290,7 +290,7 @@ This is a typical way data is structured in a NoSQL database. The data is in a J The ddl (data definition language) for the movies table in BigQuery can be seen below: -```json +```sql CREATE TABLE `dlthub-analytics.mongo_database.movies` ( _id STRING NOT NULL, @@ -354,7 +354,7 @@ In Holistics, add a new data source click on the plus sign (+) on the top menu, Once the BigQuery source is added we are ready to import the schemas from BigQuery into Holistics. The schema(`dataset_name`) name under which dlt loaded the MongoDB data is defined in the `load_entire_database` function when we create the MongoDB pipeline. -```bash +```sh # Create a pipeline pipeline = dlt.pipeline( pipeline_name="local_mongo", @@ -399,13 +399,13 @@ The resulting relationship can seen As Code using the Holistics 4.0 Analytics as Previously, we created the relationship between the `cast` and the `movies` tables using GUI, now let’s add the relationship between the `directors` and `movies` tables using the Analytics as Code feature. In the `dataset.aml` file append the relationships block with the following line of code: -```python +```py relationship(model__mongo_database_movies_directors.dlt_parent_id > model__mongo_database_movies.dlt_id, true) ``` After the change, the `dataset.aml` file should look like this: -```python +```sh import '../Models/mongo_database_movies.model.aml' { mongo_database_movies as model__mongo_database_movies } diff --git a/docs/website/blog/2023-10-09-dlt-ops-startups.md b/docs/website/blog/2023-10-09-dlt-ops-startups.md index 94c1ff662b..dd21725f90 100644 --- a/docs/website/blog/2023-10-09-dlt-ops-startups.md +++ b/docs/website/blog/2023-10-09-dlt-ops-startups.md @@ -61,14 +61,14 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Open `.dlt/secrets.toml` file on your laptop. - Enter the OpenAI secrets: - ``` + ```toml [sources.unstructured_data] openai_api_key = "openai_api_key" ``` - Enter your email account secrets in the same section `[sources.unstructured_data]`: - ``` + ```toml host = 'imap.example.com' email_account = "example@example.com" password = 'set me up!' 
@@ -78,7 +78,7 @@ The `dlt` [init command](https://dlthub.com/docs/reference/command-line-interfac - Enter the BigQuery secrets: - ``` + ```toml [destination.bigquery] location = "US" [destination.bigquery.credentials] @@ -96,7 +96,7 @@ This is the part where you can define what you’d like to see as an outcome. Queries example: -```python +```py INVOICE_QUERIES = { "recipient_company_name": "Who is the recipient of the invoice? Just return the name. If you don't know, then return None", "invoice_amount": "What is the total amount of the invoice? Just return the amount as decimal number, no currency or text. If you don't know, then return None", diff --git a/docs/website/blog/2023-10-16-first-data-warehouse.md b/docs/website/blog/2023-10-16-first-data-warehouse.md index 79186fd267..641751eb1d 100644 --- a/docs/website/blog/2023-10-16-first-data-warehouse.md +++ b/docs/website/blog/2023-10-16-first-data-warehouse.md @@ -75,7 +75,7 @@ For those new to pushing data via an API, it may seem intimidating. Let's simplify - sending data to an API endpoint for loading or updating an object is similar to making a `GET` request. Here's a straightforward example in Python: -```python +```py # Assuming data is in this format import requests # assume we have a table of contacts we want to push to Pipedrive. diff --git a/docs/website/blog/2023-10-19-dbt-runners.md b/docs/website/blog/2023-10-19-dbt-runners.md index 713815abb0..9eb22c050f 100644 --- a/docs/website/blog/2023-10-19-dbt-runners.md +++ b/docs/website/blog/2023-10-19-dbt-runners.md @@ -149,7 +149,7 @@ The Cloud runner we support can do the following: - Check the status of a dbt job in your account. Code example: -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with additional data @@ -179,7 +179,7 @@ The core runner does the following: - Execute the package and report the outcome. Code example: -```python +```py # Create a transformation on a new dataset called 'pipedrive_dbt' # we created a local dbt package # and added pipedrive_raw to its sources.yml @@ -210,7 +210,7 @@ for m in models: f"Model {m.model_name} materialized" + f"in {m.time}" + f"with status {m.status}" + - f"and message {m.message}" + f"and message {m.message}") ``` ## 4. A short demo on how to do that with dlt’s dbt runner. diff --git a/docs/website/blog/2023-10-23-arrow-loading.md b/docs/website/blog/2023-10-23-arrow-loading.md index 978586fa76..2f25511d73 100644 --- a/docs/website/blog/2023-10-23-arrow-loading.md +++ b/docs/website/blog/2023-10-23-arrow-loading.md @@ -18,13 +18,13 @@ Here we achieved ~30x speedups when loading data from (local) postgres database We’ll start with [ConnectorX library](https://github.com/sfu-db/connector-x) that creates Arrow tables from SQL queries on most of the popular database engines. -```python +```sh pip install connectorx ``` Lib has Rust inside, zero copy extraction and is amazingly fast. We’ll extract and normalize 10 000 000 [test rows](https://github.com/dlt-hub/verified-sources/blob/master/tests/sql_database/sql_source.py#L88) from local postgresql. The table **chat_message** looks like Slack messages dump. Messages have unique autoincrement **id** which we use to load in chunks: -```python +```py import connectorx as cx import dlt from dlt.sources.credentials import ConnectionStringCredentials @@ -49,7 +49,7 @@ chat_messages = dlt.resource( In this demo I just extract and normalize data and skip the loading step. 
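For completeness, here is a hedged sketch (not part of the original benchmark) of what the skipped loading step would look like: a single `pipeline.run()` call on the same resource.

```py
# hypothetical full load of the chat_messages resource defined above
pipeline = dlt.pipeline(pipeline_name="arrow_demo", destination="duckdb")
load_info = pipeline.run(chat_messages)
print(load_info)
```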
-```python +```py pipeline = dlt.pipeline(destination="duckdb", full_refresh=True) # extract first pipeline.extract(chat_messages) @@ -78,7 +78,7 @@ Step normalize COMPLETED in 0.08 seconds. Here’s corresponding code working with **SqlAlchemy**. We process 10 000 000 rows, yielding in 100k rows packs and normalize to parquet in 3 parallel processes. -```python +```py from itertools import islice import dlt from sqlalchemy import create_engine diff --git a/docs/website/blog/2023-10-25-dlt-deepnote.md b/docs/website/blog/2023-10-25-dlt-deepnote.md index 864353a36d..2674ceae7d 100644 --- a/docs/website/blog/2023-10-25-dlt-deepnote.md +++ b/docs/website/blog/2023-10-25-dlt-deepnote.md @@ -37,7 +37,7 @@ likely than not, you spend more time fixing data pipelines or data formats then on ML algorithms or dashboard designs. We aren’t always lucky enough to get structured data to work with. Imagine a world where your training data is just this statement without no prior work: -```jsx +```sql select * from ``` diff --git a/docs/website/blog/2023-10-26-dlt-prefect.md b/docs/website/blog/2023-10-26-dlt-prefect.md index 8bd6321489..6e9caa3fea 100644 --- a/docs/website/blog/2023-10-26-dlt-prefect.md +++ b/docs/website/blog/2023-10-26-dlt-prefect.md @@ -82,8 +82,7 @@ It would take some effort to interpret even a simple response like this one for "updated": 1502138686, "is_app_user": false, "has_2fa": false - }, - // ... (more data) + } ] } ``` @@ -92,14 +91,14 @@ You can use dlt to build a Slack to BigQuery pipeline in just a few seconds with Seriously, it is that simple. In preparation, let’s make sure to install what we need: -```bash +```sh pip install dlt pip install prefect ```` Then just run a simple init command: -```bash +```sh dlt init slack bigquery ``` @@ -126,7 +125,7 @@ Note that we are redacting some of the code in the preview for brevity, to follow along completely navigate to the repo. -```python +```py # Pipeline to load Slack into BigQuery from typing import List @@ -190,14 +189,14 @@ that can make sure your pipelines aren’t causing you stress in the middle of t Make sure you’re logged in to Prefect Cloud by [signing up](https://app.prefect.cloud/?utm_source=dltblog) and using the following command: -```bash +```sh prefect cloud login ``` Luckily, Prefect is also incredibly Pythonic. Turning any pipeline into an observable, scheduled Prefect flow is as simple as adding decorators to your functions and `serving` it up. Here’s our `dlt` generated pipeline, scheduled daily: -```python +```py from typing import List import dlt diff --git a/docs/website/blog/2023-10-30-data-modelling-tools.md b/docs/website/blog/2023-10-30-data-modelling-tools.md index e5839ee66e..960d80a569 100644 --- a/docs/website/blog/2023-10-30-data-modelling-tools.md +++ b/docs/website/blog/2023-10-30-data-modelling-tools.md @@ -71,7 +71,7 @@ Our database is based on the data published by [LivWell](https://www.nature.com/ Sample input structure: -```jsx +```py [{"survey_id": "AM2000DHS", "country": "Armenia", "marriage_related": [{...}, {...}, ...], @@ -81,7 +81,7 @@ Sample input structure: "health_related": [{...}, {...}, ...], "age_related": [{...}, {...}, ...] }, - {...}, {...}, {...}, ...}] + {...}, {...}, {...}, {...}] ``` To break it up into proper tables representing the different sections of the surveys, we gave this data to **dlt** to unpack it into a flat relational structure into BigQuery. dlt automatically unpacked the original data into connected tables. 
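A minimal sketch of that load step (hypothetical pipeline and variable names, assuming the survey records are available as a Python list called `survey_data`):

```py
import dlt

# hypothetical pipeline; dlt normalizes the nested lists (marriage_related,
# health_related, ...) into child tables linked back to the parent table
pipeline = dlt.pipeline(
    pipeline_name="livwell_surveys",
    destination="bigquery",
    dataset_name="wellness_data",
)
load_info = pipeline.run(survey_data, table_name="wellness")
print(load_info)
```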
The various child tables link to the parent table `wellness` using foreign keys. `Wellness` contains surveys identified by ID and country. The final setup of indicators broken up into different categories can be found below, as displayed by Power BI. This structured database has been used to experiment with all three dashboarding tools in this article. diff --git a/docs/website/blog/2023-11-01-dlt-dagster.md b/docs/website/blog/2023-11-01-dlt-dagster.md index 4da685be73..dc05a35bff 100644 --- a/docs/website/blog/2023-11-01-dlt-dagster.md +++ b/docs/website/blog/2023-11-01-dlt-dagster.md @@ -33,7 +33,7 @@ As we will be ingesting data into BigQuery we first need to create service accou Once we have the credentials we are ready to begin. Let’s first install Dagster and `dlt`. The below commands should install both. -```python +```sh pip install dlt pip install dagster dagster-webserver ``` @@ -42,13 +42,13 @@ pip install dagster dagster-webserver As a first step, we will create the GitHub issues pipeline using `dlt`. -```bash +```sh dlt init github_issues bigquery ``` This will generate a template for us to create a new pipeline. Under `.dlt/secrets.toml` add the service account credentials for BigQuery. Then in the `github_issues.py` delete the generated code and add the following: -```python +```py @dlt.resource(write_disposition="append") def github_issues_resource(api_secret_key=dlt.secrets.value): owner = 'dlt-hub' @@ -88,7 +88,7 @@ The above code creates a simple **github_issues** pipeline that gets the issues To run the pipeline execute the below commands: -```bash +```sh pip install -r requirements.txt python github_issues.py ``` @@ -103,7 +103,7 @@ We will need to adjust our pipeline a bit to orchestrate it using Dagster. - Create a new directory for your Dagster project and scaffold the basic structure: -```bash +```sh mkdir dagster_github_issues cd dagster_github_issues dagster project scaffold --name github-issues @@ -115,7 +115,7 @@ This will generate the default files for Dagster that we will use as a starting - Inside the `github-issues/github_issues` directory create the following folders: `assets`, `resources`, and `dlt`. -```bash +```sh . ├── README.md ├── github_issues @@ -143,7 +143,7 @@ This will generate the default files for Dagster that we will use as a starting - Define a `DltResource` class in `resources/__init__.py` as a Dagster configurable resource. This class allows you to reuse pipeline code inside an asset. -```python +```py from dagster import ConfigurableResource import dlt @@ -167,7 +167,7 @@ class DltResource(ConfigurableResource): - Define the asset, `issues_pipeline`, in `assets/__init__.py`. This asset uses the configurable resource to create a dlt pipeline and ingests data into BigQuery. -```python +```py from dagster import asset, get_dagster_logger from ..resources import DltResource from ..dlt import github_issues_resource @@ -188,12 +188,12 @@ The defined asset (**issues_pipeline**) takes as input the configurable resource - Add the schema evolution code to the asset to make our pipelines more resilient to changes. -```python +```py from dagster import AssetExecutionContext @asset def issues_pipeline(context: AssetExecutionContext, pipeline: DltResource): -... -md_content="" + ... 
+ md_content="" for package in result.load_packages: for table_name, table in package.schema_update.items(): for column_name, column in table["columns"].items(): @@ -207,7 +207,7 @@ md_content="" - In the `__init.py__` under the **github_issues** folder add the definitions: -```python +```py all_assets = load_assets_from_modules([assets]) simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) @@ -255,20 +255,20 @@ One of the main strengths of `dlt` lies in its ability to extract, normalize, an - Start by creating a new Dagster project scaffold: -```python +```sh dagster project scaffold --name mongodb-dlt ``` - Follow the steps mentioned earlier and create an `assets`, and `resources` directory under `mongodb-dlt/mongodb_dlt`. - Initialize a `dlt` MongoDB pipeline in the same directory: -```python +```sh dlt init mongodb bigquery ``` This will create a template with all the necessary logic implemented for extracting data from MongoDB. After running the command your directory structure should be as follows: -```python +```text . ├── README.md ├── mongodb_dlt @@ -303,7 +303,7 @@ Next, create a `.env` file and add the BigQuery and MongoDB credentials to the f Create a `DltResouce` under the **resources** directory. Add the following code to the `__init__.py`: -```python +```py from dagster import ConfigurableResource import dlt @@ -335,7 +335,7 @@ In the `mongodb_pipeline.py` file, locate the `load_select_collection_hint_db` f In the `__init__.py` file under the **assets** directory, define the `dlt_asset_factory`: -```python +```py from ..mongodb import mongodb from ..resources import DltResource @@ -386,7 +386,7 @@ dlt_assets = dlt_asset_factory(DATABASE_COLLECTIONS) Add the definitions in the `__init__.py` in the root directory: -```python +```py from dagster import Definitions from .assets import dlt_assets diff --git a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md index 292879fc95..aa433dc883 100644 --- a/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md +++ b/docs/website/blog/2023-11-22-dlt-webhooks-event-based-ingestion.md @@ -79,7 +79,7 @@ in-depth guide, please refer to the detailed documentation. 1. Click 'Create Function' in Cloud Functions, and select your region and environment setup. 1. Choose HTTP as the trigger, enable 'Allow unauthenticated invocations', save, and click 'Next'. 1. Set the environment to Python 3.10 and prepare to insert code into main.py: - ```python + ```py import dlt import json import time @@ -106,7 +106,7 @@ in-depth guide, please refer to the detailed documentation. dlt[bigquery] ``` 1. Post-deployment, a webhook URL is generated, typically following a specific format. - ```bash + ```sh https://{region]-{project-id}.cloudfunctions.net/{cloud-function-name} ``` @@ -140,7 +140,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. Here’s what `main.py` looks like: - ```python + ```py import dlt from flask import jsonify @@ -215,7 +215,7 @@ Set up the webhook by creating a cloud function, using the same steps as for the 1. 
Here’s what `main.py`looks like: - ```python + ```py import dlt from flask import jsonify @@ -227,7 +227,8 @@ Set up the webhook by creating a cloud function, using the same steps as for the # Initialize and configure the DLT pipeline pipeline = dlt.pipeline( - pipeline_name=ßigquery', # Destination service for the data + pipeline_name="hubspot", + destination='bigquery', # Destination service for the data dataset_name='hubspot_webhooks_dataset', # BigQuery dataset name ) diff --git a/docs/website/blog/2023-11-27-dlt-data-lineage.md b/docs/website/blog/2023-11-27-dlt-data-lineage.md index 233ef58800..d91659eb6b 100644 --- a/docs/website/blog/2023-11-27-dlt-data-lineage.md +++ b/docs/website/blog/2023-11-27-dlt-data-lineage.md @@ -42,7 +42,7 @@ The **load_info** produced by `dlt` for both pipelines is also populated into Bi To get started install `dlt` and dbt: -```jsx +```sh pip install dlt pip install dbt-bigquery ``` @@ -59,13 +59,13 @@ We use the following CSV files as our data sources for this demo: To get started we initialize a dlt pipeline and selecting BigQuery as our destination by running the following command: -```python +```sh dlt init data_lineage bigquery ``` This will create default scaffolding to build our pipeline. Install the dependencies by running the following command: -```python +```sh pip install -r requirements.txt ``` @@ -76,7 +76,7 @@ As a first step, we will load the sales data from the online and physical store In the `data_lineage.py` file remove the default code and add the following: -```python +```py FILEPATH = "data/supermarket_sales.csv" FILEPATH_SHOPIFY = "data/orders_export_1.csv" @@ -109,7 +109,7 @@ Any changes in the underlying data are captured by the dlt **load_info**. To sho We will add the **load_info** back to BigQuery to use in our Dashboard. The Dashboard will provide an overview data lineage for our ingested data. -```python +```py if __name__ == "__main__": data_store = pd.read_csv(FILEPATH) @@ -134,7 +134,7 @@ if __name__ == "__main__": dataset_name='sales_shopify' ) - load_a = pipeline_store.run_pipeline( + load_a = pipeline_store.run_pipeline( data=select_c_data_store, table_name='sales_info', write_disposition='replace' @@ -161,7 +161,7 @@ if __name__ == "__main__": To run the pipeline, execute the following command: -```python +```sh python data_lineage.py ``` @@ -175,7 +175,7 @@ Now that both the Shopify and Store data are available in BigQuery, we will use To get started initialize a dbt project in the root directory: -```python +```sh dbt init sales_dbt ``` @@ -244,7 +244,7 @@ In the query, we combine the **load_info** for both sources by doing a union ove In the `data_lineage.py` add the code to run the dbt package using `dlt`. 
-```python +```py pipeline_transform = dlt.pipeline( pipeline_name='pipeline_transform', destination='bigquery', @@ -271,7 +271,7 @@ for m in models: Next, run the pipeline using the following command: -```python +```sh python data_lineage.py ``` diff --git a/docs/website/blog/2023-12-01-dlt-kestra-demo.md b/docs/website/blog/2023-12-01-dlt-kestra-demo.md index da47384194..9f1d7acba2 100644 --- a/docs/website/blog/2023-12-01-dlt-kestra-demo.md +++ b/docs/website/blog/2023-12-01-dlt-kestra-demo.md @@ -78,7 +78,7 @@ In my scenario, the email data doesn't have nested structures, so there's no nee Here's how the pipeline is defined and subsequently run in the first task of the main flow in **`Kestra`**: -```python +```py # Run dlt pipeline to load email data from gmail to BigQuery pipeline = dlt.pipeline( pipeline_name="standard_inbox", diff --git a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md index c819f90741..296d303dcb 100644 --- a/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md +++ b/docs/website/blog/2023-12-13-dlt-aws-taktile-blog.md @@ -46,13 +46,13 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 1. Install the SAM CLI [add link or command here] - ```bash + ```sh pip install aws-sam-cli ``` 2. Define your resources in a `template.yml` file - ```yaml + ```text AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 @@ -86,7 +86,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM Effect: Allow Action: - secretsmanager:GetSecretValue - Resource: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_* + Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:DLT_*" Metadata: DockerTag: dlt-aws DockerContext: . @@ -99,13 +99,13 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 3. Build a deployment package - ```bash + ```sh sam build ``` 4. Test your setup locally - ```bash + ```sh sam local start-api # in a second terminal window @@ -114,7 +114,7 @@ SAM is a lightweight Infrastructure-As-Code framework provided by AWS. Using SAM 5. Deploy your resources to AWS - ```bash + ```sh sam deploy --stack-name= --resolve-image-repos --resolve-s3 --capabilities CAPABILITY_IAM ``` diff --git a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md index d31d9a7e3a..e6e7d2ba18 100644 --- a/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md +++ b/docs/website/blog/2024-01-08-streaming-pubsub-json-gcp.md @@ -125,7 +125,7 @@ By using this micro-batch architecture, we strive to maintain a balance of datab insert efficiency (by writing multiple records at a time) with near real-time insertion (by keeping the window size around 5 seconds). -```python +```py pipeline = dlt.pipeline( pipeline_name="pubsub_dlt", diff --git a/docs/website/blog/2024-01-10-dlt-mode.md b/docs/website/blog/2024-01-10-dlt-mode.md index b92425184d..1d6bf8ca0e 100644 --- a/docs/website/blog/2024-01-10-dlt-mode.md +++ b/docs/website/blog/2024-01-10-dlt-mode.md @@ -123,13 +123,13 @@ With the model we just created, called Products, a chart can be instantly create In this demo, we’ll forego the authentication issues of connecting to a data warehouse, and choose the DuckDB destination to show how the Python environment within Mode can be used to initialize a data pipeline and dump normalized data into a destination. 
In order to see how it works, we first install dlt[duckdb] into the Python environment. -```python +```sh !pip install dlt[duckdb] ``` Next, we initialize the dlt pipeline: -```python +```py # initializing the dlt pipeline with your # data warehouse destination pipeline = dlt.pipeline( diff --git a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md index b36748aed9..e21154d98e 100644 --- a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md +++ b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md @@ -86,7 +86,7 @@ We recommend setting up and testing dbt-core locally before using it in cloud fu 1. Next, modify the `main.py` as follows: - ```python + ```py import os import subprocess import logging @@ -191,9 +191,10 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Next, configure the `main.py` as follows: - ```python + ```py import dlt - import logging, json + import logging + import json from flask import jsonify from dlt.common.runtime.slack import send_slack_message @@ -306,7 +307,7 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Next, list runtime-installable modules in `requirements.txt`: - ``` + ```sh dbt-core dbt-bigquery ``` diff --git a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md index e67e203caf..415a55f9b9 100644 --- a/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md +++ b/docs/website/blog/2024-01-16-dlt-dbt-semantic-layer.md @@ -38,7 +38,7 @@ Here’s how a pipeline could look: The data being used is of a questionnaire, which includes questions, the options of those questions, respondents and responses. This data is contained within a nested json object, that we’ll pass as a raw source to `dlt` to structure, normalize and dump into a BigQuery destination. -```python +```py # initializing the dlt pipeline with your data warehouse destination pipeline = dlt.pipeline( pipeline_name="survey_pipeline", @@ -89,20 +89,20 @@ measures: - name: surveys_total description: The total surveys for each --dimension. agg: count - # if all rows need to be counted then expr = 1 + # if all rows need to be counted then expr = 1 expr: 1 # where in SQL you would: group by columns dimensions: - # default dbt requirement + # default dbt requirement - name: surveyed_at type: time type_params: time_granularity: day # count entry per answer - - name: people_per_color + - name: people_per_color type: categorical expr: answer - # count entry per question + # count entry per question - name: question type: categorical expr: question @@ -117,10 +117,10 @@ metrics: type: simple label: Favorite Colors type_params: - # reference of the measure created in the semantic model + # reference of the measure created in the semantic model measure: surveys_total - filter: | # adding a filter on the "question" column for asking about favorite color - {{ Dimension('id__question') }} = 'What is your favorite color?' + filter: | # adding a filter on the "question" column for asking about favorite color + {{ Dimension('id__question') }} = 'What is your favorite color?' 
``` The DAG then looks like this: diff --git a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md index 553284bc6f..ff54c463bd 100644 --- a/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md +++ b/docs/website/blog/2024-02-21-pipelines-single-pane-of-glass.md @@ -42,7 +42,7 @@ Since “checking” things can be tedious, we rather forget about it and be not Here’s a gist of how to use it -```python +```py from dlt.common.runtime.slack import send_slack_message def run_pipeline_and_notify(pipeline, data): diff --git a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md index 367f8db2ca..3d77c3ea4c 100644 --- a/docs/website/blog/2024-03-07-openapi-generation-chargebee.md +++ b/docs/website/blog/2024-03-07-openapi-generation-chargebee.md @@ -90,7 +90,7 @@ There were no great challenges. The most ~~difficult~~ tedious probably was to m 1) Authentication The provided Authentication was a bit off. The generated code assumed the using of a username and password but what was actually required was — an empty username + api_key as a password. So super easy fix was changing -```python +```py def to_http_params(self) -> CredentialsHttpParams: cred = f"{self.api_key}:{self.password}" if self.password else f"{self.username}" encoded = b64encode(f"{cred}".encode()).decode() @@ -99,9 +99,9 @@ def to_http_params(self) -> CredentialsHttpParams: to -```python +```py def to_http_params(self) -> CredentialsHttpParams: - encoded = b64encode(f"{self.api_key}".encode()).decode() + encoded = b64encode(f"{self.api_key}".encode()).decode() return dict(cookies={}, headers={"Authorization": "Basic " + encoded}, params={}) ``` @@ -111,13 +111,14 @@ Also I was pleasantly surprised that generator had several different authenticat For the code generator it’s hard to guess a pagination method by OpenAPI specification, so the generated code has no pagination 😞. So I had to replace a line -```python -yield _build_response(requests.request(**kwargs)) +```py +def f(): + yield _build_response(requests.request(**kwargs)) ``` with yielding form a 6-lines `get_page` function -```python +```py def get_pages(kwargs: Dict[str, Any], data_json_path): has_more = True while has_more: @@ -133,7 +134,7 @@ The downside — I had to do it for each resource. The code wouldn’t run because it wasn’t able to find some models. I found a commented line in generator script -```python +```py # self._build_models() ``` diff --git a/docs/website/blog/2024-03-11-moving-away-from-segment.md b/docs/website/blog/2024-03-11-moving-away-from-segment.md index f834e25060..4f4b7d0a80 100644 --- a/docs/website/blog/2024-03-11-moving-away-from-segment.md +++ b/docs/website/blog/2024-03-11-moving-away-from-segment.md @@ -67,7 +67,7 @@ Next, we focus on establishing the necessary permissions for our pipeline. A cru Please refer to the Google Cloud documentation [here](https://cloud.google.com/iam/docs/service-accounts-create#console) to set up a service account. Once created, it's important to assign the necessary permissions to the service account. The project [README](https://github.com/dlt-hub/dlt_pubsub_demo) lists the necessary permissions. Finally, generate a key for the created service account and download the JSON file. Pass the credentials as environment variables in the project root directory. 
-```bash +```sh export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" ``` @@ -75,7 +75,7 @@ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" To set up our pipeline, start by cloning the [GitHub Repository](https://github.com/dlt-hub/dlt_pubsub_demo). The repository contains all the necessary components, structured as follows: -```bash +```sh . ├── README.md ├── cloud_functions @@ -102,7 +102,7 @@ Meanwhile, the **cloud_functions** folder includes the code for the Cloud Functi To begin, integrate the service account credentials with Terraform to enable authorization and resource management on Google Cloud. Edit the `terraform/main.tf` file to include the path to your service account's credentials file as follows: -```bash +```sh provider "google" { credentials = file("./../credentials.json") project = var.project_id @@ -114,7 +114,7 @@ provider "google" { Next, in the `terraform/variables.tf` define the required variables. These variables correspond to details within your `credentials.json` file and include your project's ID, the region for resource deployment, and any other parameters required by your Terraform configuration: -```bash +```sh variable "project_id" { type = string default = "Add Project ID" @@ -128,7 +128,6 @@ variable "region" { variable "service_account_email" { type = string default = "Add Service Account Email" - } ``` @@ -138,7 +137,7 @@ We are now ready to set up some cloud resources. To get started, navigate into t With the initialization complete, you're ready to proceed with the creation of your cloud resources. To do this, run the following Terraform commands in sequence. These commands instruct Terraform to plan and apply the configurations defined in your `.tf` files, setting up the infrastructure on Google Cloud as specified. -```bash +```sh terraform plan terraform apply ``` @@ -161,7 +160,7 @@ The following resources are created on Google Cloud once `terraform apply` comma Now that our cloud infrastructure is in place, it's time to activate the event publisher. Look for the `publisher.py` file in the project root directory. You'll need to provide specific details to enable the publisher to send events to the correct Pub/Sub topic. Update the file with the following: -```python +```py # TODO(developer) project_id = "Add GCP Project ID" topic_id = "telemetry_data_tera" @@ -169,7 +168,7 @@ topic_id = "telemetry_data_tera" The `publisher.py` script is designed to generate dummy events, simulating real-world data, and then sends these events to the specified Pub/Sub topic. This process is crucial for testing the end-to-end functionality of our event streaming pipeline, ensuring that data flows from the source (the publisher) to our intended destinations (BigQuery, via the Cloud Function and dlt). To run the publisher execute the following command: -```python +```sh python publisher.py ``` @@ -179,7 +178,7 @@ Once the publisher sends events to the Pub/Sub Topic, the pipeline is activated. The average completion time of the pipeline is approximately 12 minutes, accounting for the 10-minute time interval after which the subscriber pushes data to storage plus the Cloud Function execution time. The push interval of the subscriber can be adjusted by changing the **max_duration** in `pubsub.tf` -```bash +```sh cloud_storage_config { bucket = google_storage_bucket.tel_bucket_storage.name