From c2c21d0a512e7111ba9c1a2cd74ca0e186c31b3b Mon Sep 17 00:00:00 2001 From: Dennis Hume Date: Fri, 13 Dec 2024 08:56:31 -0600 Subject: [PATCH] [du] add backoff for duckdb connections (#26408) ## Summary & Motivation Update code snippets for sections 4 and 6 to wrap DuckDB connection in backoff to prevent race conditions when materializing all assets (Not an issue after these assets switch to using resources). ## How I Tested These Changes ## Changelog > Insert changelog entry or delete this section. --- .../coding-practice-taxi-zones-asset.md | 9 ++++++++- .../coding-practice-trips-by-week-asset.md | 10 +++++++++- .../lesson-4/loading-data-into-a-database.md | 12 +++++++++-- .../setting-up-a-database-resource.md | 9 ++++++++- .../lesson-6/using-resources-in-assets.md | 20 +++++++++++++++++-- 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-taxi-zones-asset.md b/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-taxi-zones-asset.md index 5cb1bc9b33abc..69f9139b16c2b 100644 --- a/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-taxi-zones-asset.md +++ b/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-taxi-zones-asset.md @@ -41,6 +41,13 @@ def taxi_zones() -> None: ); """ - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) conn.execute(query) ``` diff --git a/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-trips-by-week-asset.md b/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-trips-by-week-asset.md index 8067d27dd281a..b5c5e90b1bfad 100644 --- a/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-trips-by-week-asset.md +++ b/docs/dagster-university/pages/dagster-essentials/lesson-4/coding-practice-trips-by-week-asset.md @@ -62,12 +62,20 @@ from datetime import datetime, timedelta from . import constants import pandas as pd +from dagster._utils.backoff import backoff @asset( deps=["taxi_trips"] ) def trips_by_week() -> None: - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) current_date = datetime.strptime("2023-03-01", constants.DATE_FORMAT) end_date = datetime.strptime("2023-04-01", constants.DATE_FORMAT) diff --git a/docs/dagster-university/pages/dagster-essentials/lesson-4/loading-data-into-a-database.md b/docs/dagster-university/pages/dagster-essentials/lesson-4/loading-data-into-a-database.md index 560c46b30273c..f5bc537d9837b 100644 --- a/docs/dagster-university/pages/dagster-essentials/lesson-4/loading-data-into-a-database.md +++ b/docs/dagster-university/pages/dagster-essentials/lesson-4/loading-data-into-a-database.md @@ -13,6 +13,7 @@ Now that you have a query that produces an asset, let’s use Dagster to manage ```python import duckdb import os + from dagster._utils.backoff import backoff ``` 2. Copy and paste the code below into the bottom of the `trips.py` file. Note how this code looks similar to the asset definition code for the `taxi_trips_file` and the `taxi_zones` assets: @@ -42,7 +43,14 @@ Now that you have a query that produces an asset, let’s use Dagster to manage ); """ - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) conn.execute(query) ``` @@ -54,7 +62,7 @@ Now that you have a query that produces an asset, let’s use Dagster to manage 3. Next, a variable named `query` is created. This variable contains a SQL query that creates a table named `trips`, which sources its data from the `data/raw/taxi_trips_2023-03.parquet` file. This is the file created by the `taxi_trips_file` asset. - 4. A variable named `conn` is created, which defines the connection to the DuckDB database in the project. To do this, it uses the `.connect` method from the `duckdb` library, passing in the `DUCKDB_DATABASE` environment variable to tell DuckDB where the database is located. + 4. A variable named `conn` is created, which defines the connection to the DuckDB database in the project. To do this, we first wrap everything with the Dagster utility function `backoff`. Using the backoff function ensures that multiple assets can use DuckDB safely without locking resources. The backoff function takes in the function we want to call (in this case the `.connect` method from the `duckdb` library), any errors to retry on (`RuntimeError` and `duckdb.IOException`), the max number of retries, and finally, the arguments to supply to the `.connect` DuckDB method. Here we are passing in the `DUCKDB_DATABASE` environment variable to tell DuckDB where the database is located. The `DUCKDB_DATABASE` environment variable, sourced from your project’s `.env` file, resolves to `data/staging/data.duckdb`. **Note**: We set up this file in Lesson 2 - refer to this lesson if you need a refresher. If this file isn’t set up correctly, the materialization will result in an error. diff --git a/docs/dagster-university/pages/dagster-essentials/lesson-6/setting-up-a-database-resource.md b/docs/dagster-university/pages/dagster-essentials/lesson-6/setting-up-a-database-resource.md index 311748941755c..9dadbf149bfa1 100644 --- a/docs/dagster-university/pages/dagster-essentials/lesson-6/setting-up-a-database-resource.md +++ b/docs/dagster-university/pages/dagster-essentials/lesson-6/setting-up-a-database-resource.md @@ -14,7 +14,14 @@ Throughout this module, you’ve used DuckDB to store and transform your data. E ) def taxi_trips() -> None: ... - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) ... ``` diff --git a/docs/dagster-university/pages/dagster-essentials/lesson-6/using-resources-in-assets.md b/docs/dagster-university/pages/dagster-essentials/lesson-6/using-resources-in-assets.md index 3af063b4515b0..6f059f62f2bdc 100644 --- a/docs/dagster-university/pages/dagster-essentials/lesson-6/using-resources-in-assets.md +++ b/docs/dagster-university/pages/dagster-essentials/lesson-6/using-resources-in-assets.md @@ -48,7 +48,14 @@ def taxi_trips() -> None: ); """ - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) conn.execute(query) ``` @@ -100,7 +107,14 @@ To refactor `taxi_trips` to use the `database` resource, we had to: 3. Replace the lines that connect to DuckDB and execute a query: ```python - conn = duckdb.connect(os.getenv("DUCKDB_DATABASE")) + conn = backoff( + fn=duckdb.connect, + retry_on=(RuntimeError, duckdb.IOException), + kwargs={ + "database": os.getenv("DUCKDB_DATABASE"), + }, + max_retries=10, + ) conn.execute(query) ``` @@ -111,6 +125,8 @@ To refactor `taxi_trips` to use the `database` resource, we had to: conn.execute(query) ``` + Notice that we no longer need to use the `backoff` function. The Dagster `DuckDBResource` handles this functionality for us. + --- ## Before you continue