From 641d7ba6f053ef2d451a421839c23819000d9ef6 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 8 Mar 2024 20:28:44 +0100 Subject: [PATCH 01/27] simplifies and fixes incremental / fixes #971 (#1062) * rewrites incremental: computation of hashes vastly reduced, fixed wrong criteria when to deduplicate, unique index in arrow frames rarely created * initial tests for ordered, random and overlapping incremental ranges * clarifies what deduplication in incremental means * handles no deduplication case explicitly, more tests --- dlt/extract/incremental/__init__.py | 23 +- dlt/extract/incremental/transform.py | 221 +++++++++--------- .../code/zendesk-snippets.py | 2 +- .../docs/general-usage/incremental-loading.md | 19 +- tests/common/test_validation.py | 15 ++ tests/extract/test_incremental.py | 150 +++++++++++- 6 files changed, 309 insertions(+), 121 deletions(-) diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 54e8b3d447..9ad174fd63 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -163,11 +163,12 @@ def _make_transforms(self) -> None: self._transformers[dt] = kls( self.resource_name, self.cursor_path, + self.initial_value, self.start_value, self.end_value, - self._cached_state, self.last_value_func, self._primary_key, + set(self._cached_state["unique_hashes"]), ) @classmethod @@ -453,14 +454,28 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return rows transformer = self._get_transformer(rows) - if isinstance(rows, list): - return [ + rows = [ item for item in (self._transform_item(transformer, row) for row in rows) if item is not None ] - return self._transform_item(transformer, rows) + else: + rows = self._transform_item(transformer, rows) + + # write back state + self._cached_state["last_value"] = transformer.last_value + if not transformer.deduplication_disabled: + # compute hashes for new last rows + unique_hashes = set( + transformer.compute_unique_value(row, self.primary_key) + for row in transformer.last_rows + ) + # add directly computed hashes + unique_hashes.update(transformer.unique_hashes) + self._cached_state["unique_hashes"] = list(unique_hashes) + + return rows Incremental.EMPTY = Incremental[Any]("") diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index e20617cf63..2ad827b755 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -1,17 +1,17 @@ from datetime import datetime, date # noqa: I251 -from typing import Any, Optional, Tuple, List +from typing import Any, Optional, Set, Tuple, List from dlt.common.exceptions import MissingDependencyException from dlt.common.utils import digest128 from dlt.common.json import json from dlt.common import pendulum -from dlt.common.typing import TDataItem, TDataItems -from dlt.common.jsonpath import TJsonPath, find_values, JSONPathFields, compile_path +from dlt.common.typing import TDataItem +from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) -from dlt.extract.incremental.typing import IncrementalColumnState, TCursorValue, LastValueFunc +from dlt.extract.incremental.typing import TCursorValue, LastValueFunc from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate from dlt.common.schema.typing import TColumnNames @@ -34,19 +34,24 @@ def __init__( self, 
resource_name: str, cursor_path: str, + initial_value: Optional[TCursorValue], start_value: Optional[TCursorValue], end_value: Optional[TCursorValue], - incremental_state: IncrementalColumnState, last_value_func: LastValueFunc[TCursorValue], primary_key: Optional[TTableHintTemplate[TColumnNames]], + unique_hashes: Set[str], ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path + self.initial_value = initial_value self.start_value = start_value + self.last_value = start_value self.end_value = end_value - self.incremental_state = incremental_state + self.last_rows: List[TDataItem] = [] self.last_value_func = last_value_func self.primary_key = primary_key + self.unique_hashes = unique_hashes + self.start_unique_hashes = set(unique_hashes) # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) @@ -59,20 +64,17 @@ def __init__( self.cursor_path = self._compiled_cursor_path.fields[0] self._compiled_cursor_path = None - def __call__( - self, - row: TDataItem, - ) -> Tuple[bool, bool, bool]: ... - - -class JsonIncremental(IncrementalTransform): - def unique_value( + def compute_unique_value( self, row: TDataItem, primary_key: Optional[TTableHintTemplate[TColumnNames]], - resource_name: str, ) -> str: try: + assert not self.deduplication_disabled, ( + f"{self.resource_name}: Attempt to compute unique values when deduplication is" + " disabled" + ) + if primary_key: return digest128(json.dumps(resolve_column_value(primary_key, row), sort_keys=True)) elif primary_key is None: @@ -80,8 +82,20 @@ def unique_value( else: return None except KeyError as k_err: - raise IncrementalPrimaryKeyMissing(resource_name, k_err.args[0], row) + raise IncrementalPrimaryKeyMissing(self.resource_name, k_err.args[0], row) + def __call__( + self, + row: TDataItem, + ) -> Tuple[bool, bool, bool]: ... + + @property + def deduplication_disabled(self) -> bool: + """Skip deduplication when length of the key is 0""" + return isinstance(self.primary_key, (list, tuple)) and len(self.primary_key) == 0 + + +class JsonIncremental(IncrementalTransform): def find_cursor_value(self, row: TDataItem) -> Any: """Finds value in row at cursor defined by self.cursor_path. @@ -113,7 +127,8 @@ def __call__( return row, False, False row_value = self.find_cursor_value(row) - last_value = self.incremental_state["last_value"] + last_value = self.last_value + last_value_func = self.last_value_func # For datetime cursor, ensure the value is a timezone aware datetime. 
# The object saved in state will always be a tz aware pendulum datetime so this ensures values are comparable @@ -128,41 +143,45 @@ def __call__( # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None and ( - self.last_value_func((row_value, self.end_value)) != self.end_value - or self.last_value_func((row_value,)) == self.end_value + last_value_func((row_value, self.end_value)) != self.end_value + or last_value_func((row_value,)) == self.end_value ): return None, False, True check_values = (row_value,) + ((last_value,) if last_value is not None else ()) - new_value = self.last_value_func(check_values) + new_value = last_value_func(check_values) + # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: - processed_row_value = self.last_value_func((row_value,)) - # we store row id for all records with the current "last_value" in state and use it to deduplicate - - if processed_row_value == last_value: - unique_value = self.unique_value(row, self.primary_key, self.resource_name) - # if unique value exists then use it to deduplicate - if unique_value: - if unique_value in self.incremental_state["unique_hashes"]: - return None, False, False - # add new hash only if the record row id is same as current last value - self.incremental_state["unique_hashes"].append(unique_value) - return row, False, False - # skip the record that is not a last_value or new_value: that record was already processed + # use func to compute row_value into last_value compatible + processed_row_value = last_value_func((row_value,)) + # skip the record that is not a start_value or new_value: that record was already processed check_values = (row_value,) + ( (self.start_value,) if self.start_value is not None else () ) - new_value = self.last_value_func(check_values) + new_value = last_value_func(check_values) # Include rows == start_value but exclude "lower" - if new_value == self.start_value and processed_row_value != self.start_value: - return None, True, False - else: - return row, False, False + # new_value is "less" or equal to start_value (the initial max) + if new_value == self.start_value: + # if equal there's still a chance that item gets in + if processed_row_value == self.start_value: + if not self.deduplication_disabled: + unique_value = self.compute_unique_value(row, self.primary_key) + # if unique value exists then use it to deduplicate + if unique_value in self.start_unique_hashes: + return None, True, False + else: + # smaller than start value: gets out + return None, True, False + + # we store row id for all records with the current "last_value" in state and use it to deduplicate + if processed_row_value == last_value: + # add new hash only if the record row id is same as current last value + self.last_rows.append(row) else: - self.incremental_state["last_value"] = new_value - unique_value = self.unique_value(row, self.primary_key, self.resource_name) - if unique_value: - self.incremental_state["unique_hashes"] = [unique_value] + self.last_value = new_value + # store rows with "max" values to compute hashes after processing full batch + self.last_rows = [row] + self.unique_hashes = set() return row, False, False @@ -170,21 +189,25 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" - def unique_values( - self, item: "TAnyArrowItem", unique_columns: List[str], resource_name: str + def compute_unique_values(self, item: 
"TAnyArrowItem", unique_columns: List[str]) -> List[str]: + if not unique_columns: + return [] + rows = item.select(unique_columns).to_pylist() + return [self.compute_unique_value(row, self.primary_key) for row in rows] + + def compute_unique_values_with_index( + self, item: "TAnyArrowItem", unique_columns: List[str] ) -> List[Tuple[int, str]]: if not unique_columns: return [] - item = item indices = item[self._dlt_index].to_pylist() rows = item.select(unique_columns).to_pylist() return [ - (index, digest128(json.dumps(row, sort_keys=True))) for index, row in zip(indices, rows) + (index, self.compute_unique_value(row, self.primary_key)) + for index, row in zip(indices, rows) ] - def _deduplicate( - self, tbl: "pa.Table", unique_columns: Optional[List[str]], aggregate: str, cursor_path: str - ) -> "pa.Table": + def _add_unique_index(self, tbl: "pa.Table") -> "pa.Table": """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: @@ -215,24 +238,18 @@ def __call__( self._dlt_index = primary_key elif primary_key is None: unique_columns = tbl.schema.names - else: # deduplicating is disabled - unique_columns = None start_out_of_range = end_out_of_range = False if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - last_value = self.incremental_state["last_value"] - if self.last_value_func is max: compute = pa.compute.max - aggregate = "max" end_compare = pa.compute.less last_value_compare = pa.compute.greater_equal new_value_compare = pa.compute.greater elif self.last_value_func is min: compute = pa.compute.min - aggregate = "min" end_compare = pa.compute.greater last_value_compare = pa.compute.less_equal new_value_compare = pa.compute.less @@ -267,64 +284,56 @@ def __call__( # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() - if last_value is not None: - if self.start_value is not None: - # Remove rows lower than the last start value - keep_filter = last_value_compare( - tbl[cursor_path], to_arrow_scalar(self.start_value, cursor_data_type) + if self.start_value is not None: + start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + # Remove rows lower or equal than the last start value + keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) + tbl = tbl.filter(keep_filter) + if not self.deduplication_disabled: + # Deduplicate after filtering old values + tbl = self._add_unique_index(tbl) + # Remove already processed rows where the cursor is equal to the start value + eq_rows = tbl.filter(pa.compute.equal(tbl[cursor_path], start_value_scalar)) + # compute index, unique hash mapping + unique_values_index = self.compute_unique_values_with_index(eq_rows, unique_columns) + unique_values_index = [ + (i, uq_val) + for i, uq_val in unique_values_index + if uq_val in self.start_unique_hashes + ] + # find rows with unique ids that were stored from previous run + remove_idx = pa.array(i for i, _ in unique_values_index) + # Filter the table + tbl = tbl.filter( + pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx)) ) - start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) - tbl = tbl.filter(keep_filter) - - # Deduplicate after filtering old values - last_value_scalar = to_arrow_scalar(last_value, cursor_data_type) - tbl = self._deduplicate(tbl, unique_columns, aggregate, cursor_path) - # Remove already processed rows where the cursor is equal to the last value - eq_rows = tbl.filter(pa.compute.equal(tbl[cursor_path], last_value_scalar)) - # compute index, unique hash mapping - unique_values = self.unique_values(eq_rows, unique_columns, self.resource_name) - unique_values = [ - (i, uq_val) - for i, uq_val in unique_values - if uq_val in self.incremental_state["unique_hashes"] - ] - remove_idx = pa.array(i for i, _ in unique_values) - # Filter the table - tbl = tbl.filter(pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx))) - - if ( - new_value_compare(row_value_scalar, last_value_scalar).as_py() - and row_value != last_value - ): # Last value has changed - self.incremental_state["last_value"] = row_value + + if ( + self.last_value is None + or new_value_compare( + row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) + ).as_py() + ): # Last value has changed + self.last_value = row_value + if not self.deduplication_disabled: # Compute unique hashes for all rows equal to row value - self.incremental_state["unique_hashes"] = [ - uq_val - for _, uq_val in self.unique_values( + self.unique_hashes = set( + self.compute_unique_values( tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), unique_columns, - self.resource_name, - ) - ] - else: - # last value is unchanged, add the hashes - self.incremental_state["unique_hashes"] = list( - set( - self.incremental_state["unique_hashes"] - + [uq_val for _, uq_val in unique_values] ) ) - else: - tbl = self._deduplicate(tbl, unique_columns, aggregate, cursor_path) - self.incremental_state["last_value"] = row_value - self.incremental_state["unique_hashes"] = [ - uq_val - for _, uq_val in self.unique_values( - tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), - 
unique_columns, - self.resource_name, + elif self.last_value == row_value and not self.deduplication_disabled: + # last value is unchanged, add the hashes + self.unique_hashes.update( + set( + self.compute_unique_values( + tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), + unique_columns, + ) ) - ] + ) if len(tbl) == 0: return None, start_out_of_range, end_out_of_range diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index ff12a00fca..05ea18cb9e 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -140,4 +140,4 @@ def get_pages( # check that stuff was loaded row_counts = pipeline.last_trace.last_normalize_info.row_counts - assert row_counts["ticket_events"] == 17 \ No newline at end of file + assert row_counts["ticket_events"] == 17 diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index dd52c9c750..144b176332 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -454,11 +454,20 @@ def tickets( ``` ::: -### Deduplication primary_key - -`dlt.sources.incremental` will inherit the primary key that is set on the resource. - - let's you optionally set a `primary_key` that is used exclusively to +### Deduplicate overlapping ranges with primary key + +`Incremental` **does not** deduplicate datasets like **merge** write disposition does. It however +makes sure than when another portion of data is extracted, records that were previously loaded won't be +included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive (ie. greater than equal). +This makes sure that you never lose any data but will also re-acquire some rows. +For example: you have a database table with an cursor field on `updated_at` which has a day resolution, then there's a high +chance that after you extract data on a given day, still more records will be added. When you extract on the next day, you +should reacquire data from the last day to make sure all records are present, this will however create overlap with data +from previous extract. + +By default, content hash (a hash of `json` representation of a row) will be used to deduplicate. +This may be slow so`dlt.sources.incremental` will inherit the primary key that is set on the resource. +You can optionally set a `primary_key` that is used exclusively to deduplicate and which does not become a table hint. The same setting lets you disable the deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to `incremental` to disable deduplication. 
That overrides `delta` primary_key set in the resource: diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 3fff3bf2ea..f7773fb89c 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -273,3 +273,18 @@ def f(item: Union[TDataItem, TDynHintType]) -> TDynHintType: validate_dict( TTestRecordCallable, test_item, path=".", validator_f=lambda p, pk, pv, t: callable(pv) ) + + +# def test_union_merge() -> None: +# """Overriding fields is simply illegal in TypedDict""" +# class EndpointResource(TypedDict, total=False): +# name: TTableHintTemplate[str] + +# class TTestRecordNoName(EndpointResource, total=False): +# name: Optional[TTableHintTemplate[str]] + +# # test_item = {"name": None} +# # validate_dict(TTestRecordNoName, test_item, path=".") + +# test_item = {} +# validate_dict(TTestRecordNoName, test_item, path=".") diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 7956c83947..a393706de7 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -1,5 +1,6 @@ import os import asyncio +import random from time import sleep from typing import Optional, Any from unittest import mock @@ -14,13 +15,14 @@ from dlt.common.configuration.specs.base_configuration import configspec, BaseConfiguration from dlt.common.configuration import ConfigurationValueError from dlt.common.pendulum import pendulum, timedelta -from dlt.common.pipeline import StateInjectableContext, resource_state +from dlt.common.pipeline import NormalizeInfo, StateInjectableContext, resource_state from dlt.common.schema.schema import Schema from dlt.common.utils import uniq_id, digest128, chunks from dlt.common.json import json from dlt.extract import DltSource from dlt.extract.exceptions import InvalidStepFunctionArguments +from dlt.extract.resource import DltResource from dlt.sources.helpers.transform import take_first from dlt.extract.incremental.exceptions import ( IncrementalCursorPathMissing, @@ -125,11 +127,11 @@ def test_unique_keys_are_deduplicated(item_type: TDataItemFormat) -> None: {"created_at": 3, "id": "e"}, ] data2 = [ + {"created_at": 4, "id": "g"}, {"created_at": 3, "id": "c"}, {"created_at": 3, "id": "d"}, {"created_at": 3, "id": "e"}, {"created_at": 3, "id": "f"}, - {"created_at": 4, "id": "g"}, ] source_items1 = data_to_item_format(item_type, data1) @@ -1307,7 +1309,6 @@ def descending_single_item( for i in reversed(range(14)): data = [{"updated_at": i}] yield from data_to_item_format(item_type, data) - yield {"updated_at": i} if i >= 10: assert updated_at.start_out_of_range is False else: @@ -1375,7 +1376,8 @@ def descending( assert data_item_length(data) == 48 - 10 + 1 # both bounds included -def test_transformer_row_order_out_of_range() -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_transformer_row_order_out_of_range(item_type: TDataItemFormat) -> None: out_of_range = [] @dlt.transformer @@ -1387,13 +1389,14 @@ def descending( ) -> Any: for chunk in chunks(count(start=48, step=-1), 10): data = [{"updated_at": i, "package": package} for i in chunk] + # print(data) yield data_to_item_format("json", data) if updated_at.can_close(): out_of_range.append(package) return data = list([3, 2, 1] | descending) - assert len(data) == 48 - 10 + 1 + assert data_item_length(data) == 48 - 10 + 1 # we take full package 3 and then nothing in 1 and 2 assert len(out_of_range) == 3 @@ -1453,6 +1456,143 @@ def ascending_desc( assert data_item_length(data) == 45 - 22 
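# Illustrative sketch added by the editor (not part of the test suite): the tests
# above and below exercise deduplication of overlapping incremental ranges. In user
# code the same behavior is controlled by the `primary_key` passed to
# `dlt.sources.incremental`; an empty tuple disables deduplication entirely, as the
# docs change in this patch describes. Field names and values are made up for
# illustration and the snippet relies on the `dlt` import at the top of this module.
@dlt.resource(primary_key="id")
def tickets_sketch(
    updated_at=dlt.sources.incremental(
        "updated_at",
        initial_value=10,
        primary_key=(),  # empty tuple: overlapping rows are not deduplicated
    ),
):
    # the first item overlaps with the previous range (equal to the start value)
    yield [{"id": 1, "updated_at": 10}, {"id": 2, "updated_at": 11}]
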
+@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("order", ["random", "desc", "asc"]) +@pytest.mark.parametrize("primary_key", [[], None, "updated_at"]) +@pytest.mark.parametrize( + "deterministic", (True, False), ids=("deterministic-record", "non-deterministic-record") +) +def test_unique_values_unordered_rows( + item_type: TDataItemFormat, order: str, primary_key: Any, deterministic: bool +) -> None: + @dlt.resource(primary_key=primary_key) + def random_ascending_chunks( + order: str, + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=10, + ), + ) -> Any: + range_ = list(range(updated_at.start_value, updated_at.start_value + 121)) + if order == "random": + random.shuffle(range_) + if order == "desc": + range_ = reversed(range_) # type: ignore[assignment] + + for chunk in chunks(range_, 30): + # make sure that overlapping element is the last one + data = [ + {"updated_at": i, "rand": random.random() if not deterministic else 0} + for i in chunk + ] + # random.shuffle(data) + yield data_to_item_format(item_type, data) + + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately + pipeline = dlt.pipeline("test_unique_values_unordered_rows", destination="dummy") + pipeline.run(random_ascending_chunks(order)) + assert pipeline.last_trace.last_normalize_info.row_counts["random_ascending_chunks"] == 121 + + # 120 rows (one overlap - incremental reacquires and deduplicates) + pipeline.run(random_ascending_chunks(order)) + # overlapping element must be deduped when: + # 1. we have primary key on just updated at + # OR we have a key on full record but the record is deterministic so duplicate may be found + rows = 120 if primary_key == "updated_at" or (deterministic and primary_key != []) else 121 + assert pipeline.last_trace.last_normalize_info.row_counts["random_ascending_chunks"] == rows + + +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("primary_key", [[], None, "updated_at"]) # [], None, +@pytest.mark.parametrize( + "deterministic", (True, False), ids=("deterministic-record", "non-deterministic-record") +) +def test_carry_unique_hashes( + item_type: TDataItemFormat, primary_key: Any, deterministic: bool +) -> None: + # each day extends list of hashes and removes duplicates until the last day + + @dlt.resource(primary_key=primary_key) + def random_ascending_chunks( + # order: str, + day: int, + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=10, + ), + ) -> Any: + range_ = random.sample( + range(updated_at.initial_value, updated_at.initial_value + 10), k=10 + ) # list(range(updated_at.initial_value, updated_at.initial_value + 10)) + range_ += [100] + if day == 4: + # on day 4 add an element that will reset all others + range_ += [1000] + + for chunk in chunks(range_, 3): + # make sure that overlapping element is the last one + data = [ + {"updated_at": i, "rand": random.random() if not deterministic else 0} + for i in chunk + ] + yield data_to_item_format(item_type, data) + + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately + pipeline = dlt.pipeline("test_unique_values_unordered_rows", destination="dummy") + + def _assert_state(r_: DltResource, day: int, info: NormalizeInfo) -> None: + uniq_hashes = r_.state["incremental"]["updated_at"]["unique_hashes"] + row_count = info.row_counts.get("random_ascending_chunks", 0) + if primary_key == "updated_at": + # we keep only newest version of 
the record + assert len(uniq_hashes) == 1 + if day == 1: + # all records loaded + assert row_count == 11 + elif day == 4: + # new biggest item loaded + assert row_count == 1 + else: + # all deduplicated + assert row_count == 0 + elif primary_key is None: + # we deduplicate over full content + if day == 4: + assert len(uniq_hashes) == 1 + # both the 100 or 1000 are in if non deterministic content + assert row_count == (2 if not deterministic else 1) + else: + # each day adds new hash if content non deterministic + assert len(uniq_hashes) == (day if not deterministic else 1) + if day == 1: + assert row_count == 11 + else: + assert row_count == (1 if not deterministic else 0) + elif primary_key == []: + # no deduplication + assert len(uniq_hashes) == 0 + if day == 4: + assert row_count == 2 + else: + if day == 1: + assert row_count == 11 + else: + assert row_count == 1 + + r_ = random_ascending_chunks(1) + pipeline.run(r_) + _assert_state(r_, 1, pipeline.last_trace.last_normalize_info) + r_ = random_ascending_chunks(2) + pipeline.run(r_) + _assert_state(r_, 2, pipeline.last_trace.last_normalize_info) + r_ = random_ascending_chunks(3) + pipeline.run(r_) + _assert_state(r_, 3, pipeline.last_trace.last_normalize_info) + r_ = random_ascending_chunks(4) + pipeline.run(r_) + _assert_state(r_, 4, pipeline.last_trace.last_normalize_info) + + @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_get_incremental_value_type(item_type: TDataItemFormat) -> None: assert dlt.sources.incremental("id").get_incremental_value_type() is Any From ab213fa3edbfe4a41310d5d5bf55b0ade0b1963d Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 11 Mar 2024 14:26:05 +0300 Subject: [PATCH 02/27] Fix grammar in /docs/dlt-ecosystem/destinations/ (#1070) --- .../docs/dlt-ecosystem/destinations/athena.md | 70 ++++++------- .../dlt-ecosystem/destinations/bigquery.md | 56 +++++------ .../dlt-ecosystem/destinations/databricks.md | 28 +++--- .../docs/dlt-ecosystem/destinations/duckdb.md | 48 ++++----- .../docs/dlt-ecosystem/destinations/index.md | 8 +- .../dlt-ecosystem/destinations/motherduck.md | 42 ++++---- .../docs/dlt-ecosystem/destinations/mssql.md | 36 +++---- .../dlt-ecosystem/destinations/postgres.md | 58 +++++------ .../docs/dlt-ecosystem/destinations/qdrant.md | 10 +- .../dlt-ecosystem/destinations/redshift.md | 23 +++-- .../dlt-ecosystem/destinations/snowflake.md | 99 +++++++++---------- .../dlt-ecosystem/destinations/synapse.md | 27 +++-- .../dlt-ecosystem/destinations/weaviate.md | 30 +++--- 13 files changed, 262 insertions(+), 273 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 9fc5dc15f9..b376337e77 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -6,7 +6,7 @@ keywords: [aws, athena, glue catalog] # AWS Athena / Glue Catalog -The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. dlt metadata will be stored in the same bucket as the parquet files, but as iceberg tables. 
Athena additionally supports writing individual data tables as iceberg tables, so the may be manipulated later, a common use-case would be to strip gdpr data from them. +The Athena destination stores data as Parquet files in S3 buckets and creates [external tables in AWS Athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with Athena SQL commands, which will scan the entire folder of Parquet files and return the results. This destination works very similarly to other SQL-based destinations, with the exception that the merge write disposition is not supported at this time. The `dlt` metadata will be stored in the same bucket as the Parquet files, but as iceberg tables. Athena also supports writing individual data tables as Iceberg tables, so they may be manipulated later. A common use case would be to strip GDPR data from them. ## Install dlt with Athena **To install the DLT library with Athena dependencies:** @@ -17,35 +17,34 @@ pip install dlt[athena] ## Setup Guide ### 1. Initialize the dlt project -Let's start by initializing a new dlt project as follows: +Let's start by initializing a new `dlt` project as follows: ```bash dlt init chess athena ``` - > šŸ’” This command will initialise your pipeline with chess as the source and aws athena as the destination using the filesystem staging destination + > šŸ’” This command will initialize your pipeline with chess as the source and AWS Athena as the destination using the filesystem staging destination. -### 2. Setup bucket storage and athena credentials +### 2. Setup bucket storage and Athena credentials -First install dependencies by running: +First, install dependencies by running: ``` pip install -r requirements.txt ``` -or with `pip install dlt[athena]` which will install `s3fs`, `pyarrow`, `pyathena` and `botocore` packages. +or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. :::caution -You may also install the dependencies independently -try +You may also install the dependencies independently. Try ```sh pip install dlt pip install s3fs pip install pyarrow pip install pyathena ``` -so pip does not fail on backtracking +so pip does not fail on backtracking. ::: -To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url` which holds the uploaded parquet files, a `query_result_bucket` which athena uses to write query results too, and credentials that have write and read access to these two buckets as well as the full athena access aws role. +To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url`, which holds the uploaded parquet files, a `query_result_bucket`, which Athena uses to write query results to, and credentials that have write and read access to these two buckets as well as the full Athena access AWS role. The toml file looks like this: @@ -63,10 +62,10 @@ query_result_bucket="s3://[results_bucket_name]" # replace with your query resul [destination.athena.credentials] aws_access_key_id="please set me up!" # same as credentials for filesystem aws_secret_access_key="please set me up!" # same as credentials for filesystem -region_name="please set me up!" # set your aws region, for example "eu-central-1" for frankfurt +region_name="please set me up!" 
# set your AWS region, for example "eu-central-1" for Frankfurt ``` -if you have your credentials stored in `~/.aws/credentials` just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): +If you have your credentials stored in `~/.aws/credentials`, just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): ```toml [destination.filesystem.credentials] profile_name="dlt-ci-user" @@ -77,7 +76,7 @@ profile_name="dlt-ci-user" ## Additional Destination Configuration -You can provide an athena workgroup like so: +You can provide an Athena workgroup like so: ```toml [destination.athena] athena_work_group="my_workgroup" @@ -85,45 +84,43 @@ athena_work_group="my_workgroup" ## Write disposition -`athena` destination handles the write dispositions as follows: -- `append` - files belonging to such tables are added to dataset folder -- `replace` - all files that belong to such tables are deleted from dataset folder and then current set of files is added. -- `merge` - falls back to `append` +The `athena` destination handles the write dispositions as follows: +- `append` - files belonging to such tables are added to the dataset folder. +- `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. +- `merge` - falls back to `append`. ## Data loading -Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via SQL queries on athena, the returned data is read by -scanning your bucket and reading all relevant parquet files in there. +Data loading happens by storing parquet files in an S3 bucket and defining a schema on Athena. If you query data via SQL queries on Athena, the returned data is read by scanning your bucket and reading all relevant parquet files in there. `dlt` internal tables are saved as Iceberg tables. ### Data types -Athena tables store timestamps with millisecond precision and with that precision we generate parquet files. Mind that Iceberg tables have microsecond precision. +Athena tables store timestamps with millisecond precision, and with that precision, we generate parquet files. Keep in mind that Iceberg tables have microsecond precision. -Athena does not support JSON fields so JSON is stored as string. +Athena does not support JSON fields, so JSON is stored as a string. > ā—**Athena does not support TIME columns in parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them. ### Naming Convention -We follow our snake_case name convention. Mind the following: -* DDL use HIVE escaping with `````` +We follow our snake_case name convention. Keep the following in mind: +* DDL uses HIVE escaping with `````` * Other queries use PRESTO and regular SQL escaping. ## Staging support -Using a staging destination is mandatory when using the athena destination. If you do not set staging to `filesystem`, dlt will automatically do this for you. +Using a staging destination is mandatory when using the Athena destination. 
If you do not set staging to `filesystem`, `dlt` will automatically do this for you. If you decide to change the [filename layout](./filesystem#data-loading) from the default value, keep the following in mind so that Athena can reliably build your tables: - - You need to provide the `{table_name}` placeholder and this placeholder needs to be followed by a forward slash - - You need to provide the `{file_id}` placeholder and it needs to be somewhere after the `{table_name}` placeholder. - - {table_name} must be the first placeholder in the layout. + - You need to provide the `{table_name}` placeholder, and this placeholder needs to be followed by a forward slash. + - You need to provide the `{file_id}` placeholder, and it needs to be somewhere after the `{table_name}` placeholder. + - `{table_name}` must be the first placeholder in the layout. ## Additional destination options -### iceberg data tables -You can save your tables as iceberg tables to athena. This will enable you to for example delete data from them later if you need to. To switch a resouce to the iceberg table-format, -supply the table_format argument like this: +### Iceberg data tables +You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the iceberg table format, supply the table_format argument like this: ```python @dlt.resource(table_format="iceberg") @@ -131,29 +128,26 @@ def data() -> Iterable[TDataItem]: ... ``` -Alternatively you can set all tables to use the iceberg format with a config variable: +Alternatively, you can set all tables to use the iceberg format with a config variable: ```toml [destination.athena] force_iceberg = "True" ``` -For every table created as an iceberg table, the athena destination will create a regular athena table in the staging dataset of both the filesystem as well as the athena glue catalog and then -copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both filesystem and the glue catalog. Switching from iceberg to regular table or vice versa -is not supported. +For every table created as an iceberg table, the Athena destination will create a regular Athena table in the staging dataset of both the filesystem and the Athena glue catalog, and then copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both the filesystem and the glue catalog. Switching from iceberg to regular table or vice versa is not supported. ### dbt support -Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. Iceberg tables are supported but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. We encountered problems with materializing -date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). -The Athena adapter requires that you setup **region_name** in Athena configuration below. You can also setup table catalog name to change the default: **awsdatacatalog** +Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of the generated dbt profile. Iceberg tables are supported, but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. 
We encountered problems with materializing date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). +The Athena adapter requires that you set up **region_name** in the Athena configuration below. You can also set up the table catalog name to change the default: **awsdatacatalog** ```toml [destination.athena] aws_data_catalog="awsdatacatalog" ``` ### Syncing ofĀ `dlt`Ā state -- This destination fully supportsĀ [dlt state sync.](../../general-usage/state#syncing-state-with-destination). The state is saved in athena iceberg tables in your s3 bucket. +- This destination fully supportsĀ [dlt state sync.](../../general-usage/state#syncing-state-with-destination). The state is saved in Athena iceberg tables in your S3 bucket. ## Supported file formats diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 25b01923b5..e852bfa9e5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -28,7 +28,7 @@ dlt init chess bigquery pip install -r requirements.txt ``` -This will install dlt with **bigquery** extra, which contains all the dependencies required by the bigquery client. +This will install dlt with the `bigquery` extra, which contains all the dependencies required by the bigquery client. **3. Log in to or create a Google Cloud account** @@ -58,7 +58,7 @@ You don't need to grant users access to this service account now, so click the ` In the service accounts table page that you're redirected to after clicking `Done` as instructed above, select the three dots under the `Actions` column for the service account you created and select `Manage keys`. -This will take you to page where you can click the `Add key` button, then the `Create new key` button, +This will take you to a page where you can click the `Add key` button, then the `Create new key` button, and finally the `Create` button, keeping the preselected `JSON` option. A `JSON` file that includes your service account private key will then be downloaded. @@ -83,11 +83,11 @@ private_key = "private_key" # please set me up! client_email = "client_email" # please set me up! ``` -You can specify the location of the data i.e. `EU` instead of `US` which is a default. +You can specify the location of the data i.e. `EU` instead of `US` which is the default. ### OAuth 2.0 Authentication -You can use the OAuth 2.0 authentication. You'll need to generate a **refresh token** with right scopes (I suggest to ask our GPT-4 assistant for details). +You can use OAuth 2.0 authentication. You'll need to generate a **refresh token** with the right scopes (we suggest asking our GPT-4 assistant for details). Then you can fill the following information in `secrets.toml` ```toml @@ -103,9 +103,9 @@ refresh_token = "refresh_token" # please set me up! ### Using Default Credentials -Google provides several ways to get default credentials i.e. from `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. +Google provides several ways to get default credentials i.e. from the `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have associated service accounts or authenticated users. -Will try to use default credentials if nothing is explicitly specified in the secrets. 
+`dlt` will try to use default credentials if nothing is explicitly specified in the secrets. ```toml [destination.bigquery] @@ -114,16 +114,16 @@ location = "US" ## Write Disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. ## Data Loading -`dlt` uses `BigQuery` load jobs that send files from local filesystem or gcs buckets. -Loader follows [Google recommendations](https://cloud.google.com/bigquery/docs/error-messages) when retrying and terminating jobs. -Google BigQuery client implements elaborate retry mechanism and timeouts for queries and file uploads, which may be configured in destination options. +`dlt` uses `BigQuery` load jobs that send files from the local filesystem or GCS buckets. +The loader follows [Google recommendations](https://cloud.google.com/bigquery/docs/error-messages) when retrying and terminating jobs. +The Google BigQuery client implements an elaborate retry mechanism and timeouts for queries and file uploads, which may be configured in destination options. ## Supported File Formats @@ -143,36 +143,36 @@ When staging is enabled: BigQuery supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `partition` - creates a partition with a day granularity on decorated column (`PARTITION BY DATE`). - May be used with `datetime`, `date` and `bigint` data types. +* `partition` - creates a partition with a day granularity on the decorated column (`PARTITION BY DATE`). + May be used with `datetime`, `date`, and `bigint` data types. Only one column per table is supported and only when a new table is created. For more information on BigQuery partitioning, read the [official docs](https://cloud.google.com/bigquery/docs/partitioned-tables). > ā— `bigint` maps to BigQuery's **INT64** data type. > Automatic partitioning requires converting an INT64 column to a UNIX timestamp, which `GENERATE_ARRAY` doesn't natively support. > With a 10,000 partition limit, we canā€™t cover the full INT64 range. - > Instead, we set 86,400 second boundaries to enable daily partitioning. + > Instead, we set 86,400-second boundaries to enable daily partitioning. > This captures typical values, but extremely large/small outliers go to an `__UNPARTITIONED__` catch-all partition. * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. ## Staging Support -BigQuery supports gcs as a file staging destination. dlt will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the db. -Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. -If you use the same service account for gcs and your redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. +BigQuery supports GCS as a file staging destination. `dlt` will upload files in the parquet format to GCS and ask BigQuery to copy their data directly into the database. 
+Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your GCS bucket with the bucket_url and credentials. +If you use the same service account for GCS and your Redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. -Alternatively to parquet files, you can specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Alternatively to parquet files, you can specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. ### BigQuery/GCS Staging Example ```python # Create a dlt pipeline that will load # chess player data to the BigQuery destination -# via a gcs bucket. +# via a GCS bucket. pipeline = dlt.pipeline( pipeline_name='chess_pipeline', - destination='biquery', + destination='bigquery', staging='filesystem', # Add this to activate the staging location. dataset_name='player_data' ) @@ -180,7 +180,7 @@ pipeline = dlt.pipeline( ## Additional Destination Options -You can configure the data location and various timeouts as shown below. This information is not a secret so can be placed in `config.toml` as well: +You can configure the data location and various timeouts as shown below. This information is not a secret so it can be placed in `config.toml` as well: ```toml [destination.bigquery] @@ -191,15 +191,15 @@ retry_deadline=60.0 ``` * `location` sets the [BigQuery data location](https://cloud.google.com/bigquery/docs/locations) (default: **US**) -* `http_timeout` sets the timeout when connecting and getting a response from BigQuery API (default: **15 seconds**) -* `file_upload_timeout` a timeout for file upload when loading local files: the total time of the upload may not exceed this value (default: **30 minutes**, set in seconds) -* `retry_deadline` a deadline for a [DEFAULT_RETRY used by Google](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout) +* `http_timeout` sets the timeout when connecting and getting a response from the BigQuery API (default: **15 seconds**) +* `file_upload_timeout` is a timeout for file upload when loading local files: the total time of the upload may not exceed this value (default: **30 minutes**, set in seconds) +* `retry_deadline` is a deadline for a [DEFAULT_RETRY used by Google](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout) ### dbt Support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery). Credentials, if explicitly defined, are shared with `dbt` along with other settings like **location** and retries and timeouts. -In case of implicit credentials (i.e. available in cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to `dbt` adapter. +In the case of implicit credentials (i.e. available in a cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to the `dbt` adapter. 
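The options shown earlier in this section can also be supplied through environment variables, following `dlt`'s usual configuration lookup (section and key names uppercased and joined with `__`). A minimal sketch, assuming the option names from the `config.toml` snippet above; the values are examples only:

```python
import os

import dlt

# equivalent of the config.toml entries shown above
os.environ["DESTINATION__BIGQUERY__LOCATION"] = "EU"
os.environ["DESTINATION__BIGQUERY__HTTP_TIMEOUT"] = "30.0"
os.environ["DESTINATION__BIGQUERY__RETRY_DEADLINE"] = "120.0"

# credentials are still resolved from secrets.toml or default credentials
pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="bigquery",
    dataset_name="player_data",
)
```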
### Syncing of `dlt` State @@ -215,7 +215,7 @@ The adapter updates the DltResource with metadata about the destination column a ### Use an Adapter to Apply Hints to a Resource -Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both column level and table level: +Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level: ```python from datetime import date, timedelta @@ -246,9 +246,9 @@ bigquery_adapter( bigquery_adapter(event_data, table_description="Dummy event data.") ``` -Above, the adapter specifies that `event_date` should be used for partitioning and both `event_date` and `user_id` should be used for clustering (in the given order) when the table is created. +In the example above, the adapter specifies that `event_date` should be used for partitioning and both `event_date` and `user_id` should be used for clustering (in the given order) when the table is created. -Some things to note with the adapter's behaviour: +Some things to note with the adapter's behavior: - You can only partition on one column (refer to [supported hints](#supported-column-hints)). - You can cluster on as many columns as you would like. diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index fc100e41e2..d00c603c14 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -7,7 +7,7 @@ keywords: [Databricks, destination, data warehouse] --- # Databricks -*Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time and test environment* +*Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time, and a test environment.* ## Install dlt with Databricks **To install the DLT library with Databricks dependencies:** @@ -28,7 +28,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 1. Create a Databricks workspace in Azure - In your Azure Portal search for Databricks and create a new workspace. In the "Pricing Tier" section, select "Premium" to be able to use the Unity Catalog. + In your Azure Portal, search for Databricks and create a new workspace. In the "Pricing Tier" section, select "Premium" to be able to use the Unity Catalog. 2. Create an ADLS Gen 2 storage account @@ -42,7 +42,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 4. Create an Access Connector for Azure Databricks This will allow Databricks to access your storage account. - In the Azure Portal search for "Access Connector for Azure Databricks" and create a new connector. + In the Azure Portal, search for "Access Connector for Azure Databricks" and create a new connector. 5. Grant access to your storage container @@ -54,16 +54,16 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 1. Now go to your Databricks workspace - To get there from the Azure Portal, search for "Databricks" and select your Databricks and click "Launch Workspace". + To get there from the Azure Portal, search for "Databricks", select your Databricks, and click "Launch Workspace". 2. In the top right corner, click on your email address and go to "Manage Account" 3. Go to "Data" and click on "Create Metastore" Name your metastore and select a region. 
- If you'd like to set up a storage container for the whole metastore you can add your ADLS URL and Access Connector Id here. You can also do this on a granular level when creating the catalog. + If you'd like to set up a storage container for the whole metastore, you can add your ADLS URL and Access Connector Id here. You can also do this on a granular level when creating the catalog. - In the next step assign your metastore to your workspace. + In the next step, assign your metastore to your workspace. 4. Go back to your workspace and click on "Catalog" in the left-hand menu @@ -77,7 +77,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Set the URL of our storage container. This should be in the form: `abfss://@.dfs.core.windows.net/` - Once created you can test the connection to make sure the container is accessible from databricks. + Once created, you can test the connection to make sure the container is accessible from Databricks. 7. Now you can create a catalog @@ -113,7 +113,7 @@ Example: [destination.databricks.credentials] server_hostname = "MY_DATABRICKS.azuredatabricks.net" http_path = "/sql/1.0/warehouses/12345" -access_token "MY_ACCESS_TOKEN" +access_token = "MY_ACCESS_TOKEN" catalog = "my_catalog" ``` @@ -123,7 +123,7 @@ All write dispositions are supported ## Data loading Data is loaded using `INSERT VALUES` statements by default. -Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks. +Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled, `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks. For more information on staging, see the [staging support](#staging-support) section below. ## Supported file formats @@ -133,7 +133,7 @@ For more information on staging, see the [staging support](#staging-support) sec The `jsonl` format has some limitations when used with Databricks: -1. Compression must be disabled to load jsonl files in databricks. Set `data_writer.disable_compression` to `true` in dlt config when using this format. +1. Compression must be disabled to load jsonl files in Databricks. Set `data_writer.disable_compression` to `true` in dlt config when using this format. 2. The following data types are not supported when using `jsonl` format with `databricks`: `decimal`, `complex`, `date`, `binary`. Use `parquet` if your data contains these types. 3. `bigint` data type with precision is not supported with `jsonl` format @@ -144,16 +144,16 @@ Databricks supports both Amazon S3 and Azure Blob Storage as staging locations. ### Databricks and Amazon S3 -Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on connecting your s3 bucket with the bucket_url and credentials. +Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on connecting your S3 bucket with the bucket_url and credentials. 
-Example to set up Databricks with s3 as a staging destination: +Example to set up Databricks with S3 as a staging destination: ```python import dlt # Create a dlt pipeline that will load # chess player data to the Databricks destination -# via staging on s3 +# via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='databricks', @@ -195,4 +195,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks) - [Load data from Google Sheets to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-databricks) - [Load data from Chess.com to Databricks in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-databricks) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index db7428dcc9..9452a80c50 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -7,38 +7,38 @@ keywords: [duckdb, destination, data warehouse] # DuckDB ## Install dlt with DuckDB -**To install the DLT library with DuckDB dependencies:** +**To install the DLT library with DuckDB dependencies, run:** ``` pip install dlt[duckdb] ``` ## Setup Guide -**1. Initialize a project with a pipeline that loads to DuckDB by running** +**1. Initialize a project with a pipeline that loads to DuckDB by running:** ``` dlt init chess duckdb ``` -**2. Install the necessary dependencies for DuckDB by running** +**2. Install the necessary dependencies for DuckDB by running:** ``` pip install -r requirements.txt ``` -**3. Run the pipeline** +**3. Run the pipeline:** ``` python3 chess_pipeline.py ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. ## Data loading -`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are ok with installing `pyarrow` we suggest to switch to `parquet` as file format. Loading is faster (and also multithreaded). +`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are okay with installing `pyarrow`, we suggest switching to `parquet` as the file format. Loading is faster (and also multithreaded). ### Names normalization -`dlt` uses standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. If you want to use **duckdb** wide range of characters (ie. emojis) for table and column names, you can switch to **duck_case** naming convention which accepts almost any string as an identifier: +`dlt` uses the standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. 
If you want to use the **duckdb** wide range of characters (i.e., emojis) for table and column names, you can switch to the **duck_case** naming convention, which accepts almost any string as an identifier: * `\n` `\r` and `" are translated to `_` -* multiple `_` are translated to single `_` +* multiple `_` are translated to a single `_` Switch the naming convention using `config.toml`: ```toml @@ -46,31 +46,31 @@ Switch the naming convention using `config.toml`: naming="duck_case" ``` -or via env variable `SCHEMA__NAMING` or directly in code: +or via the env variable `SCHEMA__NAMING` or directly in the code: ```python dlt.config["schema.naming"] = "duck_case" ``` :::caution -**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name clashes if for example you load json with -`{"Column": 1, "column": 2}` will map data to a single column. +**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name clashes if, for example, you load JSON with +`{"Column": 1, "column": 2}` as it will map data to a single column. ::: ## Supported file formats -You can configure the following file formats to load data to duckdb +You can configure the following file formats to load data to duckdb: * [insert-values](../file-formats/insert-format.md) is used by default * [parquet](../file-formats/parquet.md) is supported :::note -`duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation `dlt` serializes the loads. Still - that may be faster than INSERT +`duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation, `dlt` serializes the loads. Still, that may be faster than INSERT. ::: -* [jsonl](../file-formats/jsonl.md) **is supported but does not work if JSON fields are optional. the missing keys fail the COPY instead of being interpreted as NULL** +* [jsonl](../file-formats/jsonl.md) **is supported but does not work if JSON fields are optional. The missing keys fail the COPY instead of being interpreted as NULL.** ## Supported column hints -`duckdb` may create unique indexes for all columns with `unique` hints but this behavior **is disabled by default** because it slows the loading down significantly. +`duckdb` may create unique indexes for all columns with `unique` hints, but this behavior **is disabled by default** because it slows the loading down significantly. ## Destination Configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:` which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. The `duckdb` credentials do not require any secret values. You are free to pass the configuration explicitly via the `credentials` parameter to `dlt.pipeline` or `pipeline.run` methods. 
For example: ```python @@ -88,17 +88,17 @@ db = duckdb.connect() p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials=db) ``` -This destination accepts database connection strings in format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration). +This destination accepts database connection strings in the format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration). -You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g. using a `secrets.toml` file) +You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g., using a `secrets.toml` file) ```toml destination.duckdb.credentials=duckdb:///_storage/test_quack.duckdb ``` -**duckdb://** url above creates a **relative** path to `_storage/test_quack.duckdb`. To define **absolute** path you need to specify four slashes ie. `duckdb:////_storage/test_quack.duckdb`. +The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`. A few special connection strings are supported: -* **:pipeline:** creates the database in the working directory of the pipeline with name `quack.duckdb`. -* **:memory:** creates in memory database. This may be useful for testing. +* **:pipeline:** creates the database in the working directory of the pipeline with the name `quack.duckdb`. +* **:memory:** creates an in-memory database. This may be useful for testing. ### Additional configuration @@ -109,10 +109,10 @@ create_indexes=true ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb) which is a community supported package. The `duckdb` database is shared with `dbt`. In rare cases you may see information that binary database format does not match the database format expected by `dbt-duckdb`. You may avoid that by updating the `duckdb` package in your `dlt` project with `pip install -U`. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. The `duckdb` database is shared with `dbt`. In rare cases, you may see information that the binary database format does not match the database format expected by `dbt-duckdb`. You can avoid that by updating the `duckdb` package in your `dlt` project with `pip install -U`. ### Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
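Since the loaded database stays available through `pipeline.sql_client()` (see the destination configuration above), a quick way to sanity-check a load is to query it right after `run`. A minimal sketch with made-up data, assuming the client opens in the pipeline's dataset schema:

```python
import dlt

pipeline = dlt.pipeline(pipeline_name="chess", destination="duckdb", dataset_name="chess_data")
pipeline.run([{"id": 1, "player": "magnus"}], table_name="players")

# sql_client() wraps DuckDBPyConnection, so the freshly loaded table can be queried directly
with pipeline.sql_client() as con:
    rows = con.execute_sql("SELECT id, player FROM players")
    print(rows)
```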
## Additional Setup guides @@ -124,4 +124,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from Chess.com to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-duckdb) - [Load data from HubSpot to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-duckdb) - [Load data from GitHub to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-duckdb) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/index.md b/docs/website/docs/dlt-ecosystem/destinations/index.md index 5d26c0f138..2c24d14312 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/index.md +++ b/docs/website/docs/dlt-ecosystem/destinations/index.md @@ -5,11 +5,11 @@ keywords: ['destinations'] --- import DocCardList from '@theme/DocCardList'; -Pick one of our high quality destinations and load your data to a local database, warehouse or a data lake. Append, replace or merge your data. Apply performance hints like partitions, clusters or indexes. Load directly or via staging. Each of our destinations goes through few hundred automated tests every day. +Pick one of our high-quality destinations and load your data into a local database, warehouse, or data lake. Append, replace, or merge your data. Apply performance hints like partitions, clusters, or indexes. Load directly or via staging. Each of our destinations undergoes several hundred automated tests every day. -* Destination or feature missing? [Join our Slack community](https://dlthub.com/community) and ask for it -* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6) +* Is a destination or feature missing? [Join our Slack community](https://dlthub.com/community) and ask for it. +* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6). -Otherwise pick a destination below: +Otherwise, pick a destination below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index b002286bcf..1288b9caac 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -5,7 +5,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] --- # MotherDuck -> šŸ§Ŗ MotherDuck is still invitation only and intensively tested. Please see the limitations / problems at the end. +> šŸ§Ŗ MotherDuck is still invitation-only and is being intensively tested. Please see the limitations/problems at the end. ## Install dlt with MotherDuck **To install the DLT library with MotherDuck dependencies:** @@ -14,12 +14,12 @@ pip install dlt[motherduck] ``` :::tip -Decrease the number of load workers to 3-5 depending on the quality of your internet connection if you see a lot of retries in your logs with various timeout, add the following to your `config.toml`: +If you see a lot of retries in your logs with various timeouts, decrease the number of load workers to 3-5 depending on the quality of your internet connection. Add the following to your `config.toml`: ```toml [load] workers=3 ``` -or export **LOAD__WORKERS=3** env variable. 
See more in [performance](../../reference/performance.md) +or export the **LOAD__WORKERS=3** env variable. See more in [performance](../../reference/performance.md) ::: ## Setup Guide @@ -34,7 +34,7 @@ dlt init chess motherduck pip install -r requirements.txt ``` -This will install dlt with **motherduck** extra which contains **duckdb** and **pyarrow** dependencies +This will install dlt with the **motherduck** extra which contains **duckdb** and **pyarrow** dependencies. **3. Add your MotherDuck token to `.dlt/secrets.toml`** ```toml @@ -42,63 +42,61 @@ This will install dlt with **motherduck** extra which contains **duckdb** and ** database = "dlt_data_3" password = "" ``` -Paste your **service token** into password. The `database` field is optional but we recommend to set it. MotherDuck will create this database (in this case `dlt_data_3`) for you. +Paste your **service token** into the password field. The `database` field is optional, but we recommend setting it. MotherDuck will create this database (in this case `dlt_data_3`) for you. -Alternatively you can use the connection string syntax +Alternatively, you can use the connection string syntax. ```toml [destination] motherduck.credentials="md:///dlt_data_3?token=" ``` -**3. Run the pipeline** +**4. Run the pipeline** ``` python3 chess_pipeline.py ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. ## Data loading -By default **parquet** files and `COPY` command is used to move files to remote duckdb database. All write dispositions are supported. +By default, Parquet files and the `COPY` command are used to move files to the remote duckdb database. All write dispositions are supported. -**INSERT** format is also supported and will execute a large INSERT queries directly into the remote database. This is way slower and may exceed maximum query size - so not advised. +The **INSERT** format is also supported and will execute large INSERT queries directly into the remote database. This method is significantly slower and may exceed the maximum query size, so it is not advised. ## dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb) which is a community supported package. `dbt` version >= 1.5 is required (which is current `dlt` default.) +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. `dbt` version >= 1.5 is required (which is the current `dlt` default.) ## Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). ## Automated tests -Each destination must pass few hundred automatic tests. MotherDuck is passing those tests (except the transactions OFC). However we encountered issues with ATTACH timeouts when connecting which makes running such number of tests unstable. Tests on CI are disabled. +Each destination must pass a few hundred automatic tests. MotherDuck is passing these tests (except for the transactions, of course). However, we have encountered issues with ATTACH timeouts when connecting, which makes running such a number of tests unstable. Tests on CI are disabled. 
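For completeness, the credentials can also be passed directly in code instead of `secrets.toml`. A minimal sketch, assuming the `md:///...?token=` connection string form shown above; the service token and the loaded row are placeholders:

```python
import dlt

# A sketch: pass the MotherDuck connection string directly; the service token
# below is a placeholder and the loaded row is just a smoke test.
pipeline = dlt.pipeline(
    pipeline_name="chess",
    destination="motherduck",
    dataset_name="chess_data",
    credentials="md:///dlt_data_3?token=<my service token>",
)
info = pipeline.run([{"id": 1}], table_name="smoke_test")
print(info)
```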
## Troubleshooting / limitations ### I see a lot of errors in the log like DEADLINE_EXCEEDED or Connection timed out -Motherduck is very sensitive to quality of the internet connection and **number of workers used to load data**. Decrease the number of workers and make sure your internet connection really works. We could not find any way to increase those timeouts yet. - +MotherDuck is very sensitive to the quality of the internet connection and the **number of workers used to load data**. Decrease the number of workers and ensure your internet connection is stable. We have not found any way to increase these timeouts yet. ### MotherDuck does not support transactions. -Do not use `begin`, `commit` and `rollback` on `dlt` **sql_client** or on duckdb dbapi connection. It has no effect for DML statements (they are autocommit). It is confusing the query engine for DDL (tables not found etc.). -If your connection if of poor quality and you get a time out when executing DML query it may happen that your transaction got executed, - +Do not use `begin`, `commit`, and `rollback` on `dlt` **sql_client** or on the duckdb dbapi connection. It has no effect on DML statements (they are autocommit). It confuses the query engine for DDL (tables not found, etc.). +If your connection is of poor quality and you get a timeout when executing a DML query, it may happen that your transaction got executed. ### I see some exception with home_dir missing when opening `md:` connection. -Some internal component (HTTPS) requires **HOME** env variable to be present. Export such variable to the command line. Here is what we do in our tests: +Some internal component (HTTPS) requires the **HOME** env variable to be present. Export such a variable to the command line. Here is what we do in our tests: ```python os.environ["HOME"] = "/tmp" ``` -before opening connection +before opening the connection. ### I see some watchdog timeouts. We also see them. ``` 'ATTACH_DATABASE': keepalive watchdog timeout ``` -My observation is that if you write a lot of data into the database then close the connection and then open it again to write, there's a chance of such timeout. Possible **WAL** file is being written to the remote duckdb database. +Our observation is that if you write a lot of data into the database, then close the connection and then open it again to write, there's a chance of such a timeout. A possible **WAL** file is being written to the remote duckdb database. ### Invalid Input Error: Initialization function "motherduck_init" from file Use `duckdb 0.8.1` or above. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 9d216a52a3..5ed4b69707 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -7,7 +7,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] # Microsoft SQL Server ## Install dlt with MS SQL -**To install the DLT library with MS SQL dependencies:** +**To install the DLT library with MS SQL dependencies, use:** ``` pip install dlt[mssql] ``` @@ -16,23 +16,23 @@ pip install dlt[mssql] ### Prerequisites -_Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. -This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. 
You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). +The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. +This cannot be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` * `ODBC Driver 17 for SQL Server` -You can [configure driver name](#additional-destination-options) explicitly as well. +You can also [configure the driver name](#additional-destination-options) explicitly. ### Create a pipeline -**1. Initalize a project with a pipeline that loads to MS SQL by running** +**1. Initialize a project with a pipeline that loads to MS SQL by running:** ``` dlt init chess mssql ``` -**2. Install the necessary dependencies for MS SQL by running** +**2. Install the necessary dependencies for MS SQL by running:** ``` pip install -r requirements.txt ``` @@ -40,11 +40,11 @@ or run: ``` pip install dlt[mssql] ``` -This will install dlt with **mssql** extra which contains all the dependencies required by the SQL server client. +This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. **3. Enter your credentials into `.dlt/secrets.toml`.** -Example, replace with your database connection info: +For example, replace with your database connection info: ```toml [destination.mssql.credentials] database = "dlt_data" @@ -61,34 +61,34 @@ You can also pass a SQLAlchemy-like database connection: destination.mssql.credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15" ``` -To pass credentials directly you can use `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. +To pass credentials directly, you can use the `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. ```python pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15") ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with an `ALTER SCHEMA ... TRANSFER`. The operation is atomic: mssql supports DDL transactions. ## Data loading -Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows per INSERT and this is what we use. +Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows per INSERT, and this is what we use. ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default ## Supported column hints -**mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled** +**mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. 
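To make the `unique` hint concrete, here is a minimal sketch of a resource that declares one explicitly; the data and column names are illustrative, and whether an actual UNIQUE index is created is controlled by the `create_indexes` option described in the additional destination options below.

```python
import dlt

# A sketch: declare a `unique` column hint on a resource; index creation itself
# follows the mssql `create_indexes` setting (see additional destination options).
@dlt.resource(primary_key="customer_id", columns={"customer_id": {"unique": True}})
def customers():
    yield [{"customer_id": 1, "name": "Alice"}, {"customer_id": 2, "name": "Bob"}]

pipeline = dlt.pipeline(pipeline_name="chess", destination="mssql", dataset_name="chess_data")
pipeline.run(customers())
```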
## Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). ## Data types -MS SQL does not support JSON columns, so JSON objects are stored as strings in `nvarchar` column. +MS SQL does not support JSON columns, so JSON objects are stored as strings in `nvarchar` columns. ## Additional destination options -**mssql** destination **does not** creates UNIQUE indexes by default on columns with `unique` hint (ie. `_dlt_id`). To enable this behavior +The **mssql** destination **does not** create UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To enable this behavior: ```toml [destination.mssql] create_indexes=true @@ -108,7 +108,7 @@ destination.mssql.credentials="mssql://loader:@loader.database.windows ``` ### dbt support -No dbt support yet +No dbt support yet. ## Additional Setup guides @@ -120,4 +120,4 @@ No dbt support yet - [Load data from GitHub to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-mssql) - [Load data from Notion to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-mssql) - [Load data from HubSpot to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-mssql) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index cd0ea08929..10b935c083 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -7,47 +7,47 @@ keywords: [postgres, destination, data warehouse] # Postgres ## Install dlt with PostgreSQL -**To install the DLT library with PostgreSQL dependencies:** +**To install the DLT library with PostgreSQL dependencies, run:** ``` pip install dlt[postgres] ``` ## Setup Guide -**1. Initialize a project with a pipeline that loads to Postgres by running** +**1. Initialize a project with a pipeline that loads to Postgres by running:** ``` dlt init chess postgres ``` -**2. Install the necessary dependencies for Postgres by running** +**2. Install the necessary dependencies for Postgres by running:** ``` pip install -r requirements.txt ``` -This will install dlt with **postgres** extra which contains `psycopg2` client. +This will install dlt with the `postgres` extra, which contains the `psycopg2` client. -**3. Create a new database after setting up a Postgres instance and `psql` / query editor by running** +**3. After setting up a Postgres instance and `psql` / query editor, create a new database by running:** ``` CREATE DATABASE dlt_data; ``` -Add `dlt_data` database to `.dlt/secrets.toml`. +Add the `dlt_data` database to `.dlt/secrets.toml`. -**4. Create a new user by running** +**4. Create a new user by running:** ``` CREATE USER loader WITH PASSWORD ''; ``` -Add `loader` user and `` password to `.dlt/secrets.toml`. +Add the `loader` user and `` password to `.dlt/secrets.toml`. -**5. Give the `loader` user owner permissions by running** +**5. Give the `loader` user owner permissions by running:** ``` ALTER DATABASE dlt_data OWNER TO loader; ``` -It is possible to set more restrictive permissions (e.g. give user access to a specific schema). 
+You can set more restrictive permissions (e.g., give user access to a specific schema). **6. Enter your credentials into `.dlt/secrets.toml`.** -It should now look like +It should now look like this: ```toml [destination.postgres.credentials] @@ -59,33 +59,33 @@ port = 5432 connect_timeout = 15 ``` -You can also pass a database connection string similar to the one used by `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). Credentials above will look like this: +You can also pass a database connection string similar to the one used by the `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). The credentials above will look like this: ```toml # keep it at the top of your toml file! before any section starts destination.postgres.credentials="postgresql://loader:@localhost/dlt_data?connect_timeout=15" ``` -To pass credentials directly you can use `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. +To pass credentials directly, you can use the `credentials` argument passed to the `dlt.pipeline` or `pipeline.run` methods. ```python pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="postgresql://loader:@localhost/dlt_data") ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and replaced by the staging tables. +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables. ## Data loading `dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). ## Supported file formats -* [insert-values](../file-formats/insert-format.md) is used by default +* [insert-values](../file-formats/insert-format.md) is used by default. ## Supported column hints -`postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled** +`postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. ## Additional destination options -Postgres destination creates UNIQUE indexes by default on columns with `unique` hint (ie. `_dlt_id`). To disable this behavior +The Postgres destination creates UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To disable this behavior: ```toml [destination.postgres] create_indexes=false @@ -95,16 +95,16 @@ create_indexes=false This destination [integrates with dbt](../transformations/dbt/dbt.md) via dbt-postgres. ### Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
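As a small illustration of the write dispositions mentioned above, here is a minimal sketch of a `merge` load into Postgres; the resource name, key, and data are made up. Rows that share a primary key with already-loaded data replace it, while new keys are inserted.

```python
import dlt

# A sketch of the `merge` write disposition; names and data are illustrative.
@dlt.resource(primary_key="id", write_disposition="merge")
def users():
    yield [
        {"id": 1, "email": "alice@example.com"},
        {"id": 2, "email": "bob@example.com"},
    ]

pipeline = dlt.pipeline(pipeline_name="chess", destination="postgres", dataset_name="chess_data")
info = pipeline.run(users())
print(info)
```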
-## Additional Setup guides - -- [Load data from HubSpot to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-postgres) -- [Load data from GitHub to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-postgres) -- [Load data from Chess.com to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-postgres) -- [Load data from Notion to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-postgres) -- [Load data from Google Analytics to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) -- [Load data from Google Sheets to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-postgres) -- [Load data from Stripe to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-postgres) - \ No newline at end of file +## Additional Setup Guides + +- [Load data from HubSpot to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-postgres) +- [Load data from GitHub to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-postgres) +- [Load data from Chess.com to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-postgres) +- [Load data from Notion to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-postgres) +- [Load data from Google Analytics to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) +- [Load data from Google Sheets to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-postgres) +- [Load data from Stripe to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-postgres) + diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 04b5cac19b..ff37252852 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -92,7 +92,7 @@ qdrant_adapter(data, embed) It accepts the following arguments: -- `data`: a dlt resource object or a Python data structure (e.g. a list of dictionaries). +- `data`: a dlt resource object or a Python data structure (e.g., a list of dictionaries). - `embed`: a name of the field or a list of names to generate embeddings for. Returns: [DLT resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. @@ -135,7 +135,7 @@ info = pipeline.run( ### Merge The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination. -For `merge` disposition, you would need to specify a `primary_key` for the resource: +For the `merge` disposition, you need to specify a `primary_key` for the resource: ```python info = pipeline.run( @@ -166,7 +166,7 @@ Qdrant uses collections to categorize and identify data. 
To avoid potential nami For example, if you have a dataset named `movies_dataset` and a table named `actors`, the Qdrant collection name would be `movies_dataset_actors` (the default separator is an underscore). -However, if you prefer to have class names without the dataset prefix, skip `dataset_name` argument. +However, if you prefer to have class names without the dataset prefix, skip the `dataset_name` argument. For example: @@ -185,7 +185,7 @@ pipeline = dlt.pipeline( - `upload_batch_size`: (int) The batch size for data uploads. The default value is 64. -- `upload_parallelism`: (int) The maximal number of concurrent threads to run data uploads. The default value is 1. +- `upload_parallelism`: (int) The maximum number of concurrent threads to run data uploads. The default value is 1. - `upload_max_retries`: (int) The number of retries to upload data in case of failure. The default value is 3. @@ -222,4 +222,4 @@ You can find the setup instructions to run Qdrant [here](https://qdrant.tech/doc Qdrant destination supports syncing of the `dlt` state. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index cb220a31fc..bc03dbbbeb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -29,7 +29,7 @@ pip install -r requirements.txt or withĀ `pip install dlt[redshift]`,Ā which installs theĀ `dlt`Ā library and the necessary dependencies for working with Amazon Redshift as a destination. ### 2. Setup Redshift cluster -To load data into Redshift, it is necessary to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: +To load data into Redshift, you need to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: 1. You can use an existing cluster or create a new one. 2. To create a new cluster, navigate to the 'Provisioned Cluster Dashboard' and click 'Create Cluster'. @@ -59,9 +59,9 @@ To load data into Redshift, it is necessary to create a Redshift cluster and ena redshift-cluster-1.cv3cmsy7t4il.us-east-1.redshift.amazonaws.com ``` -3. The `connect_timeout` is the number of minutes the pipeline will wait before the timeout. +3. The `connect_timeout` is the number of minutes the pipeline will wait before timing out. -You can also pass a database connection string similar to the one used by `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). Credentials above will look like this: +You can also pass a database connection string similar to the one used by the `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). The credentials above will look like this: ```toml # keep it at the top of your toml file! before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" @@ -82,25 +82,24 @@ When staging is enabled: > ā— **Redshift cannot load `TIME` columns from `json` or `parquet` files**. `dlt` will fail such jobs permanently. Switch to direct `insert_values` to load time columns. 
-> ā— **Redshift cannot detect compression type from `json` files**. `dlt` assumes that `jsonl` files are gzip compressed which is the default. - -> ā— **Redshift loads `complex` types as strings into SUPER with `parquet`**. Use `jsonl` format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON``. +> ā— **Redshift cannot detect compression type from `json` files**. `dlt` assumes that `jsonl` files are gzip compressed, which is the default. +> ā— **Redshift loads `complex` types as strings into SUPER with `parquet`**. Use `jsonl` format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON`. ## Supported column hints Amazon Redshift supports the following column hints: -- `cluster` - hint is a Redshift term for table distribution. Applying it to a column makes it the "DISTKEY," affecting query and join performance. Check the following [documentation](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-best-dist-key.html) for more info. -- `sort` - creates SORTKEY to order rows on disk physically. It is used to improve a query and join speed in Redshift, please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. +- `cluster` - This hint is a Redshift term for table distribution. Applying it to a column makes it the "DISTKEY," affecting query and join performance. Check the following [documentation](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-best-dist-key.html) for more info. +- `sort` - This hint creates a SORTKEY to order rows on disk physically. It is used to improve query and join speed in Redshift. Please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. ## Staging support -Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask redshift to copy their data directly into the db. Please refere to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. 
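A minimal sketch of wiring this together is shown below; `chess_source` is a placeholder for your own source, and the S3 bucket is assumed to be configured for the `filesystem` destination as described above.

```python
import dlt

# A sketch of Redshift with S3 staging; the bucket itself is configured separately
# for the filesystem destination (bucket_url and credentials).
pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="redshift",
    staging="filesystem",  # stage load files in the configured S3 bucket
    dataset_name="player_data",
)
# parquet is the default staging format; jsonl can be requested per run:
# info = pipeline.run(chess_source(), loader_file_format="jsonl")
```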
-### Authentication iam Role +### Authentication IAM Role -If you would like to load from s3 without forwarding the aws staging credentials but authorize with an iam role connected to Redshift, follow the [Redshift documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/authorizing-redshift-service.html) to create a role with access to s3 linked to your redshift cluster and change your destination settings to use the iam role: +If you would like to load from s3 without forwarding the AWS staging credentials but authorize with an IAM role connected to Redshift, follow the [Redshift documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/authorizing-redshift-service.html) to create a role with access to s3 linked to your Redshift cluster and change your destination settings to use the IAM role: ```toml [destination] @@ -143,4 +142,4 @@ Supported loader file formats for Redshift are `sql` and `insert_values` (defaul - [Load data from GitHub to Redshift in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-redshift) - [Load data from Stripe to Redshift in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-redshift) - [Load data from Google Sheets to Redshift in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-redshift) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 34efb0df39..a6058a255e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -7,30 +7,30 @@ keywords: [Snowflake, destination, data warehouse] # Snowflake ## Install dlt with Snowflake -**To install the DLT library with Snowflake dependencies:** +**To install the DLT library with Snowflake dependencies, run:** ``` pip install dlt[snowflake] ``` ## Setup Guide -**1. Initialize a project with a pipeline that loads to snowflake by running** +**1. Initialize a project with a pipeline that loads to Snowflake by running:** ``` dlt init chess snowflake ``` -**2. Install the necessary dependencies for snowflake by running** +**2. Install the necessary dependencies for Snowflake by running:** ``` pip install -r requirements.txt ``` -This will install dlt with **snowflake** extra which contains Snowflake Python dbapi client. +This will install `dlt` with the `snowflake` extra, which contains the Snowflake Python dbapi client. -**3. Create a new database, user and give dlt access** +**3. Create a new database, user, and give dlt access.** Read the next chapter below. **4. Enter your credentials into `.dlt/secrets.toml`.** -It should now look like +It should now look like this: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -40,14 +40,13 @@ host = "kgiotue-wn98412" warehouse = "COMPUTE_WH" role = "DLT_LOADER_ROLE" ``` -In case of snowflake **host** is your [Account Identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier). You can get in **Admin**/**Accounts** by copying account url: -https://kgiotue-wn98412.snowflakecomputing.com and extracting the host name (**kgiotue-wn98412**) +In the case of Snowflake, the **host** is your [Account Identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier). 
You can get it in **Admin**/**Accounts** by copying the account URL: https://kgiotue-wn98412.snowflakecomputing.com and extracting the host name (**kgiotue-wn98412**). -The **warehouse** and **role** are optional if you assign defaults to your user. In the example below we do not do that, so we set them explicitly. +The **warehouse** and **role** are optional if you assign defaults to your user. In the example below, we do not do that, so we set them explicitly. ### Setup the database user and permissions -Instructions below assume that you use the default account setup that you get after creating Snowflake account. You should have default warehouse named **COMPUTE_WH** and snowflake account. Below we create a new database, user and assign permissions. The permissions are very generous. A more experienced user can easily reduce `dlt` permissions to just one schema in the database. +The instructions below assume that you use the default account setup that you get after creating a Snowflake account. You should have a default warehouse named **COMPUTE_WH** and a Snowflake account. Below, we create a new database, user, and assign permissions. The permissions are very generous. A more experienced user can easily reduce `dlt` permissions to just one schema in the database. ```sql --create database with standard settings CREATE DATABASE dlt_data; @@ -67,17 +66,17 @@ GRANT ALL PRIVILEGES ON FUTURE SCHEMAS IN DATABASE dlt_data TO DLT_LOADER_ROLE; GRANT ALL PRIVILEGES ON FUTURE TABLES IN DATABASE dlt_data TO DLT_LOADER_ROLE; ``` -Now you can use the user named `LOADER` to access database `DLT_DATA` and log in with specified password. +Now you can use the user named `LOADER` to access the database `DLT_DATA` and log in with the specified password. You can also decrease the suspend time for your warehouse to 1 minute (**Admin**/**Warehouses** in Snowflake UI) ### Authentication types -Snowflake destination accepts three authentication types +Snowflake destination accepts three authentication types: - password authentication - [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) - external authentication -The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). +The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as the [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). You can also pass credentials as a database connection string. For example: ```toml @@ -85,7 +84,7 @@ You can also pass credentials as a database connection string. For example: destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE" ``` -In **key pair authentication** you replace password with a private key string that should be in Base64-encoded DER format ([DBT also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. In that case you must provide a passphrase alongside with the private key. 
+In **key pair authentication**, you replace the password with a private key string that should be in Base64-encoded DER format ([DBT also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. In that case, you must provide a passphrase alongside the private key. ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -96,13 +95,13 @@ private_key_passphrase="passphrase" ``` > You can easily get the base64-encoded value of your private key by running `base64 -i .pem` in your terminal -If you pass a passphrase in the connection string, please url encode it. +If you pass a passphrase in the connection string, please URL encode it. ```toml # keep it at the top of your toml file! before any section starts destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?private_key=&private_key_passphrase=" ``` -In **external authentication** you can use oauth provider like Okta or external browser to authenticate. You pass your authenticator and refresh token as below: +In **external authentication**, you can use an OAuth provider like Okta or an external browser to authenticate. You pass your authenticator and refresh token as below: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -110,17 +109,17 @@ username = "loader" authenticator="..." token="..." ``` -or in connection string as query parameters. +or in the connection string as query parameters. Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. ## Data loading -The data is loaded using internal Snowflake stage. We use `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). +The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default @@ -131,47 +130,47 @@ When staging is enabled: * [jsonl](../file-formats/jsonl.md) is used by default * [parquet](../file-formats/parquet.md) is supported -> ā— When loading from `parquet`, Snowflake will store `complex` types (JSON) in `VARIANT` as string. Use `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT`` field after loading. +> ā— When loading from `parquet`, Snowflake will store `complex` types (JSON) in `VARIANT` as a string. Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading. ## Supported column hints Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `cluster` - creates a cluster column(s). Many column per table are supported and only when a new table is created. +* `cluster` - creates a cluster column(s). 
Many columns per table are supported and only when a new table is created. ### Table and column identifiers -Snowflake makes all unquoted identifiers uppercase and then resolves them case-insensitive in SQL statements. `dlt` (effectively) does not quote identifies in DDL preserving default behavior. +Snowflake makes all unquoted identifiers uppercase and then resolves them case-insensitively in SQL statements. `dlt` (effectively) does not quote identifiers in DDL, preserving default behavior. -Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools ie. `dbt`. In case of `dlt` it is however trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) +Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools, i.e., `dbt`. In the case of `dlt`, it is, however, trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) ## Staging support -Snowflake supports s3 and gcs as a file staging destinations. dlt will upload files in the parquet format to the bucket provider and will ask snowflake to copy their data directly into the db. +Snowflake supports S3 and GCS as file staging destinations. dlt will upload files in the parquet format to the bucket provider and will ask Snowflake to copy their data directly into the db. -Alternavitely to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. ### Snowflake and Amazon S3 -Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your bucket with the bucket_url and credentials. For s3 The dlt Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively you can create a stage for your S3 Bucket by following the instructions provided in the [Snowflake S3 documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration). +Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your bucket with the bucket_url and credentials. For S3, the dlt Redshift loader will use the AWS credentials provided for S3 to access the S3 bucket if not specified otherwise (see config options below). Alternatively, you can create a stage for your S3 Bucket by following the instructions provided in the [Snowflake S3 documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration). The basic steps are as follows: * Create a storage integration linked to GCS and the right bucket -* Grant access to this storage integration to the snowflake role you are using to load the data into snowflake. +* Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into snowflake. 
+* Also grant access to this stage for the role you are using to load data into Snowflake. * Provide the name of your stage (including the namespace) to dlt like so: -To prevent dlt from forwarding the s3 bucket credentials on every command, and set your s3 stage, change these settings: +To prevent dlt from forwarding the S3 bucket credentials on every command, and set your S3 stage, change these settings: ```toml [destination] stage_name=PUBLIC.my_s3_stage ``` -To run Snowflake with s3 as staging destination: +To run Snowflake with S3 as the staging destination: ```python # Create a dlt pipeline that will load -# chess player data to the snowflake destination -# via staging on s3 +# chess player data to the Snowflake destination +# via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='snowflake', @@ -182,12 +181,12 @@ pipeline = dlt.pipeline( ### Snowflake and Google Cloud Storage -Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your bucket with the bucket_url and credentials. For gcs you can define a stage in Snowflake and provide the stage identifier in the configuration (see config options below.) Please consult the snowflake Documentation on [how to create a stage for your GCS Bucket](https://docs.snowflake.com/en/user-guide/data-load-gcs-config). The basic steps are as follows: +Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your bucket with the bucket_url and credentials. For GCS, you can define a stage in Snowflake and provide the stage identifier in the configuration (see config options below.) Please consult the Snowflake Documentation on [how to create a stage for your GCS Bucket](https://docs.snowflake.com/en/user-guide/data-load-gcs-config). The basic steps are as follows: * Create a storage integration linked to GCS and the right bucket -* Grant access to this storage integration to the snowflake role you are using to load the data into snowflake. +* Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into snowflake. +* Also grant access to this stage for the role you are using to load data into Snowflake. * Provide the name of your stage (including the namespace) to dlt like so: ```toml @@ -195,12 +194,12 @@ Please refer to the [Google Storage filesystem documentation](./filesystem.md#go stage_name=PUBLIC.my_gcs_stage ``` -To run Snowflake with gcs as staging destination: +To run Snowflake with GCS as the staging destination: ```python # Create a dlt pipeline that will load -# chess player data to the snowflake destination -# via staging on gcs +# chess player data to the Snowflake destination +# via staging on GCS pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='snowflake', @@ -211,14 +210,14 @@ pipeline = dlt.pipeline( ### Snowflake and Azure Blob Storage -Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For azure the Snowflake loader will use -the filesystem credentials for your azure blob storage container if not specified otherwise (see config options below). 
Alternatively you can define an external stage in Snowflake and provide the stage identifier. -Please consult the snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: +Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For Azure, the Snowflake loader will use +the filesystem credentials for your Azure Blob Storage container if not specified otherwise (see config options below). Alternatively, you can define an external stage in Snowflake and provide the stage identifier. +Please consult the Snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: * Create a storage integration linked to Azure Blob Storage and the right container -* Grant access to this storage integration to the snowflake role you are using to load the data into snowflake. +* Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into snowflake. +* Also grant access to this stage for the role you are using to load data into Snowflake. * Provide the name of your stage (including the namespace) to dlt like so: ```toml @@ -226,12 +225,12 @@ Please consult the snowflake Documentation on [how to create a stage for your Az stage_name=PUBLIC.my_azure_stage ``` -To run Snowflake with azure as staging destination: +To run Snowflake with Azure as the staging destination: ```python # Create a dlt pipeline that will load -# chess player data to the snowflake destination -# via staging on azure +# chess player data to the Snowflake destination +# via staging on Azure pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='snowflake', @@ -241,7 +240,7 @@ pipeline = dlt.pipeline( ``` ## Additional destination options -You can define your own stage to PUT files and disable removing of the staged files after loading. +You can define your own stage to PUT files and disable the removal of the staged files after loading. ```toml [destination.snowflake] # Use an existing named stage instead of the default. Default uses the implicit table stage per table @@ -251,7 +250,7 @@ keep_staged_files=true ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication is supported and shared with dbt runners. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication are supported and shared with dbt runners. 
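The additional destination options above (`stage_name`, `keep_staged_files`) can also be kept in code; the sketch below assumes the `destination.snowflake.*` keys from the TOML example can be set via `dlt.config`, and the stage name is a placeholder for an existing named stage.

```python
import dlt

# A sketch, assuming in-code configuration mirrors the TOML keys above;
# the stage name is a placeholder for an existing named stage.
dlt.config["destination.snowflake.stage_name"] = "PUBLIC.my_s3_stage"
dlt.config["destination.snowflake.keep_staged_files"] = True

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="snowflake",
    staging="filesystem",
    dataset_name="player_data",
)
```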
### Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) @@ -266,4 +265,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from HubSpot to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-snowflake) - [Load data from Chess.com to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-snowflake) - [Load data from Google Sheets to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-snowflake) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index 6ace1ac5a8..bac184fd41 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -18,13 +18,13 @@ pip install dlt[synapse] * **Microsoft ODBC Driver for SQL Server** - _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. + The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` - > šŸ’” Older driver versions don't properly work, because they don't support the `LongAsMax` keyword that got [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. + > šŸ’” Older driver versions don't work properly because they don't support the `LongAsMax` keyword that was [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. * **Azure Synapse Workspace and dedicated SQL pool** You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you don't have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). @@ -67,7 +67,7 @@ GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; -- only required when loadi Optionally, you can create a `WORKLOAD GROUP` and add the `loader` user as a member to manage [workload isolation](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation). 
See the [instructions](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) on setting up a loader user for an example of how to do this. -**3. Enter your credentials into `.dlt/secrets.toml`.** +**4. Enter your credentials into `.dlt/secrets.toml`.** Example, replace with your database connection info: ```toml @@ -97,7 +97,7 @@ pipeline = dlt.pipeline( ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables with an `ALTER SCHEMA ... TRANSFER` command. Please note that this operation is **not** atomicā€”it involves multiple DDL commands and Synapse does not support DDL transactions. @@ -134,12 +134,11 @@ Possible values: > ā— Important: >* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)). >* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`. ->* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions. ->* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**ā€”any configuration of the table index types is ignored. The HEAP strategy makes sense - for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). ->* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table. ->* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." ->* Child tables, if any, inherent the table index type of their parent table. +>* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice because it supports all data types and doesn't require conversions. +>* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**ā€”any configuration of the table index types is ignored. The HEAP strategy makes sense for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). +>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table. 
+>* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." +>* Child tables, if any, inherit the table index type of their parent table. ## Supported column hints @@ -148,7 +147,7 @@ Synapse supports the following [column hints](https://dlthub.com/docs/general-us * `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column * `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column -> ā— These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to innacurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). +> ā— These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). ## Staging support Synapse supports Azure Blob Storage (both standard and [ADLS Gen2](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)) as a file staging destination. `dlt` first uploads Parquet files to the blob container, and then instructs Synapse to read the Parquet file and load its data into a Synapse table using the [COPY INTO](https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql) statement. @@ -190,9 +189,9 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy ``` Descriptions: -- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. +- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. - `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied. -- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-priviliged role) assigned on the blob container if you set this option to `"true"`. +- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. 
Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-privileged role) assigned on the blob container if you set this option to `"true"`. - `port` used for the ODBC connection. - `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. @@ -212,4 +211,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from GitHub to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-synapse) - [Load data from Stripe to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-synapse) - [Load data from Chess.com to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-synapse) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 2ec09e9c24..6bd52acd35 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -6,8 +6,8 @@ keywords: [weaviate, vector database, destination, dlt] # Weaviate -[Weaviate](https://weaviate.io/) is an open source vector database. It allows you to store data objects and perform similarity searches over them. -This destination helps you to load data into Weaviate from [dlt resources](../../general-usage/resource.md). +[Weaviate](https://weaviate.io/) is an open-source vector database. It allows you to store data objects and perform similarity searches over them. +This destination helps you load data into Weaviate from [dlt resources](../../general-usage/resource.md). ## Setup Guide @@ -30,13 +30,13 @@ X-OpenAI-Api-Key = "your-openai-api-key" In this setup guide, we are using the [Weaviate Cloud Services](https://console.weaviate.cloud/) to get a Weaviate instance and [OpenAI API](https://platform.openai.com/) for generating embeddings through the [text2vec-openai](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-openai) module. -You can host your own weaviate instance using docker compose, kubernetes or embedded. Refer to Weaviate's [How-to: Install](https://weaviate.io/developers/weaviate/installation) or [dlt recipe we use for our tests](#run-weaviate-fully-standalone). In that case you can skip the credentials part altogether: +You can host your own Weaviate instance using Docker Compose, Kubernetes, or embedded. Refer to Weaviate's [How-to: Install](https://weaviate.io/developers/weaviate/installation) or [dlt recipe we use for our tests](#run-weaviate-fully-standalone). In that case, you can skip the credentials part altogether: ```toml [destination.weaviate.credentials.additional_headers] X-OpenAI-Api-Key = "your-openai-api-key" ``` -The `url` will default to **http://localhost:8080** and `api_key` is not defined - which are the defaults for Weaviate container. +The `url` will default to **http://localhost:8080** and `api_key` is not defined - which are the defaults for the Weaviate container. 3. Define the source of the data. For starters, let's load some data from a simple data structure: @@ -101,7 +101,7 @@ weaviate_adapter(data, vectorize, tokenization) ``` It accepts the following arguments: -- `data`: a dlt resource object or a Python data structure (e.g. 
a list of dictionaries). +- `data`: a dlt resource object or a Python data structure (e.g., a list of dictionaries). - `vectorize`: a name of the field or a list of names that should be vectorized by Weaviate. - `tokenization`: the dictionary containing the tokenization configuration for a field. The dictionary should have the following structure `{'field_name': 'method'}`. Valid methods are "word", "lowercase", "whitespace", "field". The default is "word". See [Property tokenization](https://weaviate.io/developers/weaviate/config-refs/schema#property-tokenization) in Weaviate documentation for more details. @@ -146,7 +146,7 @@ info = pipeline.run( ### Merge The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data in the destination. -For `merge` disposition you would need to specify a `primary_key` for the resource: +For the `merge` disposition, you would need to specify a `primary_key` for the resource: ```python info = pipeline.run( @@ -159,18 +159,18 @@ info = pipeline.run( ) ``` -Internally dlt will use `primary_key` (`document_id` in the example above) to generate a unique identifier ([UUID](https://weaviate.io/developers/weaviate/manage-data/create#id)) for each object in Weaviate. If the object with the same UUID already exists in Weaviate, it will be updated with the new data. Otherwise, a new object will be created. +Internally, dlt will use `primary_key` (`document_id` in the example above) to generate a unique identifier ([UUID](https://weaviate.io/developers/weaviate/manage-data/create#id)) for each object in Weaviate. If the object with the same UUID already exists in Weaviate, it will be updated with the new data. Otherwise, a new object will be created. :::caution -If you are using the merge write disposition, you must set it from the first run of your pipeline, otherwise the data will be duplicated in the database on subsequent loads. +If you are using the `merge` write disposition, you must set it from the first run of your pipeline; otherwise, the data will be duplicated in the database on subsequent loads. ::: ### Append -This is the default disposition. It will append the data to the existing data in the destination ignoring the `primary_key` field. +This is the default disposition. It will append the data to the existing data in the destination, ignoring the `primary_key` field. ## Data loading @@ -199,7 +199,7 @@ Weaviate uses classes to categorize and identify data. To avoid potential naming For example, if you have a dataset named `movies_dataset` and a table named `actors`, the Weaviate class name would be `MoviesDataset_Actors` (the default separator is an underscore). -However, if you prefer to have class names without the dataset prefix, skip `dataset_name` argument. +However, if you prefer to have class names without the dataset prefix, skip the `dataset_name` argument. For example: @@ -241,7 +241,7 @@ The default naming convention described above will preserve the casing of the pr in Weaviate but also requires that your input data does not have clashing property names when comparing case insensitive ie. (`caseName` == `casename`). In such case Weaviate destination will fail to create classes and report a conflict. -You can configure alternative naming convention which will lowercase all properties. The clashing properties will be merged and the classes created. 
Still if you have a document where clashing properties like: +You can configure an alternative naming convention which will lowercase all properties. The clashing properties will be merged and the classes created. Still, if you have a document where clashing properties like: ```json {"camelCase": 1, "CamelCase": 2} ``` @@ -249,7 +249,7 @@ it will be normalized to: ``` {"camelcase": 2} ``` -so your best course of action is to clean up the data yourself before loading and use default naming convention. Nevertheless you can configure the alternative in `config.toml`: +so your best course of action is to clean up the data yourself before loading and use the default naming convention. Nevertheless, you can configure the alternative in `config.toml`: ```toml [schema] naming="dlt.destinations.weaviate.impl.ci_naming" @@ -291,12 +291,12 @@ Below is an example that configures the **contextionary** vectorizer. You can pu vectorizer="text2vec-contextionary" module_config={text2vec-contextionary = { vectorizeClassName = false, vectorizePropertyName = true}} ``` -You can find docker composer with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md) +You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md) ### dbt support -Currently Weaviate destination does not support dbt. +Currently, Weaviate destination does not support dbt. ### Syncing of `dlt` state @@ -304,4 +304,4 @@ Weaviate destination supports syncing of the `dlt` state. - \ No newline at end of file + From 66042897adf317a35073462f71173679b8802e52 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 11 Mar 2024 15:07:42 +0300 Subject: [PATCH 03/27] Fix grammar in docs: batch 2 (#1075) --- .../file-formats/insert-format.md | 25 +++++------ .../docs/dlt-ecosystem/file-formats/jsonl.md | 18 ++++---- .../dlt-ecosystem/file-formats/parquet.md | 41 +++++++----------- .../dlt-ecosystem/transformations/dbt/dbt.md | 24 +++++------ .../transformations/dbt/dbt_cloud.md | 26 ++++++------ .../dlt-ecosystem/transformations/pandas.md | 18 ++++---- .../docs/dlt-ecosystem/transformations/sql.md | 12 +++--- .../verified-sources/amazon_kinesis.md | 12 +++--- .../verified-sources/google_analytics.md | 26 ++++++------ .../dlt-ecosystem/verified-sources/inbox.md | 40 +++++++++--------- .../dlt-ecosystem/verified-sources/jira.md | 28 ++++++------- .../dlt-ecosystem/verified-sources/kafka.md | 42 +++++++++---------- .../verified-sources/pipedrive.md | 10 ++--- .../dlt-ecosystem/verified-sources/slack.md | 26 ++++++------ 14 files changed, 167 insertions(+), 181 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md index a6d9fe78b6..ff73e3741e 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md @@ -4,29 +4,26 @@ description: The INSERT file format keywords: [insert values, file formats] --- -# SQL INSERT file format +# SQL INSERT File Format -This file format contains an INSERT...VALUES statement to be executed on the destination during the -`load` stage. +This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage. 
Additional data types are stored as follows: -- `datetime` and `date` as ISO strings; -- `decimal` as text representation of decimal number; -- `binary` depends on the format accepted by the destination; -- `complex` depends on the format accepted by the destination. +- `datetime` and `date` are stored as ISO strings; +- `decimal` is stored as a text representation of a decimal number; +- `binary` storage depends on the format accepted by the destination; +- `complex` storage also depends on the format accepted by the destination. -This file format is -[compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. +This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported destinations +## Supported Destinations -Used by default by: **DuckDB**, **Postgres**, **Redshift**. +This format is used by default by: **DuckDB**, **Postgres**, **Redshift**. -Supported by: **filesystem**. +It is also supported by: **filesystem**. -By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline -will store your data in the INSERT format to the destination: +By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline will store your data in the INSERT format at the destination: ```python info = pipeline.run(some_source(), loader_file_format="insert_values") diff --git a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md index 34f636f88d..130464578e 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md @@ -4,28 +4,28 @@ description: The jsonl file format keywords: [jsonl, file formats] --- -# jsonl - JSON delimited +# jsonl - JSON Delimited -`JSON delimited` is a file format that stores several `JSON` documents in one file. The `JSON` +JSON Delimited is a file format that stores several JSON documents in one file. The JSON documents are separated by a new line. Additional data types are stored as follows: -- `datetime` and `date` as ISO strings; -- `decimal` as text representation of decimal number; -- `binary` is base64 encoded string; -- `HexBytes` is hex encoded string; +- `datetime` and `date` are stored as ISO strings; +- `decimal` is stored as a text representation of a decimal number; +- `binary` is stored as a base64 encoded string; +- `HexBytes` is stored as a hex encoded string; - `complex` is serialized as a string. This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported destinations +## Supported Destinations -Used by default by: **BigQuery**, **Snowflake**, **filesystem**. +This format is used by default by: **BigQuery**, **Snowflake**, **filesystem**. 
By setting the `loader_file_format` argument to `jsonl` in the run command, the pipeline will store -your data in the jsonl format to the destination: +your data in the jsonl format at the destination: ```python info = pipeline.run(some_source(), loader_file_format="jsonl") diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 4b0f63d22b..cc2fcfb200 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -4,52 +4,41 @@ description: The parquet file format keywords: [parquet, file formats] --- -# Parquet file format +# Parquet File Format -[Apache Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a free and open-source -column-oriented data storage format in the Apache Hadoop ecosystem. `dlt` is able to store data in -this format when configured to do so. +[Apache Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem. `dlt` is capable of storing data in this format when configured to do so. -To use this format you need a `pyarrow` package. You can get this package as a `dlt` extra as well: +To use this format, you need a `pyarrow` package. You can get this package as a `dlt` extra as well: ```sh pip install dlt[parquet] ``` -## Supported destinations +## Supported Destinations Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **filesystem**, **Athena** -By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will -store your data in the parquet format to the destination: +By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will store your data in the parquet format at the destination: ```python info = pipeline.run(some_source(), loader_file_format="parquet") ``` ## Destination AutoConfig -`dlt` uses [destination capabilities](../../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) to configure parquet writer: -* uses decimal and wei precision to pick the right **decimal type** and sets precision and scale -* uses timestamp precision to pick right **timestamp type** resolution (seconds, micro or nano) +`dlt` uses [destination capabilities](../../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) to configure the parquet writer: +* It uses decimal and wei precision to pick the right **decimal type** and sets precision and scale. +* It uses timestamp precision to pick the right **timestamp type** resolution (seconds, micro, or nano). ## Options -Under the hood `dlt` uses the -[pyarrow parquet writer](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) -to create the files. The following options can be used to change the behavior of the writer: +Under the hood, `dlt` uses the [pyarrow parquet writer](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to create the files. The following options can be used to change the behavior of the writer: -- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. - Defaults to "spark". -- `version`: Determine which Parquet logical types are available for use, whether the reduced set - from the Parquet 1.x.x format or the expanded logical types added in later format versions. - Defaults to "2.4". 
-- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a - column chunk (in bytes). Defaults to "1048576". -- `timestamp_timezone`: A string specifying timezone, default is UTC +- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. Defaults to "spark". +- `version`: Determine which Parquet logical types are available for use, whether the reduced set from the Parquet 1.x.x format or the expanded logical types added in later format versions. Defaults to "2.4". +- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Defaults to "1048576". +- `timestamp_timezone`: A string specifying timezone, default is UTC. -Read the -[pyarrow parquet docs](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) -to learn more about these settings. +Read the [pyarrow parquet docs](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to learn more about these settings. Example: @@ -62,7 +51,7 @@ data_page_size=1048576 timestamp_timezone="Europe/Berlin" ``` -or using environment variables: +Or using environment variables: ``` NORMALIZE__DATA_WRITER__FLAVOR diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index b2b6b27fc3..1cf7a91bfb 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -6,30 +6,30 @@ keywords: [transform, dbt, runner] # Transform the data with dbt -[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows simple structuring of your transformations into DAGs. The benefits of +[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include: - End-to-end cross-db compatibility for dltā†’dbt pipelines. -- Easy to use by SQL analysts, low learning curve. -- Highly flexible and configurable in usage, supports templating, can run backfills etc. -- Supports testing and accelerates troubleshooting. +- Ease of use by SQL analysts, with a low learning curve. +- High flexibility and configurability in usage, supports templating, can run backfills, etc. +- Support for testing and accelerated troubleshooting. ## dbt runner in dlt You can run dbt with `dlt` by using the dbt runner. -The dbt runner +The dbt runner: -- can create a virtual env for dbt on the fly; -- can run a dbt package from online (e.g. GitHub) or from local files; -- passes configuration and credentials to dbt, so you do not need to handle them separately from +- Can create a virtual env for dbt on the fly; +- Can run a dbt package from online sources (e.g., GitHub) or from local files; +- Passes configuration and credentials to dbt, so you do not need to handle them separately from `dlt`, enabling dbt to configure on the fly. ## How to use the dbt runner For an example of how to use the dbt runner, see the [jaffle shop example](https://github.com/dlt-hub/dlt/blob/devel/docs/examples/archive/dbt_run_jaffle.py). -Included below in another example where we run a `dlt` pipeline and then a dbt package via `dlt`: +Included below is another example where we run a `dlt` pipeline and then a dbt package via `dlt`: > šŸ’” Docstrings are available to read in your IDE. 
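The "another example" mentioned above lives in the unchanged part of dbt.md, so it does not appear in this hunk. As a rough sketch only (the package URL and sample data are placeholders, not taken from this patch), such a dltā†’dbt flow could look like this:

```python
import dlt

# load some data with a dlt pipeline first
pipeline = dlt.pipeline(
    pipeline_name='chess_pipeline',
    destination='duckdb',
    dataset_name='chess_data'
)
pipeline.run([{"player": "magnus", "rating": 2859}], table_name="players")

# get a virtual environment with dbt installed and run a dbt package
# against the same destination the pipeline loaded into
venv = dlt.dbt.get_venv(pipeline)
dbt = dlt.dbt.package(pipeline, "https://github.com/dbt-labs/jaffle_shop.git", venv=venv)
models = dbt.run_all()
for m in models:
    print(f"Model {m.model_name} materialized in {m.time} with status {m.status}")
```

As the bullet list above notes, the runner passes the pipeline's configuration and credentials to dbt, so no separate profile setup is needed in this mode.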
@@ -81,7 +81,7 @@ for m in models: ``` ## How to run dbt runner without pipeline -You can use dbt runner without dlt pipeline. Example below will clone and run **jaffle shop** using a dbt profile that you supply. +You can use the dbt runner without a dlt pipeline. The example below will clone and run **jaffle shop** using a dbt profile that you supply. It assumes that dbt is installed in the current Python environment and the `profile.yml` is in the same folder as the Python script. ```py @@ -102,7 +102,7 @@ models = runner.run_all() ``` -Here's example **duckdb** profile +Here's an example **duckdb** profile ```yaml config: # do not track usage, do not create .user.yml @@ -128,4 +128,4 @@ If you want to transform the data before loading, you can use Python. If you wan data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) -1. [Pandas.](../pandas.md) +2. [Pandas.](../pandas.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md index 1f658e4f95..43321aab97 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md @@ -1,5 +1,5 @@ --- -title: Transforming the data with dbt Cloud +title: Transforming the Data with dbt Cloud description: Transforming the data loaded by a dlt pipeline with dbt Cloud keywords: [transform, sql] --- @@ -9,7 +9,7 @@ keywords: [transform, sql] ## API Client The DBT Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). -It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run status. +It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run statuses. ```python from dlt.helpers.dbt_cloud import DBTCloudClientV2 @@ -26,7 +26,7 @@ run_status = client.get_run_status(run_id=job_run_id) print(f"Job run status: {run_status['status_humanized']}") ``` -## Helper functions +## Helper Functions These Python functions provide an interface to interact with the dbt Cloud API. They simplify the process of triggering and monitoring job runs in dbt Cloud. @@ -53,7 +53,7 @@ status = run_dbt_cloud_job(job_id=1234, data=additional_data, wait_for_outcome=T ### `get_dbt_cloud_run_status()` -If you have already started job run and have a run ID, then you can use the `get_dbt_cloud_run_status` function. +If you have already started a job run and have a run ID, then you can use the `get_dbt_cloud_run_status` function. This function retrieves the full information about a specific dbt Cloud job run. It also supports options for waiting until the run is complete. @@ -65,7 +65,7 @@ from dlt.helpers.dbt_cloud import get_dbt_cloud_run_status status = get_dbt_cloud_run_status(run_id=1234, wait_for_outcome=True) ``` -## Set credentials +## Set Credentials ### secrets.toml @@ -74,27 +74,27 @@ When using a dlt locally, we recommend using the `.dlt/secrets.toml` method to s If you used the `dlt init` command, then the `.dlt` folder has already been created. Otherwise, create a `.dlt` folder in your working directory and a `secrets.toml` file inside it. -It's where you store sensitive information securely, like access tokens. Keep this file safe. +This is where you store sensitive information securely, like access tokens. Keep this file safe. 
Use the following format for dbt Cloud API authentication: ```toml [dbt_cloud] api_token = "set me up!" # required for authentication -account_id = "set me up!" # required for both helpers function -job_id = "set me up!" # optional only for run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) -run_id = "set me up!" # optional for get_dbt_cloud_run_status (you can pass this explicitly as an argument to the function) +account_id = "set me up!" # required for both helper functions +job_id = "set me up!" # optional only for the run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) +run_id = "set me up!" # optional for the get_dbt_cloud_run_status function (you can pass this explicitly as an argument to the function) ``` -### Environment variables +### Environment Variables -`dlt` supports reading credentials from environment. +`dlt` supports reading credentials from the environment. If dlt tries to read this from environment variables, it will use a different naming convention. -For environment variables all names are capitalized and sections are separated with double underscore "__". +For environment variables, all names are capitalized and sections are separated with a double underscore "__". -For example, for the above secrets, we would need to put into environment: +For example, for the above secrets, we would need to put into the environment: ``` DBT_CLOUD__API_TOKEN diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index 6ab98090ba..dc2fc6d40a 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -4,12 +4,12 @@ description: Transform the data loaded by a dlt pipeline with Pandas keywords: [transform, pandas] --- -# Transform the data with Pandas +# Transform the Data with Pandas -You can fetch results of any SQL query as a dataframe. If the destination is supporting that -natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading -dataframes may be really fast! The example below reads GitHub reactions data from the `issues` table and -counts reaction types. +You can fetch the results of any SQL query as a dataframe. If the destination supports that +natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading +dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and +counts the reaction types. ```python pipeline = dlt.pipeline( @@ -27,15 +27,15 @@ with pipeline.sql_client() as client: counts = reactions.sum(0).sort_values(0, ascending=False) ``` -The `df` method above returns all the data in the cursor as data frame. You can also fetch data in -chunks by passing `chunk_size` argument to the `df` method. +The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in +chunks by passing the `chunk_size` argument to the `df` method. Once your data is in a Pandas dataframe, you can transform it as needed. -## Other transforming tools +## Other Transforming Tools If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use Pandas or one of the following: 1. [dbt.](dbt/dbt.md) (recommended) -1. [`dlt` SQL client.](sql.md) +2. 
[`dlt` SQL client.](sql.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index cc1576229b..6131cac85a 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -6,10 +6,10 @@ keywords: [transform, sql] # Transform the data using the `dlt` SQL client -A simple alternative to dbt is to query the data using the `dlt` SQL client and then performing the +A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the transformations using Python. The `execute_sql` method allows you to execute any SQL statement, -including statements that change the database schema or data in the tables. In the example below we -insert a row into `customers` table. Note that the syntax is the same as for any standard `dbapi` +including statements that change the database schema or data in the tables. In the example below, we +insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi` connection. ```python @@ -24,7 +24,7 @@ try: ) ``` -In the case of SELECT queries, the data is returned as a list of row, with the elements of a row +In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row corresponding to selected columns. ```python @@ -34,7 +34,7 @@ try: "SELECT id, name, email FROM customers WHERE id = %s", 10 ) - # prints columns values of first row + # prints column values of the first row print(res[0]) ``` @@ -44,4 +44,4 @@ If you want to transform the data before loading, you can use Python. If you wan data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -1. [Pandas.](pandas.md) +2. [Pandas.](pandas.md) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 2fb97ff320..4118902a6c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -9,7 +9,7 @@ keywords: [amazon kinesis, verified source] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Amazon Kinesis](https://docs.aws.amazon.com/streams/latest/dev/key-concepts.html) is a cloud-based @@ -36,7 +36,7 @@ You can check out our pipeline example ### Grab credentials -To use this verified source you need AWS `Access key` and `Secret access key`, that can be obtained +To use this verified source, you need an AWS `Access key` and `Secret access key`, which can be obtained as follows: 1. Sign in to your AWS Management Console. @@ -122,7 +122,7 @@ For more information, read [Credentials](../../general-usage/credentials). ```bash dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`, you may + For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`. You may also use any custom name instead. 
For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) @@ -178,7 +178,7 @@ def kinesis_stream( You create a resource `kinesis_stream` by passing the stream name and a few other options. The resource will have the same name as the stream. When you iterate this resource (or pass it to -`pipeline.run` records) it will query Kinesis for all the shards in the requested stream. For each +`pipeline.run` records), it will query Kinesis for all the shards in the requested stream. For each shard, it will create an iterator to read messages: 1. If `initial_at_timestamp` is present, the resource will read all messages after this timestamp. @@ -192,7 +192,7 @@ will load messages incrementally: 1. For shards that didn't have messages (or new shards), the last run time is used to get messages. Please check the `kinesis_stream` [docstring](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis/__init__.py#L31-L46) -for additional options, i.e. to limit the number of messages +for additional options, i.e., to limit the number of messages returned or to automatically parse JSON messages. ### Kinesis message format @@ -220,7 +220,7 @@ verified source. ) ``` -1. To load messages from a stream from last one hour: +1. To load messages from a stream from the last one hour: ```python # the resource below will take its name from the stream name, diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index 02d7803a9b..b6a3a0a5a8 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -12,7 +12,7 @@ or application. This Google Analytics `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_analytics_pipeline.py) -loads data using ā€œGoogle Analytics APIā€ to the destination of your choice. +loads data using the "Google Analytics API" to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -29,7 +29,7 @@ Sources and resources that can be loaded using this verified source are: There are two methods to get authenticated for using this verified source: - OAuth credentials -- Service account credential +- Service account credentials Let's go over how to set up both OAuth tokens and service account credentials. In general, OAuth tokens are preferred when user consent is required, while service account credentials are better @@ -39,14 +39,14 @@ requirement. ### Grab Google service account credentials You need to create a GCP service account to get API credentials if you don't have one. To create - one, follow these steps: +one, follow these steps: 1. Sign in to [console.cloud.google.com](http://console.cloud.google.com/). 1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#creating) if needed. -1. Enable "Google Analytics API", refer +1. Enable the "Google Analytics API". Refer to the [Google documentation](https://support.google.com/googleapi/answer/6158841?hl=en) for comprehensive instructions on this process. @@ -58,7 +58,7 @@ You need to create a GCP service account to get API credentials if you don't hav 1. Create a new JSON key by selecting "Manage Keys" > "ADD KEY" > "CREATE". 1. You can download the ".json" file containing the necessary credentials for future use. 
-### Grab google OAuth credentials +### Grab Google OAuth credentials You need to create a GCP account to get OAuth credentials if you don't have one. To create one, follow these steps: @@ -69,17 +69,17 @@ follow these steps: 1. Enable the Analytics API in the project. -1. Search credentials in the search bar and go to Credentials. +1. Search for credentials in the search bar and go to Credentials. 1. Go to Credentials -> OAuth client ID -> Select Desktop App from the Application type and give an appropriate name. -1. Download the credentials and fill "client_id", "client_secret" and "project_id" in +1. Download the credentials and fill in "client_id", "client_secret", and "project_id" in "secrets.toml". 1. Go back to credentials and select the OAuth consent screen on the left. -1. Fill in the App name, user support email(your email), authorized domain (localhost.com), and dev +1. Fill in the App name, user support email (your email), authorized domain (localhost.com), and dev contact info (your email again). 1. Add the following scope: @@ -90,7 +90,7 @@ follow these steps: 1. Add your email as a test user. -After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". To generate the +After configuring "client_id", "client_secret", and "project_id" in "secrets.toml", to generate the refresh token, run the following script from the root folder: ```bash @@ -239,7 +239,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `simple_load` This function returns a list of resources including metadata, metrics, and dimensions data from -Google Analytics API. +the Google Analytics API. ```python @dlt.source(max_table_nesting=2) @@ -293,7 +293,7 @@ def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: `metadata`: GA4 metadata is stored in this "Metadata" class object. -Similarly, there is a transformer function called `dimensions_table` that populates table called +Similarly, there is a transformer function called `dimensions_table` that populates a table called "dimensions" with the data from each dimension. ## Customization @@ -330,7 +330,7 @@ verified source. ```python load_data = google_analytics(start_date='2023-01-01') - load_info = pipeline.run(load_data). + load_info = pipeline.run(load_data) print(load_info) ``` @@ -349,4 +349,4 @@ verified source. - [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks) - [Load data from Google Analytics to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) - [Load data from Google Analytics to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-athena) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index 2aa1d1130f..75106df609 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -9,14 +9,14 @@ keywords: [inbox, inbox verified source, inbox mail, email] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. 
+or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: This source collects inbox emails, retrieves attachments, and stores relevant email data. It uses the imaplib library for IMAP interactions and the dlt library for data processing. This Inbox `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/inbox_pipeline.py) -loads data using ā€œInboxā€ verified source to the destination of your choice. +load data using the ā€œInboxā€ verified source to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -36,14 +36,14 @@ Sources and resources that can be loaded using this verified source are: - "email_account": Associated email account name (e.g. dlthub@dlthub.com). - "password": APP password (for third-party clients) from the email provider. -1. Host addresses and APP password procedures vary by provider and can be found via a quick Google search. For Google Mail's app password, read [here](https://support.google.com/mail/answer/185833?hl=en#:~:text=An%20app%20password%20is%20a,2%2DStep%20Verification%20turned%20on). +2. Host addresses and APP password procedures vary by provider and can be found via a quick Google search. For Google Mail's app password, read [here](https://support.google.com/mail/answer/185833?hl=en#:~:text=An%20app%20password%20is%20a,2%2DStep%20Verification%20turned%20on). -1. However, this guide covers Gmail inbox configuration; similar steps apply to other providers. +3. However, this guide covers Gmail inbox configuration; similar steps apply to other providers. ### Accessing Gmail Inbox 1. SMTP server DNS: 'imap.gmail.com' for Gmail. -1. Port: 993 (for internet messaging access protocol over TLS/SSL). +2. Port: 993 (for internet messaging access protocol over TLS/SSL). ### Grab App password for Gmail @@ -52,12 +52,12 @@ Sources and resources that can be loaded using this verified source are: #### Steps to Create and Use App Passwords: 1. Visit your Google Account > Security. -1. Under "How you sign in to Google", enable 2-Step Verification. -1. Choose App passwords at the bottom. -1. Name the device for reference. -1. Click Generate. -1. Input the generated 16-character app password as prompted. -1. Click Done. +2. Under "How you sign in to Google", enable 2-Step Verification. +3. Choose App passwords at the bottom. +4. Name the device for reference. +5. Click Generate. +6. Input the generated 16-character app password as prompted. +7. Click Done. Read more in [this article](https://pythoncircle.com/post/727/accessing-gmail-inbox-using-python-imaplib-module/) or [Google official documentation.](https://support.google.com/mail/answer/185833#zippy=%2Cwhy-you-may-need-an-app-password) @@ -76,10 +76,10 @@ To get started with your data pipeline, follow these steps: with Inbox as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and +3. After running this command, a new directory will be created with the necessary files and configuration settings to get started. 
For more information, read the @@ -100,11 +100,11 @@ For more information, read the password = "Please set me up!" # # APP Password for the above email account. ``` -1. Replace the host, email and password value with the [previously copied one](#grab-credentials) +2. Replace the host, email, and password value with the [previously copied one](#grab-credentials) to ensure secure access to your Inbox resources. > When adding the App Password, remove any spaces. For instance, "abcd efgh ijkl mnop" should be "abcdefghijklmnop". -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to +3. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. @@ -126,7 +126,7 @@ For more information, read the For pdf parsing: - PyPDF2: `pip install PyPDF2` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +2. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```bash dlt pipeline show @@ -246,7 +246,7 @@ verified source. To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load messages from "mycreditcard@bank.com" starting "2023-10-1": +2. To load messages from "mycreditcard@bank.com" starting "2023-10-1": - Set `DEFAULT_START_DATE = pendulum.datetime(2023, 10, 1)` in `./inbox/settings.py`. - Use the following code: @@ -261,7 +261,7 @@ verified source. print(load_info) ``` > Please refer to inbox_source() docstring for email filtering options by sender, date, or mime type. -1. To load messages from multiple emails, including "community@dlthub.com": +3. To load messages from multiple emails, including "community@dlthub.com": ```python messages = inbox_source( @@ -269,7 +269,7 @@ verified source. ).messages ``` -1. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. +4. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. Using the `pdf_to_text` function to load parsed pdfs from mail to the database: ```python @@ -285,4 +285,4 @@ verified source. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index 4588f4f4c6..c796014835 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -3,7 +3,7 @@ :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Jira](https://www.atlassian.com/software/jira) by Atlassian helps teams manage projects and tasks @@ -11,16 +11,16 @@ efficiently, prioritize work, and collaborate. This Jira `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira_pipeline.py) -loads data using Jira API to the destination of your choice. +loads data using the Jira API to the destination of your choice. 
The endpoints that this verified source supports are: | Name | Description | | --------- | ---------------------------------------------------------------------------------------- | -| issues | individual pieces of work to be completed | -| users | administrator of a given project | -| workflows | the key aspect of managing and tracking the progress of issues or tasks within a project | -| projects | a collection of tasks that need to be completed to achieve a certain outcome | +| issues | Individual pieces of work to be completed | +| users | Administrators of a given project | +| workflows | The key aspect of managing and tracking the progress of issues or tasks within a project | +| projects | A collection of tasks that need to be completed to achieve a certain outcome | To get a complete list of sub-endpoints that can be loaded, see [jira/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira/settings.py) @@ -96,7 +96,7 @@ For more information, read the guide on [how to add a verified source](../../wal add credentials for your chosen destination, ensuring proper routing of your data to the final destination. -For more information, read the [General Usage: Credentials.](../../general-usage/credentials) +For more information, read [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline @@ -114,7 +114,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```bash dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`, you may also + For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -145,7 +145,7 @@ def jira( - `subdomain`: The subdomain of the Jira account. Configured in ".dlt/secrets.toml". - `email`: The email associated with the Jira account. Configured in ".dlt/secrets.toml". -- `api_token`: The API token for accessing the Jira account.Configured in ".dlt/secrets.toml". +- `api_token`: The API token for accessing the Jira account. Configured in ".dlt/secrets.toml". ### Source `jira_search` @@ -161,8 +161,8 @@ def jira_search( ) -> Iterable[DltResource]: ``` -The above function uses the same arguments `subdomain`, `email` and `api_token` as described above -for [jira source](jira.md#source-jira). +The above function uses the same arguments `subdomain`, `email`, and `api_token` as described above +for the [jira source](jira.md#source-jira). ### Resource `issues` @@ -179,7 +179,7 @@ def issues(jql_queries: List[str]) -> Iterable[TDataItem]: ## Customization ### Create your own pipeline -If you wish to create your own pipelines you can leverage source and resource methods as discussed +If you wish to create your own pipelines, you can leverage source and resource methods as discussed above. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset. To read more @@ -202,7 +202,7 @@ above. print(f"Load Information: {load_info}") ``` -3. To load the custom issues using JQL queries, you can use custom queries, here is an example +3. To load the custom issues using JQL queries, you can use custom queries. Here is an example below: ```python @@ -218,4 +218,4 @@ above. 
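    # illustrative sketch only: the original example sits in the unchanged part of
    # this file and is not shown in this hunk; the JQL strings below are hypothetical
    queries = [
        "created >= -30d order by created DESC",
    ]
    load_data = jira_search().issues(jql_queries=queries)
    load_info = pipeline.run(load_data, write_disposition="replace")
    print(f"Load Information: {load_info}")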
``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index 694a81ba1f..5bff03e357 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -9,13 +9,13 @@ keywords: [kafka api, kafka verified source, kafka] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://join.slack.com/t/dlthub-community/shared_invite/zt-1n5193dbq-rCBmJ6p~ckpSFK4hCF2dYA) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Kafka](https://www.confluent.io/) is an open-source distributed event streaming platform, organized in the form of a log with message publishers and subscribers. -The Kafka `dlt` verified source loads data using Confluent Kafka API to the destination of your choice, -see a [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py). +The Kafka `dlt` verified source loads data using the Confluent Kafka API to the destination of your choice. +See a [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py). The resource that can be loaded: @@ -29,7 +29,7 @@ The resource that can be loaded: 1. Follow the [Kafka Setup](https://developer.confluent.io/get-started/python/#kafka-setup) to tweak a project. -1. Follow the [Configuration](https://developer.confluent.io/get-started/python/#configuration) to +2. Follow the [Configuration](https://developer.confluent.io/get-started/python/#configuration) to get the project credentials. ### Initialize the verified source @@ -47,10 +47,10 @@ To get started with your data pipeline, follow these steps: with Kafka as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and +3. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read the @@ -84,13 +84,13 @@ sasl_password="example_secret" pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```bash python kafka_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```bash @@ -122,25 +122,25 @@ def kafka_consumer( `topics`: A list of Kafka topics to be extracted. -`credentials`: By default, is initialized with the data from -the `secrets.toml`. May be used explicitly to pass an initialized +`credentials`: By default, it is initialized with the data from +the `secrets.toml`. It may be used explicitly to pass an initialized Kafka Consumer object. 
-`msg_processor`: A function, which'll be used to process every message +`msg_processor`: A function, which will be used to process every message read from the given topics before saving them in the destination. -Can be used explicitly to pass a custom processor. See the +It can be used explicitly to pass a custom processor. See the [default processor](https://github.com/dlt-hub/verified-sources/blob/fe8ed7abd965d9a0ca76d100551e7b64a0b95744/sources/kafka/helpers.py#L14-L50) as an example of how to implement processors. -`batch_size`: The amount of messages to extract from the cluster -at once. Can be set to tweak performance. +`batch_size`: The number of messages to extract from the cluster +at once. It can be set to tweak performance. `batch_timeout`: The maximum timeout for a single batch reading -operation. Can be set to tweak performance. +operation. It can be set to tweak performance. -`start_from`: A timestamp, starting with which the messages must +`start_from`: A timestamp, starting from which the messages must be read. When passed, `dlt` asks the Kafka cluster for an offset, -actual for the given timestamp, and starts to read messages from +which is actual for the given timestamp, and starts to read messages from this offset. @@ -159,7 +159,7 @@ this offset. ) ``` -1. To extract several topics: +2. To extract several topics: ```python topics = ["topic1", "topic2", "topic3"] @@ -168,7 +168,7 @@ this offset. pipeline.run(source, write_disposition="replace") ``` -1. To extract messages and process them in a custom way: +3. To extract messages and process them in a custom way: ```python def custom_msg_processor(msg: confluent_kafka.Message) -> Dict[str, Any]: @@ -185,7 +185,7 @@ this offset. pipeline.run(data) ``` -1. To extract messages, starting from a timestamp: +4. To extract messages, starting from a timestamp: ```python data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15)) @@ -193,4 +193,4 @@ this offset. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 9d1a5a0a02..17907c9467 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -9,7 +9,7 @@ keywords: [pipedrive api, pipedrive verified source, pipedrive] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Pipedrive](https://developers.pipedrive.com/docs/api/v1) is a cloud-based sales Customer @@ -18,7 +18,7 @@ communication, and automate sales processes. This Pipedrive `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive_pipeline.py) -loads data using ā€œPipedrive APIā€ to the destination of your choice. +load data using the ā€œPipedrive APIā€ to the destination of your choice. 
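For a quick orientation, a minimal sketch of running this source end to end (assuming it was added with `dlt init pipedrive duckdb` and the API token is set in `.dlt/secrets.toml`; pipeline and dataset names below are illustrative):

```python
import dlt
from pipedrive import pipedrive_source  # module created by `dlt init pipedrive duckdb`

# create a pipeline writing to a local duckdb file
pipeline = dlt.pipeline(
    pipeline_name="pipedrive", destination="duckdb", dataset_name="pipedrive_data"
)
# load the default set of Pipedrive entities
load_info = pipeline.run(pipedrive_source())
print(load_info)
```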
Sources and resources that can be loaded using this verified source are: @@ -105,7 +105,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```bash dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `pipedrive`, you may also use + For example, the `pipeline_name` for the above pipeline example is `pipedrive`, but you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -151,7 +151,7 @@ def pipedrive_source( `since_timestamp`: Starting timestamp for incremental loading. By default, complete history is loaded on the first run. And new data in subsequent runs. -> Note: Incremental loading can be enabled or disabled depending on user prefrences. +> Note: Incremental loading can be enabled or disabled depending on user preferences. ### Resource `iterator RECENTS_ENTITIES` @@ -293,4 +293,4 @@ verified source. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 85fd3f2a3a..647e39a427 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -9,24 +9,24 @@ keywords: [slack api, slack verified source, slack] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Slack](https://slack.com/) is a popular messaging and collaboration platform for teams and organizations. This Slack `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/slack_pipeline.py) -loads data using ā€œSlack APIā€ to the destination of your choice. +load data using the ā€œSlack APIā€ to the destination of your choice. Sources and resources that can be loaded using this verified source are: | Name | Description | |-----------------------|------------------------------------------------------------------------------------| -| slack | Retrives all the Slack data: channels, messages for selected channels, users, logs | -| channels | Retrives all the channels data | -| users | Retrives all the users info | -| get_messages_resource | Retrives all the messages for a given channel | -| access_logs | Retrives the access logs | +| slack | Retrieves all the Slack data: channels, messages for selected channels, users, logs | +| channels | Retrieves all the channels data | +| users | Retrieves all the users info | +| get_messages_resource | Retrieves all the messages for a given channel | +| access_logs | Retrieves the access logs | ## Setup Guide @@ -96,7 +96,7 @@ For more information, read the guide on [how to add a verified source](../../wal access_token = "Please set me up!" # please set me up! ``` -1. Copy the user Oauth token you [copied above](#grab-user-oauth-token). +1. Copy the user OAuth token you [copied above](#grab-user-oauth-token). 1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). @@ -161,7 +161,7 @@ def slack_source( ### Resource `channels` -This function yields all the channels data as `dlt` resource. +This function yields all the channels data as a `dlt` resource. 
```python @dlt.resource(name="channels", primary_key="id", write_disposition="replace") @@ -170,7 +170,7 @@ def channels_resource() -> Iterable[TDataItem]: ### Resource `users` -This function yields all the users data as `dlt` resource. +This function yields all the users data as a `dlt` resource. ```python @dlt.resource(name="users", primary_key="id", write_disposition="replace") @@ -179,7 +179,7 @@ def users_resource() -> Iterable[TDataItem]: ### Resource `get_messages_resource` -This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with channel's name. +This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with the channel's name. ```python def get_messages_resource( @@ -285,10 +285,10 @@ verified source. start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8), ) - # It loads only massages from the channel "general". + # It loads only messages from the channel "general". load_info = pipeline.run(source.with_resources("general")) print(load_info) ``` - \ No newline at end of file + From 5275449093afc5c1992f815af547ad2afb7b388c Mon Sep 17 00:00:00 2001 From: Sultan Iman <354868+sultaniman@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:57:28 +0100 Subject: [PATCH 04/27] Docs: Add note about google secret name normalization (#1056) * Add note about google secret name normalization * Change space -> whitespace --- .../general-usage/credentials/config_providers.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index c0dc459da0..860370d38a 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -99,6 +99,19 @@ the `private_key` for Google credentials. It will look 1. first in env variable `MY_SECTION__GCP_CREDENTIALS__PRIVATE_KEY` and if not found, 1. in `secrets.toml` with key `my_section.gcp_credentials.private_key`. + +:::info +While using Google secrets provider please make sure your pipeline name +contains no whitespace or any other punctuation characters except "-" and "_". + +Per Google the secret name can contain + + 1. Uppercase and lowercase letters, + 2. Numerals, + 3. Hyphens, + 4. Underscores. +::: + ### Environment provider Looks for the values in the environment variables. From a0e83b70767b6fb3efcd13f8ca181e5176e255cf Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 12 Mar 2024 18:26:04 +0100 Subject: [PATCH 05/27] validates class instances in typed dict (#1082) --- dlt/common/validation.py | 23 +++++++++++++++++++---- tests/common/test_validation.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 6bf1356aeb..4b54d6a29e 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -1,5 +1,6 @@ import contextlib import functools +import inspect from typing import Callable, Any, Type from typing_extensions import get_type_hints, get_args @@ -38,11 +39,10 @@ def validate_dict( filter_f (TFilterFunc, optional): A function to filter keys in `doc`. It should return `True` for keys to be kept. Defaults to a function that keeps all keys. validator_f (TCustomValidator, optional): A function to perform additional validation - for types not covered by this function. 
It should return `True` if the validation passes. + for types not covered by this function. It should return `True` if the validation passes + or raise DictValidationException on validation error. For types it cannot validate, it + should return False to allow chaining. Defaults to a function that rejects all such types. - filter_required (TFilterFunc, optional): A function to filter out required fields, useful - for testing historic versions of dict that might now have certain fields yet. - Raises: DictValidationException: If there are missing required fields, unexpected fields, type mismatches or unvalidated types in `doc` compared to `spec`. @@ -162,8 +162,23 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: elif t is Any: # pass everything with any type pass + elif inspect.isclass(t) and isinstance(pv, t): + # allow instances of classes + pass else: + type_name = getattr(t, "__name__", str(t)) + pv_type_name = getattr(type(pv), "__name__", str(type(pv))) + # try to apply special validator if not validator_f(path, pk, pv, t): + # type `t` cannot be validated by validator_f + if inspect.isclass(t): + if not isinstance(pv, t): + raise DictValidationException( + f"In {path}: field {pk} expect class {type_name} but got instance of" + f" {pv_type_name}", + path, + pk, + ) # TODO: when Python 3.9 and earlier support is # dropped, just __name__ can be used type_name = getattr(t, "__name__", str(t)) diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index f7773fb89c..3297df1038 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -3,6 +3,7 @@ import yaml from typing import Callable, List, Literal, Mapping, Sequence, TypedDict, TypeVar, Optional, Union +from dlt.common import Decimal from dlt.common.exceptions import DictValidationException from dlt.common.schema.typing import TStoredSchema, TColumnSchema from dlt.common.schema.utils import simple_regex_validator @@ -18,6 +19,14 @@ TTableHintTemplate = Union[TDynHintType, TFunHintTemplate[TDynHintType]] +class ClassTest: + a: str + + +class SubClassTest(ClassTest): + b: str + + class TDict(TypedDict): field: TLiteral @@ -41,6 +50,7 @@ class TTestRecord(TypedDict): f_literal_optional: Optional[TLiteral] f_seq_literal: Sequence[Optional[TLiteral]] f_optional_union: Optional[Union[TLiteral, TDict]] + f_class: ClassTest TEST_COL: TColumnSchema = {"name": "col1", "data_type": "bigint", "nullable": False} @@ -70,6 +80,7 @@ class TTestRecord(TypedDict): "f_literal_optional": "dos", "f_seq_literal": ["uno", "dos", "tres"], "f_optional_union": {"field": "uno"}, + "f_class": SubClassTest(), } @@ -275,6 +286,26 @@ def f(item: Union[TDataItem, TDynHintType]) -> TDynHintType: ) +def test_class() -> None: + class TTestRecordInvalidClass(TypedDict): + prop: SubClassTest + + # prop must be SubClassTest or derive from it. 
not the case below + test_item_1 = {"prop": ClassTest()} + with pytest.raises(DictValidationException): + validate_dict(TTestRecordInvalidClass, test_item_1, path=".") + + # unions are accepted + class TTestRecordClassUnion(TypedDict): + prop: Union[SubClassTest, ClassTest] + + validate_dict(TTestRecordClassUnion, test_item_1, path=".") + + test_item_2 = {"prop": Decimal(1)} + with pytest.raises(DictValidationException): + validate_dict(TTestRecordClassUnion, test_item_2, path=".") + + # def test_union_merge() -> None: # """Overriding fields is simply illegal in TypedDict""" # class EndpointResource(TypedDict, total=False): From 8d7ca4c4315202af7f169ce19676da26dae111ca Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Tue, 12 Mar 2024 18:28:48 +0100 Subject: [PATCH 06/27] bumps for pre-release 0.4.7a0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 88e6bd9390..a81f3cd81c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.4.6" +version = "0.4.7a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] From ee5db595abdedab8fd14799b6f29bf6f8f1bb806 Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Tue, 12 Mar 2024 23:39:02 +0400 Subject: [PATCH 07/27] feat(airflow): allow re-using sources in airflow wrapper (#1080) * feat(airflow): allow re-using sources in airflow wrapper * lint fix --- dlt/helpers/airflow_helper.py | 66 ++++++++++-------- .../airflow_tests/test_airflow_wrapper.py | 67 ++++++++++++++++++- 2 files changed, 103 insertions(+), 30 deletions(-) diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 9a6616e9ea..e01cf790d2 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -1,7 +1,7 @@ import functools import os from tempfile import gettempdir -from typing import Any, Callable, List, Literal, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple from tenacity import ( retry_if_exception, @@ -103,6 +103,7 @@ def __init__( """ super().__init__(group_id=pipeline_name, **kwargs) + self._used_names: Dict[str, Any] = {} self.use_task_logger = use_task_logger self.log_progress_period = log_progress_period self.buffer_max_items = buffer_max_items @@ -132,6 +133,33 @@ def __init__( if ConfigProvidersContext in Container(): del Container()[ConfigProvidersContext] + def _task_name(self, pipeline: Pipeline, data: Any) -> str: + """Generate a task name. + + Args: + pipeline (Pipeline): The pipeline to run. + data (Any): The data to run the pipeline with. + + Returns: + str: The name of the task. 
+ """ + task_name = pipeline.pipeline_name + + if isinstance(data, DltSource): + resource_names = list(data.selected_resources.keys()) + task_name = data.name + "_" + "-".join(resource_names[:4]) + + if len(resource_names) > 4: + task_name += f"-{len(resource_names)-4}-more" + + num = self._used_names.setdefault(task_name, 0) + self._used_names[task_name] = num + 1 + + if num: + task_name += f"-{num + 1}" + + return task_name + def run( self, pipeline: Pipeline, @@ -175,7 +203,7 @@ def run( schema_contract=schema_contract, pipeline_name=pipeline_name, ) - return PythonOperator(task_id=_task_name(pipeline, data), python_callable=f, **kwargs) + return PythonOperator(task_id=self._task_name(pipeline, data), python_callable=f, **kwargs) def _run( self, @@ -363,7 +391,7 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator pipeline_name=name, ) return PythonOperator( - task_id=_task_name(pipeline, data), python_callable=f, **kwargs + task_id=self._task_name(pipeline, data), python_callable=f, **kwargs ) if decompose == "none": @@ -393,7 +421,7 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator tasks = [] sources = data.decompose("scc") - t_name = _task_name(pipeline, data) + t_name = self._task_name(pipeline, data) start = make_task(pipeline, sources[0]) # parallel tasks @@ -434,16 +462,18 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator start = make_task( pipeline, sources[0], - naming.normalize_identifier(_task_name(pipeline, sources[0])), + naming.normalize_identifier(self._task_name(pipeline, sources[0])), ) # parallel tasks for source in sources[1:]: # name pipeline the same as task - new_pipeline_name = naming.normalize_identifier(_task_name(pipeline, source)) + new_pipeline_name = naming.normalize_identifier( + self._task_name(pipeline, source) + ) tasks.append(make_task(pipeline, source, new_pipeline_name)) - t_name = _task_name(pipeline, data) + t_name = self._task_name(pipeline, data) end = DummyOperator(task_id=f"{t_name}_end") if tasks: @@ -468,25 +498,3 @@ def airflow_get_execution_dates() -> Tuple[pendulum.DateTime, Optional[pendulum. return context["data_interval_start"], context["data_interval_end"] except Exception: return None, None - - -def _task_name(pipeline: Pipeline, data: Any) -> str: - """Generate a task name. - - Args: - pipeline (Pipeline): The pipeline to run. - data (Any): The data to run the pipeline with. - - Returns: - str: The name of the task. 
- """ - task_name = pipeline.pipeline_name - - if isinstance(data, DltSource): - resource_names = list(data.selected_resources.keys()) - task_name = data.name + "_" + "-".join(resource_names[:4]) - - if len(resource_names) > 4: - task_name += f"-{len(resource_names)-4}-more" - - return task_name diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index d01330c8b2..84a30f730c 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -435,7 +435,7 @@ def dag_parallel(): for i in range(0, 3): pipeline_dag_parallel = dlt.attach( pipeline_name=snake_case.normalize_identifier( - dag_def.tasks[i].task_id.replace("pipeline_dag_parallel.", "") + dag_def.tasks[i].task_id.replace("pipeline_dag_parallel.", "")[:-2] ) ) pipeline_dag_decomposed_counts = load_table_counts( @@ -852,3 +852,68 @@ def get_task_run(dag_def: DAG, task_name: str, now: pendulum.DateTime) -> TaskIn dag_def.run(start_date=now, run_at_least_once=True) task_def = dag_def.task_dict[task_name] return TaskInstance(task=task_def, execution_date=now) + + +def test_task_already_added(): + """ + Test that the error 'Task id {id} has already been added to the DAG' + is not happening while adding two same sources. + """ + tasks_list: List[PythonOperator] = None + + @dag(schedule=None, start_date=pendulum.today(), catchup=False) + def dag_parallel(): + nonlocal tasks_list + + tasks = PipelineTasksGroup( + "test_pipeline", + local_data_folder="_storage", + wipe_local_data=False, + ) + + source = mock_data_source() + + pipe = dlt.pipeline( + pipeline_name="test_pipeline", + dataset_name="mock_data", + destination="duckdb", + credentials=os.path.join("_storage", "test_pipeline.duckdb"), + ) + task = tasks.add_run( + pipe, + source, + decompose="none", + trigger_rule="all_done", + retries=0, + provide_context=True, + )[0] + assert task.task_id == "test_pipeline.mock_data_source__r_init-_t_init_post-_t1-_t2-2-more" + + task = tasks.add_run( + pipe, + source, + decompose="none", + trigger_rule="all_done", + retries=0, + provide_context=True, + )[0] + assert ( + task.task_id == "test_pipeline.mock_data_source__r_init-_t_init_post-_t1-_t2-2-more-2" + ) + + tasks_list = tasks.add_run( + pipe, + source, + decompose="none", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) + assert ( + tasks_list[0].task_id + == "test_pipeline.mock_data_source__r_init-_t_init_post-_t1-_t2-2-more-3" + ) + + dag_def = dag_parallel() + assert len(tasks_list) == 1 + dag_def.test() From e6223001085da566449db904896f6af9ceb44e8d Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Wed, 13 Mar 2024 01:48:17 +0400 Subject: [PATCH 08/27] feat(core): drop default value for write disposition (#1057) * feat(core): drop default value for write disposition * don't use default value in apply_hints * applies default write disposition in empty apply hints --------- Co-authored-by: Marcin Rudolf --- dlt/common/schema/schema.py | 4 +++- dlt/extract/hints.py | 5 ++++- tests/extract/test_extract.py | 8 ++++++++ tests/extract/test_sources.py | 15 +++++++++++++-- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 4c81c8af72..302ac54148 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -82,7 +82,9 @@ class Schema: _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description 
_schema_tables: TSchemaTables - _settings: TSchemaSettings # schema settings to hold default hints, preferred types and other settings + _settings: ( + TSchemaSettings # schema settings to hold default hints, preferred types and other settings + ) # list of preferred types: map regex on columns into types _compiled_preferred_types: List[Tuple[REPattern, TDataType]] diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index f298e414a1..54ce00a806 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -82,6 +82,8 @@ def make_hints( ) if not table_name: new_template.pop("name") + if not write_disposition and "write_disposition" in new_template: + new_template.pop("write_disposition") # remember original columns if columns is not None: new_template["original_columns"] = columns @@ -197,10 +199,11 @@ def apply_hints( """ if not self._hints: # if there is no template yet, create and set a new one. + default_wd = None if parent_table_name else DEFAULT_WRITE_DISPOSITION t = make_hints( table_name, parent_table_name, - write_disposition, + write_disposition or default_wd, columns, primary_key, merge_key, diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 28b08c3648..b86e198988 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -89,6 +89,14 @@ def table_name_with_lambda(_range): assert "table_name_with_lambda" not in schema.tables +def test_make_hints_default() -> None: + hints = make_hints() + assert hints == {"columns": {}} + + hints = make_hints(write_disposition=None) + assert hints == {"columns": {}} + + def test_extract_hints_mark(extract_step: Extract) -> None: @dlt.resource def with_table_hints(): diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 5895c3b658..d9c73dfb20 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1326,9 +1326,11 @@ def empty_gen(): empty_r = empty() with pytest.raises(InconsistentTableTemplate): - empty_r.apply_hints(parent_table_name=lambda ev: ev["p"]) + empty_r.apply_hints(parent_table_name=lambda ev: ev["p"], write_disposition=None) - empty_r.apply_hints(table_name=lambda ev: ev["t"], parent_table_name=lambda ev: ev["p"]) + empty_r.apply_hints( + table_name=lambda ev: ev["t"], parent_table_name=lambda ev: ev["p"], write_disposition=None + ) assert empty_r._table_name_hint_fun is not None assert empty_r._table_has_other_dynamic_hints is True @@ -1360,6 +1362,15 @@ def empty_gen(): assert table["columns"]["tags"] == {"name": "tags"} +def test_resource_no_template() -> None: + empty = DltResource.from_data([1, 2, 3], name="table") + assert empty.write_disposition == "append" + assert empty.compute_table_schema()["write_disposition"] == "append" + empty.apply_hints() + assert empty.write_disposition == "append" + assert empty.compute_table_schema()["write_disposition"] == "append" + + def test_selected_pipes_with_duplicates(): def input_gen(): yield from [1, 2, 3] From 7f43e76db77a6d3f84115ed6efe4d7cee5fe8c5f Mon Sep 17 00:00:00 2001 From: David Scharf Date: Thu, 14 Mar 2024 09:43:30 +0100 Subject: [PATCH 09/27] Generic destination / sink decorator (#1065) * start sink * parquet sink prototype * some more sink implementations * finish first batch of helpers * add missing tests and fix linting * make configuratio more versatile * implement sink function progress state * move to iterator * persist sink load state in pipeline state * fix unrelated typo * move sink state storage to loadpackage state * additional pr fixes * disable 
creating empty state file on loadpackage init * add sink docs page * small changes * make loadstorage state versioned and separate out common base functions * restrict access of destinations to load package state in accessor functions * fix tests * add tests for state and new injectable context * fix linter * fix linter error * some pr fixes * more pr fixes * small readme changes * add load id to loadpackage info in current * add support for directly passing through the naming convention to the sink * add support for batch size zero (filepath passthrouh) * use patched version of flak8 encoding * fix tests * add support for secrets and config in sink * update sink docs * revert encodings branch * fix small linting problem * add support for config specs * add possibility to create a resolved partial * add lock for resolving config add test for nested configs * change resolved partial method to dedicated function * change signatures in decorator lock injection context for wrapped functions small pr fixes * fixes bug in inject wrapper refactor * mark destination decorator as experimental in the docs * change injection context locking strategy forward generic destiation call params into config small fixes * make tests independent from gcp imports * move generic destination tests into common tests section destinations * fix global instantiation test after file move * add tests for locking injection context * make inject test a bit better make simple test for loading load package without state * skip generic destination in init test --- dlt/__init__.py | 3 + dlt/common/configuration/__init__.py | 2 +- dlt/common/configuration/container.py | 67 ++- dlt/common/configuration/inject.py | 150 ++++-- dlt/common/configuration/resolve.py | 5 +- .../configuration/specs/base_configuration.py | 1 + dlt/common/data_types/type_helpers.py | 10 +- dlt/common/destination/capabilities.py | 2 +- dlt/common/pipeline.py | 10 +- dlt/common/reflection/spec.py | 7 +- dlt/common/storages/exceptions.py | 8 + dlt/common/storages/load_package.py | 177 ++++++- dlt/common/storages/load_storage.py | 16 +- dlt/common/storages/normalize_storage.py | 4 +- dlt/common/versioned_state.py | 45 ++ dlt/destinations/__init__.py | 2 + dlt/destinations/decorators.py | 57 +++ dlt/destinations/impl/destination/__init__.py | 14 + .../impl/destination/configuration.py | 32 ++ .../impl/destination/destination.py | 186 ++++++++ dlt/destinations/impl/destination/factory.py | 116 +++++ dlt/extract/incremental/__init__.py | 1 + dlt/helpers/streamlit_helper.py | 4 +- dlt/load/load.py | 32 +- dlt/pipeline/current.py | 7 + dlt/pipeline/pipeline.py | 46 +- dlt/pipeline/state_sync.py | 113 +++-- .../dlt-ecosystem/destinations/destination.md | 151 ++++++ .../verified-sources/pipedrive.md | 2 +- docs/website/sidebars.js | 3 +- tests/cases.py | 25 +- tests/cli/test_init_command.py | 2 + tests/common/configuration/test_inject.py | 111 ++++- tests/common/schema/test_coercion.py | 10 + tests/common/storages/test_load_package.py | 111 +++++ tests/common/test_destination.py | 12 +- tests/common/test_versioned_state.py | 43 ++ .../destinations/test_generic_destination.py | 436 ++++++++++++++++++ tests/load/pipeline/test_drop.py | 4 +- tests/load/pipeline/test_restore_state.py | 22 +- tests/load/weaviate/utils.py | 2 + tests/pipeline/test_pipeline.py | 4 +- tests/pipeline/test_pipeline_state.py | 47 +- tests/utils.py | 3 +- 44 files changed, 1893 insertions(+), 212 deletions(-) create mode 100644 dlt/common/versioned_state.py create mode 100644 
dlt/destinations/decorators.py create mode 100644 dlt/destinations/impl/destination/__init__.py create mode 100644 dlt/destinations/impl/destination/configuration.py create mode 100644 dlt/destinations/impl/destination/destination.py create mode 100644 dlt/destinations/impl/destination/factory.py create mode 100644 docs/website/docs/dlt-ecosystem/destinations/destination.md create mode 100644 tests/common/test_versioned_state.py create mode 100644 tests/destinations/test_generic_destination.py diff --git a/dlt/__init__.py b/dlt/__init__.py index e2a6b1a3a7..eee105e47e 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -29,6 +29,8 @@ from dlt import sources from dlt.extract.decorators import source, resource, transformer, defer +from dlt.destinations.decorators import destination + from dlt.pipeline import ( pipeline as _pipeline, run, @@ -62,6 +64,7 @@ "resource", "transformer", "defer", + "destination", "pipeline", "run", "attach", diff --git a/dlt/common/configuration/__init__.py b/dlt/common/configuration/__init__.py index b7d868ff8b..8de57f7799 100644 --- a/dlt/common/configuration/__init__.py +++ b/dlt/common/configuration/__init__.py @@ -1,7 +1,7 @@ from .specs.base_configuration import configspec, is_valid_hint, is_secret_hint, resolve_type from .specs import known_sections from .resolve import resolve_configuration, inject_section -from .inject import with_config, last_config, get_fun_spec +from .inject import with_config, last_config, get_fun_spec, create_resolved_partial from .exceptions import ( ConfigFieldMissingException, diff --git a/dlt/common/configuration/container.py b/dlt/common/configuration/container.py index ad20765489..441b0e21bc 100644 --- a/dlt/common/configuration/container.py +++ b/dlt/common/configuration/container.py @@ -1,7 +1,7 @@ -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext, AbstractContextManager import re import threading -from typing import ClassVar, Dict, Iterator, Tuple, Type, TypeVar +from typing import ClassVar, Dict, Iterator, Tuple, Type, TypeVar, Any from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext from dlt.common.configuration.exceptions import ( @@ -34,6 +34,9 @@ class Container: thread_contexts: Dict[int, Dict[Type[ContainerInjectableContext], ContainerInjectableContext]] """A thread aware mapping of injection context """ + _context_container_locks: Dict[str, threading.Lock] + """Locks for container types on threads.""" + main_context: Dict[Type[ContainerInjectableContext], ContainerInjectableContext] """Injection context for the main thread""" @@ -41,6 +44,7 @@ def __new__(cls: Type["Container"]) -> "Container": if not cls._INSTANCE: cls._INSTANCE = super().__new__(cls) cls._INSTANCE.thread_contexts = {} + cls._INSTANCE._context_container_locks = {} cls._INSTANCE.main_context = cls._INSTANCE.thread_contexts[ Container._MAIN_THREAD_ID ] = {} @@ -84,22 +88,22 @@ def _thread_context( self, spec: Type[TConfiguration] ) -> Dict[Type[ContainerInjectableContext], ContainerInjectableContext]: if spec.global_affinity: - context = self.main_context + return self.main_context else: # thread pool names used in dlt contain originating thread id. 
use this id over pool id if m := re.match(r"dlt-pool-(\d+)-", threading.currentThread().getName()): thread_id = int(m.group(1)) else: thread_id = threading.get_ident() + # return main context for main thread if thread_id == Container._MAIN_THREAD_ID: return self.main_context # we may add a new empty thread context so lock here with Container._LOCK: - context = self.thread_contexts.get(thread_id) - if context is None: + if (context := self.thread_contexts.get(thread_id)) is None: context = self.thread_contexts[thread_id] = {} - return context + return context def _thread_getitem( self, spec: Type[TConfiguration] @@ -127,29 +131,44 @@ def _thread_delitem( del context[spec] @contextmanager - def injectable_context(self, config: TConfiguration) -> Iterator[TConfiguration]: + def injectable_context( + self, config: TConfiguration, lock_context: bool = False + ) -> Iterator[TConfiguration]: """A context manager that will insert `config` into the container and restore the previous value when it gets out of scope.""" + config.resolve() spec = type(config) previous_config: ContainerInjectableContext = None - context, previous_config = self._thread_getitem(spec) - - # set new config and yield context - self._thread_setitem(context, spec, config) - try: - yield config - finally: - # before setting the previous config for given spec, check if there was no overlapping modification - context, current_config = self._thread_getitem(spec) - if current_config is config: - # config is injected for spec so restore previous - if previous_config is None: - self._thread_delitem(context, spec) + context = self._thread_context(spec) + lock: AbstractContextManager[Any] + + # if there is a lock_id, we need a lock for the lock_id in the scope of the current context + if lock_context: + lock_key = f"{id(context)}" + if (lock := self._context_container_locks.get(lock_key)) is None: + with Container._LOCK: + self._context_container_locks[lock_key] = lock = threading.Lock() + else: + lock = nullcontext() + + with lock: + # remember context and set item + previous_config = context.get(spec) + self._thread_setitem(context, spec, config) + try: + yield config + finally: + # before setting the previous config for given spec, check if there was no overlapping modification + context, current_config = self._thread_getitem(spec) + if current_config is config: + # config is injected for spec so restore previous + if previous_config is None: + self._thread_delitem(context, spec) + else: + self._thread_setitem(context, spec, previous_config) else: - self._thread_setitem(context, spec, previous_config) - else: - # value was modified in the meantime and not restored - raise ContainerInjectableContextMangled(spec, context[spec], config) + # value was modified in the meantime and not restored + raise ContainerInjectableContextMangled(spec, context[spec], config) @staticmethod def thread_pool_prefix() -> str: diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index a22f299ae8..03f640e6df 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -1,12 +1,15 @@ import inspect + from functools import wraps -from typing import Callable, Dict, Type, Any, Optional, Tuple, TypeVar, overload +from typing import Callable, Dict, Type, Any, Optional, Tuple, TypeVar, overload, cast from inspect import Signature, Parameter +from contextlib import nullcontext from dlt.common.typing import DictStrAny, StrAny, TFun, AnyFun from dlt.common.configuration.resolve import 
resolve_configuration, inject_section from dlt.common.configuration.specs.base_configuration import BaseConfiguration from dlt.common.configuration.specs.config_section_context import ConfigSectionContext + from dlt.common.reflection.spec import spec_from_signature @@ -32,6 +35,9 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, + initial_config: BaseConfiguration = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> TFun: ... @@ -45,6 +51,9 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, + initial_config: Optional[BaseConfiguration] = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> Callable[[TFun], TFun]: ... @@ -58,6 +67,8 @@ def with_config( include_defaults: bool = True, accept_partial: bool = False, initial_config: Optional[BaseConfiguration] = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> Callable[[TFun], TFun]: """Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. @@ -71,10 +82,12 @@ def with_config( prefer_existing_sections: (bool, optional): When joining existing section context, the existing context will be preferred to the one in `sections`. Default: False auto_pipeline_section (bool, optional): If True, a top level pipeline section will be added if `pipeline_name` argument is present . Defaults to False. include_defaults (bool, optional): If True then arguments with default values will be included in synthesized spec. If False only the required arguments marked with `dlt.secrets.value` and `dlt.config.value` are included - + base (Type[BaseConfiguration], optional): A base class for synthesized spec. Defaults to BaseConfiguration. + lock_context_on_injection (bool, optional): If True, the thread context will be locked during injection to prevent race conditions. Defaults to True. Returns: Callable[[TFun], TFun]: A decorated function """ + section_f: Callable[[StrAny], str] = None # section may be a function from function arguments to section if callable(sections): @@ -88,9 +101,8 @@ def decorator(f: TFun) -> TFun: ) spec_arg: Parameter = None pipeline_name_arg: Parameter = None - if spec is None: - SPEC = spec_from_signature(f, sig, include_defaults) + SPEC = spec_from_signature(f, sig, include_defaults, base=base) else: SPEC = spec @@ -109,49 +121,52 @@ def decorator(f: TFun) -> TFun: pipeline_name_arg = p pipeline_name_arg_default = None if p.default == Parameter.empty else p.default - @wraps(f) - def _wrap(*args: Any, **kwargs: Any) -> Any: + def resolve_config(bound_args: inspect.BoundArguments) -> BaseConfiguration: + """Resolve arguments using the provided spec""" # bind parameters to signature - bound_args = sig.bind(*args, **kwargs) # for calls containing resolved spec in the kwargs, we do not need to resolve again config: BaseConfiguration = None - if _LAST_DLT_CONFIG in kwargs: - config = last_config(**kwargs) + + # if section derivation function was provided then call it + if section_f: + curr_sections: Tuple[str, ...] = (section_f(bound_args.arguments),) + # sections may be a string + elif isinstance(sections, str): + curr_sections = (sections,) else: - # if section derivation function was provided then call it - if section_f: - curr_sections: Tuple[str, ...] 
= (section_f(bound_args.arguments),) - # sections may be a string - elif isinstance(sections, str): - curr_sections = (sections,) - else: - curr_sections = sections - - # if one of arguments is spec the use it as initial value - if initial_config: - config = initial_config - elif spec_arg: - config = bound_args.arguments.get(spec_arg.name, None) - # resolve SPEC, also provide section_context with pipeline_name - if pipeline_name_arg: - curr_pipeline_name = bound_args.arguments.get( - pipeline_name_arg.name, pipeline_name_arg_default - ) - else: - curr_pipeline_name = None - section_context = ConfigSectionContext( - pipeline_name=curr_pipeline_name, - sections=curr_sections, - merge_style=sections_merge_style, + curr_sections = sections + + # if one of arguments is spec the use it as initial value + if initial_config: + config = initial_config + elif spec_arg: + config = bound_args.arguments.get(spec_arg.name, None) + # resolve SPEC, also provide section_context with pipeline_name + if pipeline_name_arg: + curr_pipeline_name = bound_args.arguments.get( + pipeline_name_arg.name, pipeline_name_arg_default ) - # this may be called from many threads so section_context is thread affine - with inject_section(section_context): - # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") - config = resolve_configuration( - config or SPEC(), - explicit_value=bound_args.arguments, - accept_partial=accept_partial, - ) + else: + curr_pipeline_name = None + section_context = ConfigSectionContext( + pipeline_name=curr_pipeline_name, + sections=curr_sections, + merge_style=sections_merge_style, + ) + + # this may be called from many threads so section_context is thread affine + with inject_section(section_context, lock_context=lock_context_on_injection): + # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") + return resolve_configuration( + config or SPEC(), + explicit_value=bound_args.arguments, + accept_partial=accept_partial, + ) + + def update_bound_args( + bound_args: inspect.BoundArguments, config: BaseConfiguration, args: Any, kwargs: Any + ) -> None: + # overwrite or add resolved params resolved_params = dict(config) # overwrite or add resolved params for p in sig.parameters.values(): @@ -167,12 +182,56 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: bound_args.arguments[kwargs_arg.name].update(resolved_params) bound_args.arguments[kwargs_arg.name][_LAST_DLT_CONFIG] = config bound_args.arguments[kwargs_arg.name][_ORIGINAL_ARGS] = (args, kwargs) + + def with_partially_resolved_config(config: Optional[BaseConfiguration] = None) -> Any: + # creates a pre-resolved partial of the decorated function + empty_bound_args = sig.bind_partial() + if not config: + config = resolve_config(empty_bound_args) + + def wrapped(*args: Any, **kwargs: Any) -> Any: + nonlocal config + + # Do we need an exception here? + if spec_arg and spec_arg.name in kwargs: + from dlt.common import logger + + logger.warning( + "Spec argument is provided in kwargs, ignoring it for resolved partial" + " function." 
+ ) + + # we can still overwrite the config + if _LAST_DLT_CONFIG in kwargs: + config = last_config(**kwargs) + + # call the function with the pre-resolved config + bound_args = sig.bind(*args, **kwargs) + update_bound_args(bound_args, config, args, kwargs) + return f(*bound_args.args, **bound_args.kwargs) + + return wrapped + + @wraps(f) + def _wrap(*args: Any, **kwargs: Any) -> Any: + # Resolve config + config: BaseConfiguration = None + bound_args = sig.bind(*args, **kwargs) + if _LAST_DLT_CONFIG in kwargs: + config = last_config(**kwargs) + else: + config = resolve_config(bound_args) + # call the function with resolved config + update_bound_args(bound_args, config, args, kwargs) return f(*bound_args.args, **bound_args.kwargs) # register the spec for a wrapped function _FUNC_SPECS[id(_wrap)] = SPEC + # add a method to create a pre-resolved partial + setattr(_wrap, "__RESOLVED_PARTIAL_FUNC__", with_partially_resolved_config) # noqa: B010 + return _wrap # type: ignore # See if we're being called as @with_config or @with_config(). @@ -197,3 +256,10 @@ def last_config(**kwargs: Any) -> Any: def get_orig_args(**kwargs: Any) -> Tuple[Tuple[Any], DictStrAny]: return kwargs[_ORIGINAL_ARGS] # type: ignore + + +def create_resolved_partial(f: AnyFun, config: Optional[BaseConfiguration] = None) -> AnyFun: + """Create a pre-resolved partial of the with_config decorated function""" + if partial_func := getattr(f, "__RESOLVED_PARTIAL_FUNC__", None): + return cast(AnyFun, partial_func(config)) + return f diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index db69cd9572..b398f0463a 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -92,13 +92,14 @@ def initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur def inject_section( - section_context: ConfigSectionContext, merge_existing: bool = True + section_context: ConfigSectionContext, merge_existing: bool = True, lock_context: bool = False ) -> ContextManager[ConfigSectionContext]: """Context manager that sets section specified in `section_context` to be used during configuration resolution. Optionally merges the context already in the container with the one provided Args: section_context (ConfigSectionContext): Instance providing a pipeline name and section context merge_existing (bool, optional): Merges existing section context with `section_context` in the arguments by executing `merge_style` function on `section_context`. Defaults to True. + lock_context (bool, optional): Instruct to threadlock the current thread to prevent race conditions in context injection. Default Merge Style: Gets `pipeline_name` and `sections` from existing context if they are not provided in `section_context` argument. 
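An illustrative usage sketch of the new `lock_context` flag (the pipeline and section names are made up):

from dlt.common.configuration.resolve import inject_section
from dlt.common.configuration.specs.config_section_context import ConfigSectionContext

# inject a section context and serialize configuration resolution within the
# current thread context while the block is active
section_ctx = ConfigSectionContext(pipeline_name="my_pipeline", sections=("sources", "my_source"))
with inject_section(section_ctx, lock_context=True):
    ...  # resolve_configuration() calls made here see the injected sections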
@@ -112,7 +113,7 @@ def inject_section( if merge_existing: section_context.merge(existing_context) - return container.injectable_context(section_context) + return container.injectable_context(section_context, lock_context=lock_context) def _maybe_parse_native_value( diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 84f59fa894..62abf42f27 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -2,6 +2,7 @@ import inspect import contextlib import dataclasses + from collections.abc import Mapping as C_Mapping from typing import ( Callable, diff --git a/dlt/common/data_types/type_helpers.py b/dlt/common/data_types/type_helpers.py index 659b4951df..61a0aa1dbf 100644 --- a/dlt/common/data_types/type_helpers.py +++ b/dlt/common/data_types/type_helpers.py @@ -7,7 +7,7 @@ from enum import Enum from dlt.common import pendulum, json, Decimal, Wei -from dlt.common.json import custom_pua_remove +from dlt.common.json import custom_pua_remove, json from dlt.common.json._simplejson import custom_encode as json_custom_encode from dlt.common.arithmetics import InvalidOperation from dlt.common.data_types.typing import TDataType @@ -105,6 +105,14 @@ def coerce_value(to_type: TDataType, from_type: TDataType, value: Any) -> Any: return int(value.value) return value + if to_type == "complex": + # try to coerce from text + if from_type == "text": + try: + return json.loads(value) + except Exception: + raise ValueError(value) + if to_type == "text": if from_type == "complex": return complex_to_str(value) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index a78a31fdf3..0f2500c2cd 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -19,7 +19,7 @@ ] ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) # file formats used internally by dlt -INTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = {"puae-jsonl", "sql", "reference", "arrow"} +INTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = {"sql", "reference", "arrow"} # file formats that may be chosen by the user EXTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = ( set(get_args(TLoaderFileFormat)) - INTERNAL_LOADER_FILE_FORMATS diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index df221ec703..3cbaafefbe 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -3,6 +3,7 @@ import datetime # noqa: 251 import humanize import contextlib + from typing import ( Any, Callable, @@ -40,11 +41,15 @@ from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo +from dlt.common.storages.load_package import PackageStorage + from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts +from dlt.common.versioned_state import TVersionedState +from dlt.common.storages.load_package import TLoadPackageState class _StepInfo(NamedTuple): @@ -454,7 +459,7 @@ class TPipelineLocalState(TypedDict, total=False): """Hash of state that was recently synced with 
destination""" -class TPipelineState(TypedDict, total=False): +class TPipelineState(TVersionedState, total=False): """Schema for a pipeline state that is stored within the pipeline working directory""" pipeline_name: str @@ -469,9 +474,6 @@ class TPipelineState(TypedDict, total=False): staging_type: Optional[str] # properties starting with _ are not automatically applied to pipeline object when state is restored - _state_version: int - _version_hash: str - _state_engine_version: int _local: TPipelineLocalState """A section of state that is not synchronized with the destination and does not participate in change merging and version control""" diff --git a/dlt/common/reflection/spec.py b/dlt/common/reflection/spec.py index 0a486088c8..ffc12e908c 100644 --- a/dlt/common/reflection/spec.py +++ b/dlt/common/reflection/spec.py @@ -26,7 +26,10 @@ def _first_up(s: str) -> str: def spec_from_signature( - f: AnyFun, sig: Signature, include_defaults: bool = True + f: AnyFun, + sig: Signature, + include_defaults: bool = True, + base: Type[BaseConfiguration] = BaseConfiguration, ) -> Type[BaseConfiguration]: name = _get_spec_name_from_f(f) module = inspect.getmodule(f) @@ -109,7 +112,7 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: # set annotations so they are present in __dict__ fields["__annotations__"] = annotations # synthesize type - T: Type[BaseConfiguration] = type(name, (BaseConfiguration,), fields) + T: Type[BaseConfiguration] = type(name, (base,), fields) SPEC = configspec()(T) # add to the module setattr(module, spec_id, SPEC) diff --git a/dlt/common/storages/exceptions.py b/dlt/common/storages/exceptions.py index 22d6dfaf79..f4288719c1 100644 --- a/dlt/common/storages/exceptions.py +++ b/dlt/common/storages/exceptions.py @@ -116,3 +116,11 @@ def __init__(self, schema_name: str, storage_path: str, stored_name: str) -> Non f"A schema file name '{schema_name}' in {storage_path} does not correspond to the name" f" of schema in the file {stored_name}" ) + + +class CurrentLoadPackageStateNotAvailable(StorageException): + def __init__(self) -> None: + super().__init__( + "State of the current load package is not available. Current load package state is" + " only available in a function decorated with @dlt.destination during loading." 
+ ) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 63409aa878..bb66e28671 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -1,6 +1,8 @@ import contextlib import os from copy import deepcopy +import threading + import datetime # noqa: 251 import humanize from pathlib import Path @@ -17,23 +19,92 @@ Set, get_args, cast, + Any, + Tuple, + TYPE_CHECKING, + TypedDict, ) from dlt.common import pendulum, json + +from dlt.common.configuration import configspec +from dlt.common.configuration.specs import ContainerInjectableContext +from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated +from dlt.common.configuration.container import Container + from dlt.common.data_writers import DataWriter, new_file_id from dlt.common.destination import TLoaderFileFormat from dlt.common.exceptions import TerminalValueError from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns from dlt.common.storages import FileStorage -from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import DictStrAny, StrAny, SupportsHumanize +from dlt.common.storages.exceptions import LoadPackageNotFound, CurrentLoadPackageStateNotAvailable +from dlt.common.typing import DictStrAny, SupportsHumanize from dlt.common.utils import flatten_list_or_items +from dlt.common.versioned_state import ( + generate_state_version_hash, + bump_state_version_if_modified, + TVersionedState, + default_versioned_state, +) +from typing_extensions import NotRequired + + +class TLoadPackageState(TVersionedState, total=False): + created_at: str + """Timestamp when the loadpackage was created""" + + """A section of state that does not participate in change merging and version control""" + destination_state: NotRequired[Dict[str, Any]] + """private space for destinations to store state relevant only to the load package""" + + +class TLoadPackage(TypedDict, total=False): + load_id: str + """Load id""" + state: TLoadPackageState + """State of the load package""" + + +# allows to upgrade state when restored with a new version of state logic/schema +LOADPACKAGE_STATE_ENGINE_VERSION = 1 + + +def generate_loadpackage_state_version_hash(state: TLoadPackageState) -> str: + return generate_state_version_hash(state) + + +def bump_loadpackage_state_version_if_modified(state: TLoadPackageState) -> Tuple[int, str, str]: + return bump_state_version_if_modified(state) + + +def migrate_load_package_state( + state: DictStrAny, from_engine: int, to_engine: int +) -> TLoadPackageState: + # TODO: if you start adding new versions, we need proper tests for these migrations! 
+ # NOTE: do not touch destinations state, it is not versioned + if from_engine == to_engine: + return cast(TLoadPackageState, state) + + # check state engine + if from_engine != to_engine: + raise Exception("No upgrade path for loadpackage state") + + state["_state_engine_version"] = from_engine + return cast(TLoadPackageState, state) + + +def default_load_package_state() -> TLoadPackageState: + return { + **default_versioned_state(), + "_state_engine_version": LOADPACKAGE_STATE_ENGINE_VERSION, + } + # folders to manage load jobs in a single load package TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) -TLoadPackageState = Literal["new", "extracted", "normalized", "loaded", "aborted"] +TLoadPackageStatus = Literal["new", "extracted", "normalized", "loaded", "aborted"] class ParsedLoadJobFileName(NamedTuple): @@ -125,7 +196,7 @@ def __str__(self) -> str: class _LoadPackageInfo(NamedTuple): load_id: str package_path: str - state: TLoadPackageState + state: TLoadPackageStatus schema: Schema schema_update: TSchemaTables completed_at: datetime.datetime @@ -201,8 +272,11 @@ class PackageStorage: PACKAGE_COMPLETED_FILE_NAME = ( # completed package marker file, currently only to store data with os.stat "package_completed.json" ) + LOAD_PACKAGE_STATE_FILE_NAME = ( # internal state of the load package, will not be synced to the destination + "load_package_state.json" + ) - def __init__(self, storage: FileStorage, initial_state: TLoadPackageState) -> None: + def __init__(self, storage: FileStorage, initial_state: TLoadPackageStatus) -> None: """Creates storage that manages load packages with root at `storage` and initial package state `initial_state`""" self.storage = storage self.initial_state = initial_state @@ -334,8 +408,13 @@ def create_package(self, load_id: str) -> None: self.storage.create_folder(os.path.join(load_id, PackageStorage.COMPLETED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.FAILED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.STARTED_JOBS_FOLDER)) + # ensure created timestamp is set in state when load package is created + state = self.get_load_package_state(load_id) + if not state.get("created_at"): + state["created_at"] = pendulum.now().to_iso8601_string() + self.save_load_package_state(load_id, state) - def complete_loading_package(self, load_id: str, load_state: TLoadPackageState) -> str: + def complete_loading_package(self, load_id: str, load_state: TLoadPackageStatus) -> str: """Completes loading the package by writing marker file with`package_state. 
Returns path to the completed package""" load_path = self.get_package_path(load_id) # save marker file @@ -381,6 +460,36 @@ def save_schema_updates(self, load_id: str, schema_update: TSchemaTables) -> Non ) as f: json.dump(schema_update, f) + # + # Loadpackage state + # + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + package_path = self.get_package_path(load_id) + if not self.storage.has_folder(package_path): + raise LoadPackageNotFound(load_id) + try: + state_dump = self.storage.load(self.get_load_package_state_path(load_id)) + state = json.loads(state_dump) + return migrate_load_package_state( + state, state["_state_engine_version"], LOADPACKAGE_STATE_ENGINE_VERSION + ) + except FileNotFoundError: + return default_load_package_state() + + def save_load_package_state(self, load_id: str, state: TLoadPackageState) -> None: + package_path = self.get_package_path(load_id) + if not self.storage.has_folder(package_path): + raise LoadPackageNotFound(load_id) + bump_loadpackage_state_version_if_modified(state) + self.storage.save( + self.get_load_package_state_path(load_id), + json.dumps(state), + ) + + def get_load_package_state_path(self, load_id: str) -> str: + package_path = self.get_package_path(load_id) + return os.path.join(package_path, PackageStorage.LOAD_PACKAGE_STATE_FILE_NAME) + # # Get package info # @@ -514,3 +623,59 @@ def filter_jobs_for_table( all_jobs: Iterable[LoadJobInfo], table_name: str ) -> Sequence[LoadJobInfo]: return [job for job in all_jobs if job.job_file_info.table_name == table_name] + + +@configspec +class LoadPackageStateInjectableContext(ContainerInjectableContext): + storage: PackageStorage + load_id: str + can_create_default: ClassVar[bool] = False + global_affinity: ClassVar[bool] = False + + def commit(self) -> None: + with self.state_save_lock: + self.storage.save_load_package_state(self.load_id, self.state) + + def on_resolved(self) -> None: + self.state_save_lock = threading.Lock() + self.state = self.storage.get_load_package_state(self.load_id) + + if TYPE_CHECKING: + + def __init__(self, load_id: str, storage: PackageStorage) -> None: ... + + +def load_package() -> TLoadPackage: + """Get full load package state present in current context. Across all threads this will be the same in memory dict.""" + container = Container() + # get injected state if present. injected load package state is typically "managed" so changes will be persisted + # if you need to save the load package state during a load, you need to call commit_load_package_state + try: + state_ctx = container[LoadPackageStateInjectableContext] + except ContextDefaultCannotBeCreated: + raise CurrentLoadPackageStateNotAvailable() + return TLoadPackage(state=state_ctx.state, load_id=state_ctx.load_id) + + +def commit_load_package_state() -> None: + """Commit load package state present in current context. This is thread safe.""" + container = Container() + try: + state_ctx = container[LoadPackageStateInjectableContext] + except ContextDefaultCannotBeCreated: + raise CurrentLoadPackageStateNotAvailable() + state_ctx.commit() + + +def destination_state() -> DictStrAny: + """Get segment of load package state that is specific to the current destination.""" + lp = load_package() + return lp["state"].setdefault("destination_state", {}) + + +def clear_destination_state(commit: bool = True) -> None: + """Clear segment of load package state that is specific to the current destination. 
Optionally commit to load package.""" + lp = load_package() + lp["state"].pop("destination_state", None) + if commit: + commit_load_package_state() diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index a83502cb9b..ffd55e7f29 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -1,6 +1,7 @@ from os.path import join from typing import Iterable, Optional, Sequence +from dlt.common.typing import DictStrAny from dlt.common import json from dlt.common.configuration import known_sections from dlt.common.configuration.inject import with_config @@ -18,6 +19,7 @@ PackageStorage, ParsedLoadJobFileName, TJobState, + TLoadPackageState, ) from dlt.common.storages.exceptions import JobWithUnsupportedWriterException, LoadPackageNotFound @@ -38,6 +40,11 @@ def __init__( supported_file_formats: Iterable[TLoaderFileFormat], config: LoadStorageConfiguration = config.value, ) -> None: + # puae-jsonl jobs have the extension .jsonl, so cater for this here + if supported_file_formats and "puae-jsonl" in supported_file_formats: + supported_file_formats = list(supported_file_formats) + supported_file_formats.append("jsonl") + if not LoadStorage.ALL_SUPPORTED_FILE_FORMATS.issuperset(supported_file_formats): raise TerminalValueError(supported_file_formats) if preferred_file_format and preferred_file_format not in supported_file_formats: @@ -79,7 +86,7 @@ def _get_data_item_path_template(self, load_id: str, _: str, table_name: str) -> def list_new_jobs(self, load_id: str) -> Sequence[str]: """Lists all jobs in new jobs folder of normalized package storage and checks if file formats are supported""" new_jobs = self.normalized_packages.list_new_jobs(load_id) - # # make sure all jobs have supported writers + # make sure all jobs have supported writers wrong_job = next( ( j @@ -184,3 +191,10 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: return self.loaded_packages.get_load_package_info(load_id) except LoadPackageNotFound: return self.normalized_packages.get_load_package_info(load_id) + + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + """Gets state of normlized or loaded package with given load_id, all jobs and their statuses.""" + try: + return self.loaded_packages.get_load_package_state(load_id) + except LoadPackageNotFound: + return self.normalized_packages.get_load_package_state(load_id) diff --git a/dlt/common/storages/normalize_storage.py b/dlt/common/storages/normalize_storage.py index 8a247c2021..2b90b7c088 100644 --- a/dlt/common/storages/normalize_storage.py +++ b/dlt/common/storages/normalize_storage.py @@ -51,7 +51,9 @@ def list_files_to_normalize_sorted(self) -> Sequence[str]: [ file for file in files - if not file.endswith(PackageStorage.SCHEMA_FILE_NAME) and os.path.isfile(file) + if not file.endswith(PackageStorage.SCHEMA_FILE_NAME) + and os.path.isfile(file) + and not file.endswith(PackageStorage.LOAD_PACKAGE_STATE_FILE_NAME) ] ) diff --git a/dlt/common/versioned_state.py b/dlt/common/versioned_state.py new file mode 100644 index 0000000000..a051a6660c --- /dev/null +++ b/dlt/common/versioned_state.py @@ -0,0 +1,45 @@ +import base64 +import hashlib +from copy import copy + +import datetime # noqa: 251 +from dlt.common import json +from typing import TypedDict, Dict, Any, List, Tuple, cast + + +class TVersionedState(TypedDict, total=False): + _state_version: int + _version_hash: str + _state_engine_version: int + + +def generate_state_version_hash(state: TVersionedState, 
exclude_attrs: List[str] = None) -> str: + # generates hash out of stored schema content, excluding hash itself, version and local state + state_copy = copy(state) + exclude_attrs = exclude_attrs or [] + exclude_attrs.extend(["_state_version", "_state_engine_version", "_version_hash"]) + for attr in exclude_attrs: + state_copy.pop(attr, None) # type: ignore + content = json.typed_dumpb(state_copy, sort_keys=True) # type: ignore + h = hashlib.sha3_256(content) + return base64.b64encode(h.digest()).decode("ascii") + + +def bump_state_version_if_modified( + state: TVersionedState, exclude_attrs: List[str] = None +) -> Tuple[int, str, str]: + """Bumps the `state` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" + hash_ = generate_state_version_hash(state, exclude_attrs) + previous_hash = state.get("_version_hash") + if not previous_hash: + # if hash was not set, set it without bumping the version, that's the initial state + pass + elif hash_ != previous_hash: + state["_state_version"] += 1 + + state["_version_hash"] = hash_ + return state["_state_version"], hash_, previous_hash + + +def default_versioned_state() -> TVersionedState: + return {"_state_version": 0, "_state_engine_version": 1} diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index c0a0b419c1..4a10deffc0 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -10,6 +10,7 @@ from dlt.destinations.impl.qdrant.factory import qdrant from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate +from dlt.destinations.impl.destination.factory import destination from dlt.destinations.impl.synapse.factory import synapse from dlt.destinations.impl.databricks.factory import databricks @@ -29,4 +30,5 @@ "weaviate", "synapse", "databricks", + "destination", ] diff --git a/dlt/destinations/decorators.py b/dlt/destinations/decorators.py new file mode 100644 index 0000000000..d5e6e816b3 --- /dev/null +++ b/dlt/destinations/decorators.py @@ -0,0 +1,57 @@ +import functools + +from typing import Any, Type, Optional, Callable, Union +from typing_extensions import Concatenate + +from functools import wraps + +from dlt.common import logger +from dlt.destinations.impl.destination.factory import destination as _destination +from dlt.destinations.impl.destination.configuration import ( + TDestinationCallableParams, + GenericDestinationClientConfiguration, +) +from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import Destination +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema + + +def destination( + *, + loader_file_format: TLoaderFileFormat = None, + batch_size: int = 10, + name: str = None, + naming_convention: str = "direct", + spec: Type[GenericDestinationClientConfiguration] = GenericDestinationClientConfiguration, +) -> Callable[ + [Callable[Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any]], + Callable[TDestinationCallableParams, _destination], +]: + def decorator( + destination_callable: Callable[ + Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any + ] + ) -> Callable[TDestinationCallableParams, _destination]: + @wraps(destination_callable) + def wrapper( + *args: TDestinationCallableParams.args, **kwargs: TDestinationCallableParams.kwargs + ) -> _destination: + if args: + logger.warning( + "Ignoring positional arguments for destination callable %s", 
+ destination_callable, + ) + return _destination( + spec=spec, + destination_callable=destination_callable, + loader_file_format=loader_file_format, + batch_size=batch_size, + destination_name=name, + naming_convention=naming_convention, + **kwargs, # type: ignore + ) + + return wrapper + + return decorator diff --git a/dlt/destinations/impl/destination/__init__.py b/dlt/destinations/impl/destination/__init__.py new file mode 100644 index 0000000000..fbad2d570f --- /dev/null +++ b/dlt/destinations/impl/destination/__init__.py @@ -0,0 +1,14 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.data_writers import TLoaderFileFormat + + +def capabilities( + preferred_loader_file_format: TLoaderFileFormat = "puae-jsonl", + naming_convention: str = "direct", +) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format) + caps.supported_loader_file_formats = ["puae-jsonl", "parquet"] + caps.supports_ddl_transactions = False + caps.supports_transactions = False + caps.naming_convention = naming_convention + return caps diff --git a/dlt/destinations/impl/destination/configuration.py b/dlt/destinations/impl/destination/configuration.py new file mode 100644 index 0000000000..339df9bff2 --- /dev/null +++ b/dlt/destinations/impl/destination/configuration.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING, Optional, Final, Callable, Union, Any +from typing_extensions import ParamSpec + +from dlt.common.configuration import configspec +from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import ( + DestinationClientConfiguration, +) +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema +from dlt.common.destination import Destination + +TDestinationCallable = Callable[[Union[TDataItems, str], TTableSchema], None] +TDestinationCallableParams = ParamSpec("TDestinationCallableParams") + + +@configspec +class GenericDestinationClientConfiguration(DestinationClientConfiguration): + destination_type: Final[str] = "destination" # type: ignore + destination_callable: Optional[Union[str, TDestinationCallable]] = None # noqa: A003 + loader_file_format: TLoaderFileFormat = "puae-jsonl" + batch_size: int = 10 + + if TYPE_CHECKING: + + def __init__( + self, + *, + loader_file_format: TLoaderFileFormat = "puae-jsonl", + batch_size: int = 10, + destination_callable: Union[TDestinationCallable, str] = None, + ) -> None: ... 
diff --git a/dlt/destinations/impl/destination/destination.py b/dlt/destinations/impl/destination/destination.py new file mode 100644 index 0000000000..181e9f287f --- /dev/null +++ b/dlt/destinations/impl/destination/destination.py @@ -0,0 +1,186 @@ +from abc import ABC, abstractmethod +from types import TracebackType +from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict + +from dlt.destinations.job_impl import EmptyLoadJob +from dlt.common.typing import TDataItems, AnyFun +from dlt.common import json +from dlt.pipeline.current import ( + destination_state, + commit_load_package_state, +) +from dlt.common.configuration import create_resolved_partial + +from dlt.common.schema import Schema, TTableSchema, TSchemaTables +from dlt.common.schema.typing import TTableSchema +from dlt.common.storages import FileStorage +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import ( + TLoadJobState, + LoadJob, + JobClientBase, +) + +from dlt.destinations.impl.destination import capabilities +from dlt.destinations.impl.destination.configuration import ( + GenericDestinationClientConfiguration, + TDestinationCallable, +) + + +class DestinationLoadJob(LoadJob, ABC): + def __init__( + self, + table: TTableSchema, + file_path: str, + config: GenericDestinationClientConfiguration, + schema: Schema, + destination_state: Dict[str, int], + destination_callable: TDestinationCallable, + ) -> None: + super().__init__(FileStorage.get_file_name_from_file_path(file_path)) + self._file_path = file_path + self._config = config + self._table = table + self._schema = schema + # we create pre_resolved callable here + self._callable = destination_callable + self._state: TLoadJobState = "running" + self._storage_id = f"{self._parsed_file_name.table_name}.{self._parsed_file_name.file_id}" + try: + if self._config.batch_size == 0: + # on batch size zero we only call the callable with the filename + self.call_callable_with_items(self._file_path) + else: + current_index = destination_state.get(self._storage_id, 0) + for batch in self.run(current_index): + self.call_callable_with_items(batch) + current_index += len(batch) + destination_state[self._storage_id] = current_index + + self._state = "completed" + except Exception as e: + self._state = "retry" + raise e + finally: + # save progress + commit_load_package_state() + + @abstractmethod + def run(self, start_index: int) -> Iterable[TDataItems]: + pass + + def call_callable_with_items(self, items: TDataItems) -> None: + if not items: + return + # call callable + self._callable(items, self._table) + + def state(self) -> TLoadJobState: + return self._state + + def exception(self) -> str: + raise NotImplementedError() + + +class DestinationParquetLoadJob(DestinationLoadJob): + def run(self, start_index: int) -> Iterable[TDataItems]: + # stream items + from dlt.common.libs.pyarrow import pyarrow + + # guard against changed batch size after restart of loadjob + assert ( + start_index % self._config.batch_size + ) == 0, "Batch size was changed during processing of one load package" + + start_batch = start_index / self._config.batch_size + with pyarrow.parquet.ParquetFile(self._file_path) as reader: + for record_batch in reader.iter_batches(batch_size=self._config.batch_size): + if start_batch > 0: + start_batch -= 1 + continue + yield record_batch + + +class DestinationJsonlLoadJob(DestinationLoadJob): + def run(self, start_index: int) -> Iterable[TDataItems]: + current_batch: TDataItems = [] + + # 
stream items + with FileStorage.open_zipsafe_ro(self._file_path) as f: + encoded_json = json.typed_loads(f.read()) + + for item in encoded_json: + # find correct start position + if start_index > 0: + start_index -= 1 + continue + current_batch.append(item) + if len(current_batch) == self._config.batch_size: + yield current_batch + current_batch = [] + yield current_batch + + +class DestinationClient(JobClientBase): + """Sink Client""" + + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, schema: Schema, config: GenericDestinationClientConfiguration) -> None: + super().__init__(schema, config) + self.config: GenericDestinationClientConfiguration = config + # create pre-resolved callable to avoid multiple config resolutions during execution of the jobs + self.destination_callable = create_resolved_partial( + cast(AnyFun, self.config.destination_callable), self.config + ) + + def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: + pass + + def is_storage_initialized(self) -> bool: + return True + + def drop_storage(self) -> None: + pass + + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> Optional[TSchemaTables]: + return super().update_stored_schema(only_tables, expected_update) + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + # save our state in destination name scope + load_state = destination_state() + if file_path.endswith("parquet"): + return DestinationParquetLoadJob( + table, + file_path, + self.config, + self.schema, + load_state, + self.destination_callable, + ) + if file_path.endswith("jsonl"): + return DestinationJsonlLoadJob( + table, + file_path, + self.config, + self.schema, + load_state, + self.destination_callable, + ) + return None + + def restore_file_load(self, file_path: str) -> LoadJob: + return EmptyLoadJob.from_file_path(file_path, "completed") + + def complete_load(self, load_id: str) -> None: ... 
+ + def __enter__(self) -> "DestinationClient": + return self + + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: + pass diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py new file mode 100644 index 0000000000..992d78795b --- /dev/null +++ b/dlt/destinations/impl/destination/factory.py @@ -0,0 +1,116 @@ +import typing as t +import inspect +from importlib import import_module + +from types import ModuleType +from dlt.common.typing import AnyFun + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.configuration import known_sections, with_config, get_fun_spec +from dlt.common.configuration.exceptions import ConfigurationValueError + +from dlt.destinations.impl.destination.configuration import ( + GenericDestinationClientConfiguration, + TDestinationCallable, +) +from dlt.destinations.impl.destination import capabilities +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.utils import get_callable_name + +if t.TYPE_CHECKING: + from dlt.destinations.impl.destination.destination import DestinationClient + + +class DestinationInfo(t.NamedTuple): + """Runtime information on a discovered destination""" + + SPEC: t.Type[GenericDestinationClientConfiguration] + f: AnyFun + module: ModuleType + + +_DESTINATIONS: t.Dict[str, DestinationInfo] = {} +"""A registry of all the decorated destinations""" + + +class destination(Destination[GenericDestinationClientConfiguration, "DestinationClient"]): + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities( + self.config_params.get("loader_file_format", "puae-jsonl"), + self.config_params.get("naming_convention", "direct"), + ) + + @property + def spec(self) -> t.Type[GenericDestinationClientConfiguration]: + """A spec of destination configuration resolved from the sink function signature""" + return self._spec + + @property + def client_class(self) -> t.Type["DestinationClient"]: + from dlt.destinations.impl.destination.destination import DestinationClient + + return DestinationClient + + def __init__( + self, + destination_callable: t.Union[TDestinationCallable, str] = None, # noqa: A003 + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + loader_file_format: TLoaderFileFormat = None, + batch_size: int = 10, + naming_convention: str = "direct", + spec: t.Type[GenericDestinationClientConfiguration] = GenericDestinationClientConfiguration, + **kwargs: t.Any, + ) -> None: + # resolve callable + if callable(destination_callable): + pass + elif destination_callable: + try: + module_path, attr_name = destination_callable.rsplit(".", 1) + dest_module = import_module(module_path) + except ModuleNotFoundError as e: + raise ConfigurationValueError( + f"Could not find callable module at {module_path}" + ) from e + try: + destination_callable = getattr(dest_module, attr_name) + except AttributeError as e: + raise ConfigurationValueError( + f"Could not find callable function at {destination_callable}" + ) from e + + if not callable(destination_callable): + raise ConfigurationValueError("Resolved Sink destination callable is not a callable.") + + # resolve destination name + if destination_name is None: + destination_name = get_callable_name(destination_callable) + func_module = inspect.getmodule(destination_callable) + + # build destination spec + destination_sections = (known_sections.DESTINATION, destination_name) + conf_callable = 
with_config( + destination_callable, + sections=destination_sections, + include_defaults=True, + base=spec, + ) + + # save destination in registry + resolved_spec = t.cast( + t.Type[GenericDestinationClientConfiguration], get_fun_spec(conf_callable) + ) + _DESTINATIONS[callable.__qualname__] = DestinationInfo(resolved_spec, callable, func_module) + + # remember spec + self._spec = resolved_spec or spec + super().__init__( + destination_name=destination_name, + environment=environment, + loader_file_format=loader_file_format, + batch_size=batch_size, + naming_convention=naming_convention, + destination_callable=conf_callable, + **kwargs, + ) diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 9ad174fd63..e74e87d094 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -6,6 +6,7 @@ import inspect from functools import wraps + import dlt from dlt.common.exceptions import MissingDependencyException from dlt.common import pendulum, logger diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index f6b2f3a62f..f9f318323b 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -12,7 +12,7 @@ from dlt.common.libs.pandas import pandas from dlt.pipeline import Pipeline from dlt.pipeline.exceptions import CannotRestorePipelineException, SqlClientNotAvailable -from dlt.pipeline.state_sync import load_state_from_destination +from dlt.pipeline.state_sync import load_pipeline_state_from_destination try: import streamlit as st @@ -190,7 +190,7 @@ def _query_data_live(query: str, schema_name: str = None) -> pandas.DataFrame: st.header("Pipeline state info") with pipeline.destination_client() as client: if isinstance(client, WithStateSync): - remote_state = load_state_from_destination(pipeline.pipeline_name, client) + remote_state = load_pipeline_state_from_destination(pipeline.pipeline_name, client) local_state = pipeline.state col1, col2 = st.columns(2) diff --git a/dlt/load/load.py b/dlt/load/load.py index 050e7bce67..23c3dea820 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -7,10 +7,17 @@ from dlt.common import sleep, logger from dlt.common.configuration import with_config, known_sections +from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.accessors import config -from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo -from dlt.common.schema.utils import get_top_level_table +from dlt.common.pipeline import ( + LoadInfo, + LoadMetrics, + SupportsPipeline, + WithStepInfo, +) +from dlt.common.schema.utils import get_child_tables, get_top_level_table from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_package import LoadPackageStateInjectableContext from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.runtime.logger import pretty_format_exception @@ -19,7 +26,10 @@ DestinationTerminalException, DestinationTransientException, ) +from dlt.common.configuration.container import Container + from dlt.common.schema import Schema, TSchemaTables + from dlt.common.storages import LoadStorage from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, @@ -34,6 +44,7 @@ SupportsStagingDestination, TDestination, ) +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from 
dlt.destinations.job_impl import EmptyLoadJob @@ -414,7 +425,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: failed_job.job_file_info.job_id(), failed_job.failed_message, ) - # possibly raise on too many retires + # possibly raise on too many retries if self.config.raise_on_max_retries: for new_job in package_info.jobs["new_jobs"]: r_c = new_job.job_file_info.retry_count @@ -452,12 +463,19 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: schema = self.load_storage.normalized_packages.load_schema(load_id) logger.info(f"Loaded schema name {schema.name} and version {schema.stored_version}") + container = Container() # get top load id and mark as being processed with self.collector(f"Load {schema.name} in {load_id}"): - # the same load id may be processed across multiple runs - if not self.current_load_id: - self._step_info_start_load_id(load_id) - self.load_single_package(load_id, schema) + with container.injectable_context( + LoadPackageStateInjectableContext( + storage=self.load_storage.normalized_packages, + load_id=load_id, + ) + ): + # the same load id may be processed across multiple runs + if not self.current_load_id: + self._step_info_start_load_id(load_id) + self.load_single_package(load_id, schema) return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) diff --git a/dlt/pipeline/current.py b/dlt/pipeline/current.py index 7fdc0f095c..25fd398623 100644 --- a/dlt/pipeline/current.py +++ b/dlt/pipeline/current.py @@ -2,6 +2,13 @@ from dlt.common.pipeline import source_state as _state, resource_state, get_current_pipe_name from dlt.pipeline import pipeline as _pipeline +from dlt.extract.decorators import get_source_schema +from dlt.common.storages.load_package import ( + load_package, + commit_load_package_state, + destination_state, + clear_destination_state, +) from dlt.extract.decorators import get_source_schema, get_source pipeline = _pipeline diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 185a11962a..042a62e8fb 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -47,7 +47,7 @@ ) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import DictStrStr, TFun, TSecretValue, is_optional_type +from dlt.common.typing import DictStrAny, TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner from dlt.common.storages import ( LiveSchemaStorage, @@ -126,15 +126,17 @@ ) from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import ( - STATE_ENGINE_VERSION, - bump_version_if_modified, - load_state_from_destination, - migrate_state, + PIPELINE_STATE_ENGINE_VERSION, + bump_pipeline_state_version_if_modified, + load_pipeline_state_from_destination, + migrate_pipeline_state, state_resource, json_encode_state, json_decode_state, + default_pipeline_state, ) from dlt.pipeline.warnings import credentials_argument_deprecated +from dlt.common.storages.load_package import TLoadPackageState def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: @@ -143,6 +145,7 @@ def decorator(f: TFun) -> TFun: def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # activate pipeline so right state is always provided self.activate() + # backup and restore state should_extract_state = may_extract_state and self.config.restore_from_destination with self.managed_state(extract_state=should_extract_state) as state: @@ -263,7 +266,14 @@ class 
Pipeline(SupportsPipeline): STATE_FILE: ClassVar[str] = "state.json" STATE_PROPS: ClassVar[List[str]] = list( set(get_type_hints(TPipelineState).keys()) - - {"sources", "destination_type", "destination_name", "staging_type", "staging_name"} + - { + "sources", + "destination_type", + "destination_name", + "staging_type", + "staging_name", + "destinations", + } ) LOCAL_STATE_PROPS: ClassVar[List[str]] = list(get_type_hints(TPipelineLocalState).keys()) DEFAULT_DATASET_SUFFIX: ClassVar[str] = "_dataset" @@ -438,6 +448,7 @@ def normalize( """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known.""" if is_interactive(): workers = 1 + if loader_file_format and loader_file_format in INTERNAL_LOADER_FILE_FORMATS: raise ValueError(f"{loader_file_format} is one of internal dlt file formats.") # check if any schema is present, if not then no data was extracted @@ -745,7 +756,7 @@ def sync_destination( # write the state back self._props_to_state(state) - bump_version_if_modified(state) + bump_pipeline_state_version_if_modified(state) self._save_state(state) except Exception as ex: raise PipelineStepFailed(self, "sync", None, ex, None) from ex @@ -845,6 +856,10 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: except LoadPackageNotFound: return self._get_normalize_storage().extracted_packages.get_load_package_info(load_id) + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + """Returns information on extracted/normalized/completed package with given load_id, all jobs and their statuses.""" + return self._get_load_storage().get_load_package_state(load_id) + def list_failed_jobs_in_package(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a specified `load_id`""" return self._get_load_storage().get_load_package_info(load_id).jobs.get("failed_jobs", []) @@ -1365,16 +1380,15 @@ def _get_step_info(self, step: WithStepInfo[TStepMetrics, TStepInfo]) -> TStepIn def _get_state(self) -> TPipelineState: try: state = json_decode_state(self._pipeline_storage.load(Pipeline.STATE_FILE)) - return migrate_state( - self.pipeline_name, state, state["_state_engine_version"], STATE_ENGINE_VERSION + return migrate_pipeline_state( + self.pipeline_name, + state, + state["_state_engine_version"], + PIPELINE_STATE_ENGINE_VERSION, ) except FileNotFoundError: # do not set the state hash, this will happen on first merge - return { - "_state_version": 0, - "_state_engine_version": STATE_ENGINE_VERSION, - "_local": {"first_run": True}, - } + return default_pipeline_state() # state["_version_hash"] = generate_version_hash(state) # return state @@ -1404,7 +1418,7 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]: schema = Schema(schema_name) with self._get_destination_clients(schema)[0] as job_client: if isinstance(job_client, WithStateSync): - state = load_state_from_destination(self.pipeline_name, job_client) + state = load_pipeline_state_from_destination(self.pipeline_name, job_client) if state is None: logger.info( "The state was not found in the destination" @@ -1538,7 +1552,7 @@ def _bump_version_and_extract_state( Storage will be created on demand. In that case the extracted package will be immediately committed. 
""" - _, hash_, _ = bump_version_if_modified(self._props_to_state(state)) + _, hash_, _ = bump_pipeline_state_version_if_modified(self._props_to_state(state)) should_extract = hash_ != state["_local"].get("_last_extracted_hash") if should_extract and extract_state: data = state_resource(state) diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index fa3939969b..8c72a218a4 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -1,25 +1,28 @@ -import base64 import binascii from copy import copy -import hashlib -from typing import Tuple, cast +from typing import Tuple, cast, List import pendulum import dlt from dlt.common import json -from dlt.common.pipeline import TPipelineState from dlt.common.typing import DictStrAny from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import WithStateSync, Destination from dlt.common.utils import compressed_b64decode, compressed_b64encode +from dlt.common.versioned_state import ( + generate_state_version_hash, + bump_state_version_if_modified, + default_versioned_state, +) +from dlt.common.pipeline import TPipelineState from dlt.extract import DltResource -from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException +from dlt.pipeline.exceptions import ( + PipelineStateEngineNoUpgradePathException, +) - -# allows to upgrade state when restored with a new version of state logic/schema -STATE_ENGINE_VERSION = 4 +PIPELINE_STATE_ENGINE_VERSION = 4 # state table columns STATE_TABLE_COLUMNS: TTableSchemaColumns = { @@ -57,59 +60,15 @@ def decompress_state(state_str: str) -> DictStrAny: return json.typed_loadb(state_bytes) # type: ignore[no-any-return] -def generate_version_hash(state: TPipelineState) -> str: - # generates hash out of stored schema content, excluding hash itself, version and local state - state_copy = copy(state) - state_copy.pop("_state_version", None) - state_copy.pop("_state_engine_version", None) - state_copy.pop("_version_hash", None) - state_copy.pop("_local", None) - content = json.typed_dumpb(state_copy, sort_keys=True) - h = hashlib.sha3_256(content) - return base64.b64encode(h.digest()).decode("ascii") - +def generate_pipeline_state_version_hash(state: TPipelineState) -> str: + return generate_state_version_hash(state, exclude_attrs=["_local"]) -def bump_version_if_modified(state: TPipelineState) -> Tuple[int, str, str]: - """Bumps the `state` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" - hash_ = generate_version_hash(state) - previous_hash = state.get("_version_hash") - if not previous_hash: - # if hash was not set, set it without bumping the version, that's initial schema - pass - elif hash_ != previous_hash: - state["_state_version"] += 1 - state["_version_hash"] = hash_ - return state["_state_version"], hash_, previous_hash +def bump_pipeline_state_version_if_modified(state: TPipelineState) -> Tuple[int, str, str]: + return bump_state_version_if_modified(state, exclude_attrs=["_local"]) -def state_resource(state: TPipelineState) -> DltResource: - state = copy(state) - state.pop("_local") - state_str = compress_state(state) - state_doc = { - "version": state["_state_version"], - "engine_version": state["_state_engine_version"], - "pipeline_name": state["pipeline_name"], - "state": state_str, - "created_at": pendulum.now(), - "version_hash": state["_version_hash"], - } - return dlt.resource( - [state_doc], name=STATE_TABLE_NAME, write_disposition="append", 
columns=STATE_TABLE_COLUMNS - ) - - -def load_state_from_destination(pipeline_name: str, client: WithStateSync) -> TPipelineState: - # NOTE: if dataset or table holding state does not exist, the sql_client will rise DestinationUndefinedEntity. caller must handle this - state = client.get_stored_state(pipeline_name) - if not state: - return None - s = decompress_state(state.state) - return migrate_state(pipeline_name, s, s["_state_engine_version"], STATE_ENGINE_VERSION) - - -def migrate_state( +def migrate_pipeline_state( pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int ) -> TPipelineState: if from_engine == to_engine: @@ -119,7 +78,7 @@ def migrate_state( from_engine = 2 if from_engine == 2 and to_engine > 2: # you may want to recompute hash - state["_version_hash"] = generate_version_hash(state) # type: ignore[arg-type] + state["_version_hash"] = generate_pipeline_state_version_hash(state) # type: ignore[arg-type] from_engine = 3 if from_engine == 3 and to_engine > 3: if state.get("destination"): @@ -139,3 +98,41 @@ def migrate_state( ) state["_state_engine_version"] = from_engine return cast(TPipelineState, state) + + +def state_resource(state: TPipelineState) -> DltResource: + state = copy(state) + state.pop("_local") + state_str = compress_state(state) + state_doc = { + "version": state["_state_version"], + "engine_version": state["_state_engine_version"], + "pipeline_name": state["pipeline_name"], + "state": state_str, + "created_at": pendulum.now(), + "version_hash": state["_version_hash"], + } + return dlt.resource( + [state_doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS + ) + + +def load_pipeline_state_from_destination( + pipeline_name: str, client: WithStateSync +) -> TPipelineState: + # NOTE: if dataset or table holding state does not exist, the sql_client will rise DestinationUndefinedEntity. caller must handle this + state = client.get_stored_state(pipeline_name) + if not state: + return None + s = decompress_state(state.state) + return migrate_pipeline_state( + pipeline_name, s, s["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION + ) + + +def default_pipeline_state() -> TPipelineState: + return { + **default_versioned_state(), + "_state_engine_version": PIPELINE_STATE_ENGINE_VERSION, + "_local": {"first_run": True}, + } diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md new file mode 100644 index 0000000000..e00bbdfc38 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -0,0 +1,151 @@ +--- +title: šŸ§Ŗ Destination Decorator / Reverse ETL +description: Sink function `dlt` destination for reverse ETL +keywords: [reverse etl, sink, function, decorator, destination] +--- + +# Destination decorator / Reverse ETL + +:::caution +The Destination Decorator is currently in alpha, while we think the interface is stable at this point and all is working pretty well, there still might be +small changes done or bugs found in the next weeks. +::: + +The dlt destination decorator allows you to receive all data passing through your pipeline in a simple function. This can be extremely useful for +reverse ETL, where you are pushing data back to an api. You can also use this for sending data to a queue or a simple database destination that is not +yet supported by dlt, be aware that you will have to manually handle your own migrations in this case. 
It will also allow you to simply get a path
+to the files of your normalized data, so if you need direct access to parquet or jsonl files to copy them somewhere or push them to a database,
+you can do this here too.
+
+## Install dlt for Sink / reverse ETL
+**To install dlt without additional dependencies:**
+```
+pip install dlt
+```
+
+## Setup Guide
+### 1. Initialize the dlt project
+
+Let's start by initializing a new dlt project as follows:
+
+```bash
+dlt init chess sink
+```
+> šŸ’” This command will initialize your pipeline with chess as the source and sink as the destination.
+
+The above command generates several files and directories, including `.dlt/secrets.toml`.
+
+### 2. Set up a destination function for your pipeline
+The sink destination differs from other destinations in that you do not need to provide connection credentials. Instead, you provide a function which
+gets called for all items loaded during a pipeline run or load operation. For the chess example, you can add the following lines at the top of the pipeline file.
+With the `@dlt.destination` decorator you can convert any function that accepts the items and table arguments into a dlt destination.
+
+A very simple dlt pipeline that pushes a list of items into a sink function might look like this:
+
+```python
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=10)
+def my_sink(items: TDataItems, table: TTableSchema) -> None:
+    print(table["name"])
+    print(items)
+
+pipe = dlt.pipeline("sink_pipeline", destination=my_sink)
+pipe.run([1, 2, 3], table_name="items")
+```
+
+To enable this destination decorator in your chess example, replace the line `destination='sink'` with `destination=sink` (without the quotes) to directly reference
+the sink from your pipeline constructor. Now you can run your pipeline and see the output of all the items coming from the chess pipeline in your console.
+
+:::tip
+1. You can also remove the typing information (TDataItems and TTableSchema) from this example; the typing is generally useful to know the shape of the incoming objects, though.
+2. There are a few other ways to declare sink functions for your pipeline, described below.
+:::
+
+## destination decorator function and signature
+
+The full signature of the destination decorator plus its function is the following:
+
+```python
+@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink", naming_convention="direct")
+def sink(items: TDataItems, table: TTableSchema) -> None:
+    ...
+```
+
+#### Decorator
+* The `batch_size` parameter on the destination decorator defines how many items per function call are batched together and sent as an array. If you set a batch size of `0`,
+instead of passing in actual data items, you will receive one call per load job with the path of the file as the `items` argument. You can then open and process that file
+in any way you like (see the sketch after this list).
+* The `loader_file_format` parameter on the destination decorator defines in which format files are stored in the load package before being sent to the sink function;
+this can be `jsonl` or `parquet`.
+* The `name` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator.
+* The `naming_convention` parameter on the destination decorator defines the naming convention applied to table and column names before they are passed to the sink function,
+i.e. how they are normalized. The default is `direct`, which will keep all names the same.
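+
+As an illustration of the `batch_size=0` mode described in the list above, here is a minimal, hypothetical sketch (the function name and the target directory are made up) of a sink that simply copies each load-package file it receives:
+
+```python
+import shutil
+
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.common.schema import TTableSchema
+
+@dlt.destination(batch_size=0, loader_file_format="parquet")
+def copy_files_sink(items: TDataItems, table: TTableSchema) -> None:
+    # with batch_size=0, `items` is the path to a file in the load package, not a list of rows
+    shutil.copy(items, f"/tmp/{table['name']}.parquet")
+```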
+
+#### Sink function
+* The `items` parameter on the sink function contains the items being sent into the sink function.
+* The `table` parameter contains the schema table the current call belongs to, including all table hints and columns. For example, the table name can be accessed with `table["name"]`. Keep in mind that dlt also creates special tables prefixed with `_dlt` which you may want to ignore when processing data.
+* You can also add config values and secrets to the function arguments, see below!
+
+## Adding config variables and secrets
+The destination decorator supports config and secrets variables. If you, for example, plan to connect to a service that requires an API secret or a login, you can do the following:
+
+```python
+@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink")
+def my_sink(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None:
+    ...
+```
+
+And then add the API key to your toml:
+
+```toml
+[destination.my_sink]
+api_key="some secrets"
+```
+
+## Sink destination state
+The sink destination keeps a local record of how many data items were processed. If you, for example, use the sink destination to push data items to a remote API, and this
+API becomes unavailable during the load, resulting in a failed dlt pipeline run, you can repeat the run of your pipeline at a later stage and the sink destination will continue
+where it left off. For this reason it makes sense to choose a batch size that you can process in one transaction (say, one API request or one database transaction), so that if this
+request or transaction fails repeatedly you can repeat it at the next run without pushing duplicate data to your remote location.
+
+## Concurrency
+Calls to the sink function by default will be executed on multiple threads, so you need to make sure you are not using any non-thread-safe nonlocal or global variables from outside
+your sink function. If, for whichever reason, you need to have all calls be executed from the same thread, you can set the `workers` config variable of the load step to 1 (see the config sketch at the end of this page). For performance
+reasons we recommend keeping the multithreaded approach and making sure that you, for example, are using thread-safe connection pools to a remote database or queue.
+
+## Referencing the sink function
+There are multiple ways to reference the sink function you want to use. These are:
+
+```python
+# file my_pipeline.py
+
+@dlt.destination(batch_size=10)
+def local_sink_func(items: TDataItems, table: TTableSchema) -> None:
+    ...
+
+# reference function directly
+p = dlt.pipeline(name="my_pipe", destination=local_sink_func)
+
+# fully qualified string to function location (can be used from config.toml or env vars)
+p = dlt.pipeline(name="my_pipe", destination="sink", credentials="my_pipeline.local_sink_func")
+
+# via destination reference
+p = dlt.pipeline(name="my_pipe", destination=Destination.from_reference("sink", credentials=local_sink_func, environment="staging"))
+```
+
+## Write disposition
+
+The sink destination will forward all normalized data items encountered during a pipeline run to the sink function, so there is no notion of write dispositions for the sink.
+
+## Staging support
+
+The sink destination does not support staging files in remote locations before being called at this time. If you need this feature, please let us know.
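+
+As a sketch for the `workers` setting mentioned in the Concurrency section above (assuming the standard `config.toml` layout for the load step), limiting the sink to a single thread could look like this:
+
+```toml
+[load]
+workers=1
+```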
+ diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 17907c9467..1da5205471 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -265,7 +265,7 @@ verified source. ```python load_data = pipedrive_source() # calls the source function - load_info = pipeline.run(load_info) #runs the pipeline with selected source configuration + load_info = pipeline.run(load_data) #runs the pipeline with selected source configuration print(load_info) ``` diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 821a1affad..99d081a414 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -95,9 +95,10 @@ const sidebars = { 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', 'dlt-ecosystem/destinations/athena', - 'dlt-ecosystem/destinations/motherduck', 'dlt-ecosystem/destinations/weaviate', 'dlt-ecosystem/destinations/qdrant', + 'dlt-ecosystem/destinations/destination', + 'dlt-ecosystem/destinations/motherduck' ] }, ], diff --git a/tests/cases.py b/tests/cases.py index 8653f999c6..85caec4b8d 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Any, Sequence, Tuple, Literal +from typing import Dict, List, Any, Sequence, Tuple, Literal, Union import base64 from hexbytes import HexBytes from copy import deepcopy @@ -7,7 +7,7 @@ from dlt.common import Decimal, pendulum, json from dlt.common.data_types import TDataType -from dlt.common.typing import StrAny +from dlt.common.typing import StrAny, TDataItems from dlt.common.wei import Wei from dlt.common.time import ( ensure_pendulum_datetime, @@ -161,18 +161,23 @@ def table_update_and_row( def assert_all_data_types_row( - db_row: List[Any], + db_row: Union[List[Any], TDataItems], parse_complex_strings: bool = False, allow_base64_binary: bool = False, timestamp_precision: int = 6, schema: TTableSchemaColumns = None, + expect_filtered_null_columns=False, ) -> None: # content must equal # print(db_row) schema = schema or TABLE_UPDATE_COLUMNS_SCHEMA # Include only columns requested in schema - db_mapping = {col_name: db_row[i] for i, col_name in enumerate(schema)} + if isinstance(db_row, dict): + db_mapping = db_row.copy() + else: + db_mapping = {col_name: db_row[i] for i, col_name in enumerate(schema)} + expected_rows = {key: value for key, value in TABLE_ROW_ALL_DATA_TYPES.items() if key in schema} # prepare date to be compared: convert into pendulum instance, adjust microsecond precision if "col4" in expected_rows: @@ -226,8 +231,16 @@ def assert_all_data_types_row( if "col11" in db_mapping: db_mapping["col11"] = db_mapping["col11"].isoformat() - for expected, actual in zip(expected_rows.values(), db_mapping.values()): - assert expected == actual + if expect_filtered_null_columns: + for key, expected in expected_rows.items(): + if expected is None: + assert db_mapping.get(key, None) is None + db_mapping[key] = None + + for key, expected in expected_rows.items(): + actual = db_mapping[key] + assert expected == actual, f"Expected {expected} but got {actual} for column {key}" + assert db_mapping == expected_rows diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py index 479bedb6fb..bf5c21c80f 100644 --- a/tests/cli/test_init_command.py +++ b/tests/cli/test_init_command.py @@ -186,6 +186,8 @@ def test_init_all_verified_sources_isolated(cloned_init_repo: FileStorage) 
-> No def test_init_all_destinations( destination_name: str, project_files: FileStorage, repo_dir: str ) -> None: + if destination_name == "destination": + pytest.skip("Init for generic destination not implemented yet") pipeline_name = f"generic_{destination_name}_pipeline" init_command.init_command(pipeline_name, destination_name, True, repo_dir) assert_init_files(project_files, pipeline_name, destination_name) diff --git a/tests/common/configuration/test_inject.py b/tests/common/configuration/test_inject.py index 8b9616ccd7..3e3b7f6973 100644 --- a/tests/common/configuration/test_inject.py +++ b/tests/common/configuration/test_inject.py @@ -1,11 +1,17 @@ import os from typing import Any, Dict, Optional, Type, Union import pytest - +import time, threading import dlt from dlt.common.configuration.exceptions import ConfigFieldMissingException -from dlt.common.configuration.inject import get_fun_spec, last_config, with_config +from dlt.common.configuration.inject import ( + get_fun_spec, + last_config, + with_config, + create_resolved_partial, +) +from dlt.common.configuration.container import Container from dlt.common.configuration.providers import EnvironProvider from dlt.common.configuration.providers.toml import SECRETS_TOML from dlt.common.configuration.resolve import inject_section @@ -206,6 +212,107 @@ def test_sections_like_resource(value=dlt.config.value): assert test_sections_like_resource() == "resource_style_injected" +def test_partial() -> None: + @with_config(sections=("test",)) + def test_sections(value=dlt.config.value): + return value + + # no value in scope will fail + with pytest.raises(ConfigFieldMissingException): + test_sections() + + # same for partial + with pytest.raises(ConfigFieldMissingException): + create_resolved_partial(test_sections) + + # with value in scope partial will work + os.environ["TEST__VALUE"] = "first_val" + partial = create_resolved_partial(test_sections) + + # remove the value from scope and partial will work + del os.environ["TEST__VALUE"] + assert partial() == "first_val" + + # original func wont + with pytest.raises(ConfigFieldMissingException): + test_sections() + + # partial retains value + os.environ["TEST__VALUE"] = "new_val" + assert partial() == "first_val" + assert test_sections() == "new_val" + + # new partial picks up new value + new_partial = create_resolved_partial(test_sections) + + # remove the value from scope and partial will work + del os.environ["TEST__VALUE"] + assert new_partial() == "new_val" + assert partial() == "first_val" + + +def test_base_spec() -> None: + class TestConfig(BaseConfiguration): + base_value: str + + @with_config(sections=("test",)) + def test_sections(value=dlt.config.value, base=TestConfig): + return value + + # discovered spec should derive from TestConfig + spec = get_fun_spec(test_sections) + issubclass(spec, TestConfig) + + +@pytest.mark.parametrize("lock", [False, True]) +@pytest.mark.parametrize("same_pool", [False, True]) +def test_lock_context(lock, same_pool) -> None: + # we create a slow provider to test locking + + class SlowProvider(EnvironProvider): + def get_value(self, key, hint, pipeline_name, *sections): + import time + + time.sleep(0.5) + return super().get_value(key, hint, pipeline_name, *sections) + + ctx = ConfigProvidersContext() + ctx.providers.clear() + ctx.add_provider(SlowProvider()) + + @with_config(sections=("test",), lock_context_on_injection=lock) + def test_sections(value=dlt.config.value): + return value + + os.environ["TEST__VALUE"] = "test_val" + with 
Container().injectable_context(ctx): + start = time.time() + + if same_pool: + thread_ids = ["dlt-pool-1-1", "dlt-pool-1-2"] + else: + thread_ids = ["dlt-pool-5-1", "dlt-pool-20-2"] + + # simulate threads in the same pool + thread1 = threading.Thread(target=test_sections, name=thread_ids[0]) + thread2 = threading.Thread(target=test_sections, name=thread_ids[1]) + + thread1.start() + thread2.start() + + thread1.join() + thread2.join() + + elapsed = time.time() - start + + # see wether there was any parallel execution going on + # it should only lock if we're in the same pool and we want it to lock + if lock and same_pool: + assert elapsed > 1 + else: + assert elapsed < 0.7 + + @pytest.mark.skip("not implemented") def test_inject_with_str_sections() -> None: # sections param is str not tuple diff --git a/tests/common/schema/test_coercion.py b/tests/common/schema/test_coercion.py index 922024a89b..34b62f9564 100644 --- a/tests/common/schema/test_coercion.py +++ b/tests/common/schema/test_coercion.py @@ -377,10 +377,16 @@ def test_coerce_type_complex() -> None: assert coerce_value("complex", "complex", v_list) == v_list assert coerce_value("text", "complex", v_dict) == json.dumps(v_dict) assert coerce_value("text", "complex", v_list) == json.dumps(v_list) + assert coerce_value("complex", "text", json.dumps(v_dict)) == v_dict + assert coerce_value("complex", "text", json.dumps(v_list)) == v_list + # all other coercions fail with pytest.raises(ValueError): coerce_value("binary", "complex", v_list) + with pytest.raises(ValueError): + coerce_value("complex", "text", "not a json string") + def test_coerce_type_complex_with_pua() -> None: v_dict = { @@ -395,6 +401,10 @@ def test_coerce_type_complex_with_pua() -> None: } assert coerce_value("complex", "complex", copy(v_dict)) == exp_v assert coerce_value("text", "complex", copy(v_dict)) == json.dumps(exp_v) + + # TODO: what to test for this case if at all? 
+ # assert coerce_value("complex", "text", json.dumps(v_dict)) == exp_v + # also decode recursively custom_pua_decode_nested(v_dict) # restores datetime type diff --git a/tests/common/storages/test_load_package.py b/tests/common/storages/test_load_package.py index f671ddcf32..d61029c8cf 100644 --- a/tests/common/storages/test_load_package.py +++ b/tests/common/storages/test_load_package.py @@ -1,6 +1,9 @@ import os import pytest from pathlib import Path +from os.path import join + +import dlt from dlt.common import sleep from dlt.common.schema import Schema @@ -9,6 +12,15 @@ from tests.common.storages.utils import start_loading_file, assert_package_info, load_storage from tests.utils import autouse_test_storage +from dlt.common.pendulum import pendulum +from dlt.common.configuration.container import Container +from dlt.common.storages.load_package import ( + LoadPackageStateInjectableContext, + destination_state, + load_package, + commit_load_package_state, + clear_destination_state, +) def test_is_partially_loaded(load_storage: LoadStorage) -> None: @@ -57,6 +69,83 @@ def test_save_load_schema(load_storage: LoadStorage) -> None: assert schema.stored_version == schema_copy.stored_version +def test_create_and_update_loadpackage_state(load_storage: LoadStorage) -> None: + load_storage.new_packages.create_package("copy") + state = load_storage.new_packages.get_load_package_state("copy") + assert state["_state_version"] == 0 + assert state["_version_hash"] is not None + assert state["created_at"] is not None + old_state = state.copy() + + state["new_key"] = "new_value" # type: ignore + load_storage.new_packages.save_load_package_state("copy", state) + + state = load_storage.new_packages.get_load_package_state("copy") + assert state["new_key"] == "new_value" # type: ignore + assert state["_state_version"] == 1 + assert state["_version_hash"] != old_state["_version_hash"] + # created timestamp should be conserved + assert state["created_at"] == old_state["created_at"] + + # check timestamp + time = pendulum.parse(state["created_at"]) + now = pendulum.now() + assert (now - time).in_seconds() < 2 # type: ignore + + +def test_loadpackage_state_injectable_context(load_storage: LoadStorage) -> None: + load_storage.new_packages.create_package("copy") + + container = Container() + with container.injectable_context( + LoadPackageStateInjectableContext( + storage=load_storage.new_packages, + load_id="copy", + ) + ): + # test general load package state + injected_state = load_package() + assert injected_state["state"]["_state_version"] == 0 + injected_state["state"]["new_key"] = "new_value" # type: ignore + + # not persisted yet + assert load_storage.new_packages.get_load_package_state("copy").get("new_key") is None + # commit + commit_load_package_state() + + # now it should be persisted + assert ( + load_storage.new_packages.get_load_package_state("copy").get("new_key") == "new_value" + ) + assert load_storage.new_packages.get_load_package_state("copy").get("_state_version") == 1 + + # check that second injection is the same as first + second_injected_instance = load_package() + assert second_injected_instance == injected_state + + # check scoped destination states + assert ( + load_storage.new_packages.get_load_package_state("copy").get("destination_state") + is None + ) + dstate = destination_state() + dstate["new_key"] = "new_value" + commit_load_package_state() + assert load_storage.new_packages.get_load_package_state("copy").get( + "destination_state" + ) == {"new_key": "new_value"} + + # this also 
shows up on the previously injected state + assert injected_state["state"]["destination_state"]["new_key"] == "new_value" + + # clear destination state + clear_destination_state() + assert ( + load_storage.new_packages.get_load_package_state("copy").get("destination_state") + is None + ) + + def test_job_elapsed_time_seconds(load_storage: LoadStorage) -> None: load_id, fn = start_loading_file(load_storage, "test file") # type: ignore[arg-type] fp = load_storage.normalized_packages.storage.make_full_path( @@ -119,3 +208,25 @@ def test_build_parse_job_path(load_storage: LoadStorage) -> None: with pytest.raises(ValueError): ParsedLoadJobFileName.parse("tab.id.wrong_retry.jsonl") + + +def test_migrate_to_load_package_state() -> None: + """ + Here we test that an existing load package without a state will not error + when the user upgrades to a dlt version with the state. we simulate it by + wiping the state after normalization and see wether anything breaks + """ + from dlt.destinations import dummy + + p = dlt.pipeline(pipeline_name=uniq_id(), destination=dummy(completed_prob=1)) + + p.extract([{"id": 1, "name": "dave"}], table_name="person") + p.normalize() + + # delete load package after normalization + storage = p._get_load_storage() + packaged_id = p.list_normalized_load_packages()[0] + state_path = storage.normalized_packages.get_load_package_state_path(packaged_id) + storage.storage.delete(join(LoadStorage.NORMALIZED_FOLDER, state_path)) + + p.load() diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index a7547d27e0..b93cb5b483 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -54,7 +54,17 @@ def test_import_module_by_path() -> None: def test_import_all_destinations() -> None: # this must pass without the client dependencies being imported for dest_type in ACTIVE_DESTINATIONS: - dest = Destination.from_reference(dest_type, None, dest_type + "_name", "production") + # generic destination needs a valid callable, otherwise instantiation will fail + additional_args = {} + if dest_type == "destination": + + def dest_callable(items, table) -> None: + pass + + additional_args["destination_callable"] = dest_callable + dest = Destination.from_reference( + dest_type, None, dest_type + "_name", "production", **additional_args + ) assert dest.destination_type == "dlt.destinations." 
+ dest_type assert dest.destination_name == dest_type + "_name" assert dest.config_params["environment"] == "production" diff --git a/tests/common/test_versioned_state.py b/tests/common/test_versioned_state.py new file mode 100644 index 0000000000..e1f31a8a92 --- /dev/null +++ b/tests/common/test_versioned_state.py @@ -0,0 +1,43 @@ +from dlt.common.versioned_state import ( + generate_state_version_hash, + bump_state_version_if_modified, + default_versioned_state, +) + + +def test_versioned_state() -> None: + state = default_versioned_state() + assert state["_state_version"] == 0 + assert state["_state_engine_version"] == 1 + + # first hash_ generation does not change version, attrs are not modified + version, hash_, previous_hash = bump_state_version_if_modified(state) + assert version == 0 + assert hash_ is not None + assert previous_hash is None + assert state["_version_hash"] == hash_ + + # change attr, but exclude while generating + state["foo"] = "bar" # type: ignore + version, hash_, previous_hash = bump_state_version_if_modified(state, exclude_attrs=["foo"]) + assert version == 0 + assert hash_ == previous_hash + + # now don't exclude (remember old hash_ to compare return vars) + old_hash = state["_version_hash"] + version, hash_, previous_hash = bump_state_version_if_modified(state) + assert version == 1 + assert hash_ != previous_hash + assert old_hash != hash_ + assert previous_hash == old_hash + + # messing with state engine version will not change hash_ + state["_state_engine_version"] = 5 + version, hash_, previous_hash = bump_state_version_if_modified(state) + assert version == 1 + assert hash_ == previous_hash + + # make sure state object is not modified while bumping with no effect + old_state = state.copy() + version, hash_, previous_hash = bump_state_version_if_modified(state) + assert old_state == state diff --git a/tests/destinations/test_generic_destination.py b/tests/destinations/test_generic_destination.py new file mode 100644 index 0000000000..1285e97fd3 --- /dev/null +++ b/tests/destinations/test_generic_destination.py @@ -0,0 +1,436 @@ +from typing import List, Tuple, Dict, Union, cast + +import dlt +import pytest +import pytest +import os + +from copy import deepcopy +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema +from dlt.common.data_writers.writers import TLoaderFileFormat +from dlt.common.destination.reference import Destination +from dlt.pipeline.exceptions import PipelineStepFailed +from dlt.common.utils import uniq_id +from dlt.common.exceptions import InvalidDestinationReference +from dlt.common.configuration.exceptions import ConfigFieldMissingException +from dlt.common.configuration.specs import ConnectionStringCredentials + +from tests.load.utils import ( + TABLE_ROW_ALL_DATA_TYPES, + TABLE_UPDATE_COLUMNS_SCHEMA, + assert_all_data_types_row, +) + +SUPPORTED_LOADER_FORMATS = ["parquet", "puae-jsonl"] + + +def _run_through_sink( + items: TDataItems, + loader_file_format: TLoaderFileFormat, + columns=None, + filter_dlt_tables: bool = True, + batch_size: int = 10, +) -> List[Tuple[TDataItems, TTableSchema]]: + """ + runs a list of items through the sink destination and returns colleceted calls + """ + calls: List[Tuple[TDataItems, TTableSchema]] = [] + + @dlt.destination(loader_file_format=loader_file_format, batch_size=batch_size) + def test_sink(items: TDataItems, table: TTableSchema) -> None: + nonlocal calls + if table["name"].startswith("_dlt") and filter_dlt_tables: + return + # convert pyarrow table to dict list 
here to make tests more simple downstream + if loader_file_format == "parquet": + items = items.to_pylist() # type: ignore + calls.append((items, table)) + + @dlt.resource(columns=columns, table_name="items") + def items_resource() -> TDataItems: + nonlocal items + yield items + + p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p.run([items_resource()]) + + return calls + + +@pytest.mark.parametrize("loader_file_format", SUPPORTED_LOADER_FORMATS) +def test_all_datatypes(loader_file_format: TLoaderFileFormat) -> None: + data_types = deepcopy(TABLE_ROW_ALL_DATA_TYPES) + column_schemas = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA) + + sink_calls = _run_through_sink( + [data_types, data_types, data_types], + loader_file_format, + columns=column_schemas, + batch_size=1, + ) + + # inspect result + assert len(sink_calls) == 3 + + item = sink_calls[0][0][0] + + # filter out _dlt columns + item = {k: v for k, v in item.items() if not k.startswith("_dlt")} + + # null values are not emitted + data_types = {k: v for k, v in data_types.items() if v is not None} + + assert_all_data_types_row(item, expect_filtered_null_columns=True) + + +@pytest.mark.parametrize("loader_file_format", SUPPORTED_LOADER_FORMATS) +@pytest.mark.parametrize("batch_size", [1, 10, 23]) +def test_batch_size(loader_file_format: TLoaderFileFormat, batch_size: int) -> None: + items = [{"id": i, "value": str(i)} for i in range(100)] + + sink_calls = _run_through_sink(items, loader_file_format, batch_size=batch_size) + + if batch_size == 1: + assert len(sink_calls) == 100 + # one item per call + assert sink_calls[0][0][0].items() > {"id": 0, "value": "0"}.items() + elif batch_size == 10: + assert len(sink_calls) == 10 + # ten items in first call + assert len(sink_calls[0][0]) == 10 + assert sink_calls[0][0][0].items() > {"id": 0, "value": "0"}.items() + elif batch_size == 23: + assert len(sink_calls) == 5 + # 23 items in first call + assert len(sink_calls[0][0]) == 23 + assert sink_calls[0][0][0].items() > {"id": 0, "value": "0"}.items() + + # check all items are present + all_items = set() + for call in sink_calls: + item = call[0] + for entry in item: + all_items.add(entry["value"]) + + assert len(all_items) == 100 + for i in range(100): + assert str(i) in all_items + + +global_calls: List[Tuple[TDataItems, TTableSchema]] = [] + + +def global_sink_func(items: TDataItems, table: TTableSchema) -> None: + global global_calls + if table["name"].startswith("_dlt"): + return + global_calls.append((items, table)) + + +def test_instantiation() -> None: + calls: List[Tuple[TDataItems, TTableSchema]] = [] + + # NOTE: we also test injection of config vars here + def local_sink_func(items: TDataItems, table: TTableSchema, my_val=dlt.config.value, /) -> None: + nonlocal calls + if table["name"].startswith("_dlt"): + return + assert my_val == "something" + + calls.append((items, table)) + + os.environ["DESTINATION__MY_VAL"] = "something" + + # test decorator + calls = [] + p = dlt.pipeline("sink_test", destination=dlt.destination()(local_sink_func), full_refresh=True) # type: ignore + p.run([1, 2, 3], table_name="items") + assert len(calls) == 1 + + # test passing via from_reference + calls = [] + p = dlt.pipeline( + "sink_test", + destination=Destination.from_reference("destination", destination_callable=local_sink_func), + full_refresh=True, + ) + p.run([1, 2, 3], table_name="items") + assert len(calls) == 1 + + # test passing string reference + global global_calls + global_calls = [] + p = dlt.pipeline( + "sink_test", + 
destination=Destination.from_reference( + "destination", + destination_callable="tests.destinations.test_generic_destination.global_sink_func", + ), + full_refresh=True, + ) + p.run([1, 2, 3], table_name="items") + assert len(global_calls) == 1 + + # pass None credentials reference + with pytest.raises(InvalidDestinationReference): + p = dlt.pipeline( + "sink_test", + destination=Destination.from_reference("destination", destination_callable=None), + full_refresh=True, + ) + p.run([1, 2, 3], table_name="items") + + # pass invalid credentials module + with pytest.raises(InvalidDestinationReference): + p = dlt.pipeline( + "sink_test", + destination=Destination.from_reference( + "destination", destination_callable="does.not.exist" + ), + full_refresh=True, + ) + p.run([1, 2, 3], table_name="items") + + +@pytest.mark.parametrize("loader_file_format", SUPPORTED_LOADER_FORMATS) +@pytest.mark.parametrize("batch_size", [1, 10, 23]) +def test_batched_transactions(loader_file_format: TLoaderFileFormat, batch_size: int) -> None: + calls: Dict[str, List[TDataItems]] = {} + # provoke errors on resources + provoke_error: Dict[str, int] = {} + + @dlt.destination(loader_file_format=loader_file_format, batch_size=batch_size) + def test_sink(items: TDataItems, table: TTableSchema) -> None: + nonlocal calls + table_name = table["name"] + if table_name.startswith("_dlt"): + return + + # convert pyarrow table to dict list here to make tests more simple downstream + if loader_file_format == "parquet": + items = items.to_pylist() # type: ignore + + # provoke error if configured + if table_name in provoke_error: + for item in items: + if provoke_error[table_name] == item["id"]: + raise AssertionError("Oh no!") + + calls.setdefault(table_name, []).append(items) + + @dlt.resource() + def items() -> TDataItems: + for i in range(100): + yield {"id": i, "value": str(i)} + + @dlt.resource() + def items2() -> TDataItems: + for i in range(100): + yield {"id": i, "value": str(i)} + + def assert_items_in_range(c: List[TDataItems], start: int, end: int) -> None: + """ + Ensure all items where called and no duplicates are present + """ + collected_items = set() + for call in c: + for item in call: + assert item["value"] not in collected_items + collected_items.add(item["value"]) + assert len(collected_items) == end - start + for i in range(start, end): + assert str(i) in collected_items + + # no errors are set, all items should be processed + p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + load_id = p.run([items(), items2()]).loads_ids[0] + assert_items_in_range(calls["items"], 0, 100) + assert_items_in_range(calls["items2"], 0, 100) + + # destination state should have all items + destination_state = p.get_load_package_state(load_id)["destination_state"] + values = {k.split(".")[0]: v for k, v in destination_state.items()} + assert values == {"_dlt_pipeline_state": 1, "items": 100, "items2": 100} + + # provoke errors + calls = {} + provoke_error = {"items": 25, "items2": 45} + p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + with pytest.raises(PipelineStepFailed): + p.run([items(), items2()]) + + # we should have data for one load id saved here + load_id = p.list_normalized_load_packages()[0] + destination_state = p.get_load_package_state(load_id)["destination_state"] + + # get saved indexes mapped to table (this test will only work for one job per table) + values = {k.split(".")[0]: v for k, v in destination_state.items()} + + # partly loaded, pointers in state should be 
right + if batch_size == 1: + assert_items_in_range(calls["items"], 0, 25) + assert_items_in_range(calls["items2"], 0, 45) + # one pointer for state, one for items, one for items2... + assert values == {"_dlt_pipeline_state": 1, "items": 25, "items2": 45} + elif batch_size == 10: + assert_items_in_range(calls["items"], 0, 20) + assert_items_in_range(calls["items2"], 0, 40) + assert values == {"_dlt_pipeline_state": 1, "items": 20, "items2": 40} + elif batch_size == 23: + assert_items_in_range(calls["items"], 0, 23) + assert_items_in_range(calls["items2"], 0, 23) + assert values == {"_dlt_pipeline_state": 1, "items": 23, "items2": 23} + else: + raise AssertionError("Unknown batch size") + + # load the rest + first_calls = deepcopy(calls) + provoke_error = {} + calls = {} + p.load() + + # destination state should have all items + destination_state = p.get_load_package_state(load_id)["destination_state"] + values = {k.split(".")[0]: v for k, v in destination_state.items()} + assert values == {"_dlt_pipeline_state": 1, "items": 100, "items2": 100} + + # both calls combined should have every item called just once + assert_items_in_range(calls["items"] + first_calls["items"], 0, 100) + assert_items_in_range(calls["items2"] + first_calls["items2"], 0, 100) + + +def test_naming_convention() -> None: + @dlt.resource(table_name="PErson") + def resource(): + yield [{"UpperCase": 1, "snake_case": 1, "camelCase": 1}] + + # check snake case + @dlt.destination(naming_convention="snake_case") + def snake_sink(items, table): + if table["name"].startswith("_dlt"): + return + assert table["name"] == "p_erson" + assert table["columns"]["upper_case"]["name"] == "upper_case" + assert table["columns"]["snake_case"]["name"] == "snake_case" + assert table["columns"]["camel_case"]["name"] == "camel_case" + + dlt.pipeline("sink_test", destination=snake_sink, full_refresh=True).run(resource()) + + # check default (which is direct) + @dlt.destination() + def direct_sink(items, table): + if table["name"].startswith("_dlt"): + return + assert table["name"] == "PErson" + assert table["columns"]["UpperCase"]["name"] == "UpperCase" + assert table["columns"]["snake_case"]["name"] == "snake_case" + assert table["columns"]["camelCase"]["name"] == "camelCase" + + dlt.pipeline("sink_test", destination=direct_sink, full_refresh=True).run(resource()) + + +def test_file_batch() -> None: + @dlt.resource(table_name="person") + def resource1(): + for i in range(100): + yield [{"id": i, "name": f"Name {i}"}] + + @dlt.resource(table_name="address") + def resource2(): + for i in range(50): + yield [{"id": i, "city": f"City {i}"}] + + @dlt.destination(batch_size=0, loader_file_format="parquet") + def direct_sink(file_path, table): + if table["name"].startswith("_dlt"): + return + from dlt.common.libs.pyarrow import pyarrow + + assert table["name"] in ["person", "address"] + + with pyarrow.parquet.ParquetFile(file_path) as reader: + assert reader.metadata.num_rows == (100 if table["name"] == "person" else 50) + + dlt.pipeline("sink_test", destination=direct_sink, full_refresh=True).run( + [resource1(), resource2()] + ) + + +def test_config_spec() -> None: + # NOTE: define the destination before the env var to test env vars are evaluated + # at runtime + @dlt.destination() + def my_sink(file_path, table, my_val=dlt.config.value): + assert my_val == "something" + + print(my_sink) + + # if no value is present, it should raise + with pytest.raises(ConfigFieldMissingException): + dlt.pipeline("sink_test", destination=my_sink, 
full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # we may give the value via __callable__ function + dlt.pipeline("sink_test", destination=my_sink(my_val="something"), full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # right value will pass + os.environ["DESTINATION__MY_SINK__MY_VAL"] = "something" + dlt.pipeline("sink_test", destination=my_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # wrong value will raise + os.environ["DESTINATION__MY_SINK__MY_VAL"] = "wrong" + with pytest.raises(PipelineStepFailed): + dlt.pipeline("sink_test", destination=my_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # will respect given name + @dlt.destination(name="some_name") + def other_sink(file_path, table, my_val=dlt.config.value): + assert my_val == "something" + + # if no value is present, it should raise + with pytest.raises(ConfigFieldMissingException): + dlt.pipeline("sink_test", destination=other_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # right value will pass + os.environ["DESTINATION__SOME_NAME__MY_VAL"] = "something" + dlt.pipeline("sink_test", destination=other_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # test nested spec + + @dlt.destination() + def my_gcp_sink( + file_path, + table, + credentials: ConnectionStringCredentials = dlt.secrets.value, + ): + assert credentials.drivername == "my_driver" + assert credentials.database == "my_database" + assert credentials.username == "my_user_name" + + # missing spec + with pytest.raises(ConfigFieldMissingException): + dlt.pipeline("sink_test", destination=my_gcp_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) + + # add gcp vars (in different sections for testing) + os.environ["SINK_TEST__DESTINATION__CREDENTIALS__DRIVERNAME"] = "my_driver" + os.environ["DESTINATION__CREDENTIALS__DATABASE"] = "my_database" + os.environ["CREDENTIALS__USERNAME"] = "my_user_name" + + # now it will run + dlt.pipeline("sink_test", destination=my_gcp_sink, full_refresh=True).run( + [1, 2, 3], table_name="items" + ) diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index cd18454d7c..8614af4734 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -106,7 +106,9 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: """Verify stored destination state matches the local pipeline state""" client: SqlJobClientBase with pipeline.destination_client() as client: # type: ignore[assignment] - destination_state = state_sync.load_state_from_destination(pipeline.pipeline_name, client) + destination_state = state_sync.load_pipeline_state_from_destination( + pipeline.pipeline_name, client + ) pipeline_state = dict(pipeline.state) del pipeline_state["_local"] assert pipeline_state == destination_state diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 5ef2206031..02da91cefe 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -13,7 +13,11 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from dlt.pipeline.pipeline import Pipeline -from dlt.pipeline.state_sync import STATE_TABLE_COLUMNS, load_state_from_destination, state_resource +from dlt.pipeline.state_sync import ( + STATE_TABLE_COLUMNS, + load_pipeline_state_from_destination, + state_resource, +) from dlt.destinations.job_client_impl import SqlJobClientBase from tests.utils import TEST_STORAGE_ROOT 
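The new `tests/destinations/test_generic_destination.py` suite above revolves around the `@dlt.destination` decorator. For orientation, here is a minimal sketch of that pattern — the sink and pipeline names are illustrative, and the decorator arguments simply mirror values exercised in those tests:

```py
import dlt
from dlt.common.typing import TDataItems
from dlt.common.schema import TTableSchema

# a custom destination is just a function that receives batches of items plus the table schema
@dlt.destination(batch_size=10, loader_file_format="puae-jsonl")
def print_sink(items: TDataItems, table: TTableSchema) -> None:
    # skip dlt internal tables, as the tests above do
    if table["name"].startswith("_dlt"):
        return
    print(f"got {len(items)} rows for table {table['name']}")

# full_refresh=True gives a throwaway dataset, matching the test setup
pipeline = dlt.pipeline("print_sink_example", destination=print_sink, full_refresh=True)
pipeline.run([{"id": 1}, {"id": 2}, {"id": 3}], table_name="items")
```

With the `puae-jsonl` file format the sink receives plain lists of dictionaries, which is why the tests above only call `to_pylist()` for the parquet variant.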
@@ -54,14 +58,14 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - job_client: SqlJobClientBase with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] with pytest.raises(DestinationUndefinedEntity): - load_state_from_destination(p.pipeline_name, job_client) + load_pipeline_state_from_destination(p.pipeline_name, job_client) # sync the schema p.sync_schema() exists, _ = job_client.get_storage_table(schema.version_table_name) assert exists is True # dataset exists, still no table with pytest.raises(DestinationUndefinedEntity): - load_state_from_destination(p.pipeline_name, job_client) + load_pipeline_state_from_destination(p.pipeline_name, job_client) initial_state = p._get_state() # now add table to schema and sync initial_state["_local"]["_last_extracted_at"] = pendulum.now() @@ -84,14 +88,14 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - exists, _ = job_client.get_storage_table(schema.state_table_name) assert exists is True # table is there but no state - assert load_state_from_destination(p.pipeline_name, job_client) is None + assert load_pipeline_state_from_destination(p.pipeline_name, job_client) is None # extract state with p.managed_state(extract_state=True): pass # just run the existing extract p.normalize(loader_file_format=destination_config.file_format) p.load() - stored_state = load_state_from_destination(p.pipeline_name, job_client) + stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) local_state = p._get_state() local_state.pop("_local") assert stored_state == local_state @@ -101,7 +105,7 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - managed_state["sources"] = {"source": dict(JSON_TYPED_DICT_DECODED)} p.normalize(loader_file_format=destination_config.file_format) p.load() - stored_state = load_state_from_destination(p.pipeline_name, job_client) + stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) assert stored_state["sources"] == {"source": JSON_TYPED_DICT_DECODED} local_state = p._get_state() local_state.pop("_local") @@ -116,7 +120,7 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - p.normalize(loader_file_format=destination_config.file_format) info = p.load() assert len(info.loads_ids) == 0 - new_stored_state = load_state_from_destination(p.pipeline_name, job_client) + new_stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) # new state should not be stored assert new_stored_state == stored_state @@ -147,7 +151,7 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - p.normalize(loader_file_format=destination_config.file_format) info = p.load() assert len(info.loads_ids) == 1 - new_stored_state_2 = load_state_from_destination(p.pipeline_name, job_client) + new_stored_state_2 = load_pipeline_state_from_destination(p.pipeline_name, job_client) # the stored state changed to next version assert new_stored_state != new_stored_state_2 assert new_stored_state["_state_version"] + 1 == new_stored_state_2["_state_version"] @@ -405,7 +409,7 @@ def complete_package_mock(self, load_id: str, schema: Schema, aborted: bool = Fa job_client: SqlJobClientBase with p._get_destination_clients(p.default_schema)[0] as job_client: # type: ignore[assignment] # state without completed load id is not visible - state = load_state_from_destination(pipeline_name, job_client) + state = 
load_pipeline_state_from_destination(pipeline_name, job_client) assert state is None diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index ed378191e6..1b2a74fcb8 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -79,6 +79,8 @@ def delete_classes(p, class_list): def drop_active_pipeline_data() -> None: def schema_has_classes(client): + if not hasattr(client, "db_client"): + return None schema = client.db_client.schema.get() return schema["classes"] diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 0cebeb2ff7..272f57d966 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1366,11 +1366,11 @@ def test_resource_state_name_not_normalized() -> None: pipeline.load() # get state from destination - from dlt.pipeline.state_sync import load_state_from_destination + from dlt.pipeline.state_sync import load_pipeline_state_from_destination client: WithStateSync with pipeline.destination_client() as client: # type: ignore[assignment] - state = load_state_from_destination(pipeline.pipeline_name, client) + state = load_pipeline_state_from_destination(pipeline.pipeline_name, client) assert "airtable_emojis" in state["sources"] assert state["sources"]["airtable_emojis"]["resources"] == {"šŸ¦šPeacock": {"šŸ¦ššŸ¦ššŸ¦š": "šŸ¦š"}} diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index ee788367e1..f0bcda2717 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -14,7 +14,11 @@ from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException, PipelineStepFailed from dlt.pipeline.pipeline import Pipeline -from dlt.pipeline.state_sync import generate_version_hash, migrate_state, STATE_ENGINE_VERSION +from dlt.pipeline.state_sync import ( + generate_pipeline_state_version_hash, + migrate_pipeline_state, + PIPELINE_STATE_ENGINE_VERSION, +) from tests.utils import test_storage from tests.pipeline.utils import json_case_path, load_json_case @@ -482,21 +486,21 @@ def transform(item): ) -def test_migrate_state(test_storage: FileStorage) -> None: +def test_migrate_pipeline_state(test_storage: FileStorage) -> None: # test generation of version hash on migration to v3 state_v1 = load_json_case("state/state.v1") - state = migrate_state("test_pipeline", state_v1, state_v1["_state_engine_version"], 3) + state = migrate_pipeline_state("test_pipeline", state_v1, state_v1["_state_engine_version"], 3) assert state["_state_engine_version"] == 3 assert "_local" in state assert "_version_hash" in state - assert state["_version_hash"] == generate_version_hash(state) + assert state["_version_hash"] == generate_pipeline_state_version_hash(state) # full migration state_v1 = load_json_case("state/state.v1") - state = migrate_state( - "test_pipeline", state_v1, state_v1["_state_engine_version"], STATE_ENGINE_VERSION + state = migrate_pipeline_state( + "test_pipeline", state_v1, state_v1["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION ) - assert state["_state_engine_version"] == STATE_ENGINE_VERSION + assert state["_state_engine_version"] == PIPELINE_STATE_ENGINE_VERSION # check destination migration assert state["destination_name"] == "postgres" @@ -505,12 +509,15 @@ def test_migrate_state(test_storage: FileStorage) -> None: with pytest.raises(PipelineStateEngineNoUpgradePathException) as py_ex: state_v1 = load_json_case("state/state.v1") - migrate_state( - "test_pipeline", state_v1, 
state_v1["_state_engine_version"], STATE_ENGINE_VERSION + 1 + migrate_pipeline_state( + "test_pipeline", + state_v1, + state_v1["_state_engine_version"], + PIPELINE_STATE_ENGINE_VERSION + 1, ) assert py_ex.value.init_engine == state_v1["_state_engine_version"] - assert py_ex.value.from_engine == STATE_ENGINE_VERSION - assert py_ex.value.to_engine == STATE_ENGINE_VERSION + 1 + assert py_ex.value.from_engine == PIPELINE_STATE_ENGINE_VERSION + assert py_ex.value.to_engine == PIPELINE_STATE_ENGINE_VERSION + 1 # also test pipeline init where state is old test_storage.create_folder("debug_pipeline") @@ -522,7 +529,7 @@ def test_migrate_state(test_storage: FileStorage) -> None: assert p.dataset_name == "debug_pipeline_data" assert p.default_schema_name == "example_source" state = p.state - assert state["_version_hash"] == generate_version_hash(state) + assert state["_version_hash"] == generate_pipeline_state_version_hash(state) # specifically check destination v3 to v4 migration state_v3 = { @@ -530,8 +537,8 @@ def test_migrate_state(test_storage: FileStorage) -> None: "staging": "dlt.destinations.filesystem", "_state_engine_version": 3, } - migrate_state( - "test_pipeline", state_v3, state_v3["_state_engine_version"], STATE_ENGINE_VERSION # type: ignore + migrate_pipeline_state( + "test_pipeline", state_v3, state_v3["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION # type: ignore ) assert state_v3["destination_name"] == "redshift" assert state_v3["destination_type"] == "dlt.destinations.redshift" @@ -544,8 +551,8 @@ def test_migrate_state(test_storage: FileStorage) -> None: "destination": "dlt.destinations.redshift", "_state_engine_version": 3, } - migrate_state( - "test_pipeline", state_v3, state_v3["_state_engine_version"], STATE_ENGINE_VERSION # type: ignore + migrate_pipeline_state( + "test_pipeline", state_v3, state_v3["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION # type: ignore ) assert state_v3["destination_name"] == "redshift" assert state_v3["destination_type"] == "dlt.destinations.redshift" @@ -554,8 +561,8 @@ def test_migrate_state(test_storage: FileStorage) -> None: assert "staging_type" not in state_v3 state_v3 = {"destination": None, "staging": None, "_state_engine_version": 3} - migrate_state( - "test_pipeline", state_v3, state_v3["_state_engine_version"], STATE_ENGINE_VERSION # type: ignore + migrate_pipeline_state( + "test_pipeline", state_v3, state_v3["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION # type: ignore ) assert "destination_name" not in state_v3 assert "destination_type" not in state_v3 @@ -563,8 +570,8 @@ def test_migrate_state(test_storage: FileStorage) -> None: assert "staging_type" not in state_v3 state_v3 = {"_state_engine_version": 2} - migrate_state( - "test_pipeline", state_v3, state_v3["_state_engine_version"], STATE_ENGINE_VERSION # type: ignore + migrate_pipeline_state( + "test_pipeline", state_v3, state_v3["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION # type: ignore ) assert "destination_name" not in state_v3 assert "destination_type" not in state_v3 diff --git a/tests/utils.py b/tests/utils.py index dd03279def..924f44de73 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -45,10 +45,11 @@ "motherduck", "mssql", "qdrant", + "destination", "synapse", "databricks", } -NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant"} +NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant", "destination"} SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS # exclude 
destination configs (for now used for athena and athena iceberg separation) From 3a007a427aeff8fe79ec077d1a04aff786fdbe66 Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Thu, 14 Mar 2024 20:54:44 +0400 Subject: [PATCH 10/27] docs(airflow): add description of new decompose methods (#1072) --- .../deploy-with-airflow-composer.md | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md index 365f6747dc..e30659bc60 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md @@ -155,8 +155,41 @@ def load_data(): ) # Create the source, the "serialize" decompose option # will convert dlt resources into Airflow tasks. - # Use "none" to disable it - tasks.add_run(pipeline, source(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + # Use "none" to disable it. + tasks.add_run( + pipeline, + source(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True + ) + + # The "parallel" decompose option will convert dlt + # resources into parallel Airflow tasks, except the + # first one, which will be executed before any other tasks. + # All the tasks will be executed in the same pipeline state. + # tasks.add_run( + # pipeline, + # source(), + # decompose="parallel", + # trigger_rule="all_done", + # retries=0, + # provide_context=True + # ) + + # The "parallel-isolated" decompose option will convert dlt + # resources into parallel Airflow tasks, except the + # first one, which will be executed before any other tasks. + # In this mode, all the tasks will use separate pipeline states. + # tasks.add_run( + # pipeline, + # source(), + # decompose="parallel-isolated", + # trigger_rule="all_done", + # retries=0, + # provide_context=True + # ) load_data() ``` From 5e0b8b44b1a24ffdce9d7f01a715225567d59ba8 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 15 Mar 2024 19:47:13 +0300 Subject: [PATCH 11/27] Clarify process for enhancements and bug fixes (#1096) --- CONTRIBUTING.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index afd0a00d4a..895ad08229 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,6 +12,15 @@ Thank you for considering contributing to **dlt**! We appreciate your help in ma 6. [Publishing (Maintainers Only)](#publishing-maintainers-only) 7. [Resources](#resources) +## Before You Begin + +- **Proposing significant changes or enhancements**: If you're thinking about making significant changes, make sure to [submit an issue](https://github.com/dlt-hub/dlt/issues/new/choose) first. This ensures your efforts align with the project's direction and that you don't invest time on a feature that may not be merged. + +- **Fixing bugs**: + - **Check existing issues**: search [open issues](https://github.com/dlt-hub/dlt/issues) to see if the bug you've found is already reported. + - If **not reported**, [create a new issue](https://github.com/dlt-hub/dlt/issues/new/choose). You're more than welcome to fix it and submit a pull request with your solution. Thank you! + - If the bug is **already reported**, please leave a comment on that issue stating you're working on fixing it. This helps keep everyone updated and avoids duplicate efforts. 
+ ## Getting Started To get started, follow these steps: From ce701b549999135e32f2f593594fd48e734560ec Mon Sep 17 00:00:00 2001 From: David Scharf Date: Mon, 18 Mar 2024 13:19:06 +0100 Subject: [PATCH 12/27] check embedded code blocks (#1093) * first version of embedded snippets check * add missing code block types where needed * small change to snippets script * fix all parser problems in code blocks * add better error messages and add check to ci * add linting of embedded snippets * small improvement for snippets linting * remove one ignored error code * add ruff dep * add mypy (comment out for now) * fix bug in script * ignore lint setup for embedded snippets * fix linting and small mypy adjustments * switches from shell to sh as shell block type * make snippet checker code nicer * small script changes and readme * add lint and type check count --- Makefile | 5 +- docs/tools/README.md | 37 + docs/tools/__init__.py | 0 docs/tools/check_embedded_snippets.py | 332 + docs/tools/lint_setup/.gitignore | 1 + docs/tools/lint_setup/__init__.py | 0 docs/tools/lint_setup/template.py | 35 + docs/tools/mypy.ini | 4 + docs/tools/ruff.toml | 2 + .../blog/2023-10-09-dlt-ops-startups.md | 2 +- ...01-15-dlt-dbt-runner-on-cloud-functions.md | 4 +- .../website/docs/build-a-pipeline-tutorial.md | 20 +- .../docs/dlt-ecosystem/destinations/athena.md | 8 +- .../dlt-ecosystem/destinations/bigquery.md | 14 +- .../dlt-ecosystem/destinations/databricks.md | 10 +- .../dlt-ecosystem/destinations/destination.md | 12 +- .../docs/dlt-ecosystem/destinations/duckdb.md | 16 +- .../dlt-ecosystem/destinations/filesystem.md | 8 +- .../dlt-ecosystem/destinations/motherduck.md | 12 +- .../docs/dlt-ecosystem/destinations/mssql.md | 10 +- .../dlt-ecosystem/destinations/postgres.md | 14 +- .../docs/dlt-ecosystem/destinations/qdrant.md | 20 +- .../dlt-ecosystem/destinations/redshift.md | 10 +- .../dlt-ecosystem/destinations/snowflake.md | 18 +- .../dlt-ecosystem/destinations/synapse.md | 12 +- .../dlt-ecosystem/destinations/weaviate.md | 22 +- .../file-formats/insert-format.md | 2 +- .../docs/dlt-ecosystem/file-formats/jsonl.md | 2 +- .../dlt-ecosystem/file-formats/parquet.md | 4 +- docs/website/docs/dlt-ecosystem/staging.md | 4 +- .../dlt-ecosystem/transformations/dbt/dbt.md | 2 +- .../transformations/dbt/dbt_cloud.md | 8 +- .../dlt-ecosystem/transformations/pandas.md | 2 +- .../docs/dlt-ecosystem/transformations/sql.md | 10 +- .../verified-sources/airtable.md | 26 +- .../verified-sources/amazon_kinesis.md | 21 +- .../verified-sources/arrow-pandas.md | 8 +- .../dlt-ecosystem/verified-sources/asana.md | 24 +- .../dlt-ecosystem/verified-sources/chess.md | 24 +- .../verified-sources/facebook_ads.md | 39 +- .../verified-sources/filesystem.md | 52 +- .../dlt-ecosystem/verified-sources/github.md | 27 +- .../verified-sources/google_analytics.md | 26 +- .../verified-sources/google_sheets.md | 48 +- .../dlt-ecosystem/verified-sources/hubspot.md | 30 +- .../dlt-ecosystem/verified-sources/inbox.md | 26 +- .../dlt-ecosystem/verified-sources/jira.md | 24 +- .../dlt-ecosystem/verified-sources/kafka.md | 19 +- .../dlt-ecosystem/verified-sources/matomo.md | 30 +- .../dlt-ecosystem/verified-sources/mongodb.md | 38 +- .../dlt-ecosystem/verified-sources/mux.md | 28 +- .../dlt-ecosystem/verified-sources/notion.md | 19 +- .../verified-sources/personio.md | 25 +- .../verified-sources/pipedrive.md | 28 +- .../verified-sources/salesforce.md | 23 +- .../dlt-ecosystem/verified-sources/shopify.md | 25 +- .../dlt-ecosystem/verified-sources/slack.md | 31 +- 
.../verified-sources/sql_database.md | 38 +- .../dlt-ecosystem/verified-sources/strapi.md | 17 +- .../dlt-ecosystem/verified-sources/stripe.md | 29 +- .../verified-sources/workable.md | 28 +- .../dlt-ecosystem/verified-sources/zendesk.md | 50 +- .../visualizations/exploring-the-data.md | 10 +- .../docs/examples/chess_production/index.md | 2 +- .../docs/examples/google_sheets/index.md | 2 +- .../docs/examples/nested_data/index.md | 2 +- .../docs/examples/pdf_to_weaviate/index.md | 2 +- .../docs/examples/qdrant_zendesk/index.md | 6 +- .../credentials/config_providers.md | 4 +- .../general-usage/credentials/config_specs.md | 34 +- .../credentials/configuration.md | 104 +- .../pseudonymizing_columns.md | 2 +- .../customising-pipelines/removing_columns.md | 10 +- .../customising-pipelines/renaming_columns.md | 2 +- .../currency_conversion_data_enrichment.md | 16 +- .../url-parser-data-enrichment.md | 68 +- .../user_agent_device_data_enrichment.md | 88 +- .../website/docs/general-usage/destination.md | 2 +- .../docs/general-usage/full-loading.md | 2 +- .../docs/general-usage/incremental-loading.md | 64 +- docs/website/docs/general-usage/pipeline.md | 10 +- docs/website/docs/general-usage/resource.md | 50 +- .../docs/general-usage/schema-contracts.md | 10 +- docs/website/docs/general-usage/schema.md | 4 +- docs/website/docs/general-usage/source.md | 26 +- docs/website/docs/general-usage/state.md | 2 +- docs/website/docs/getting-started.md | 12 +- .../docs/reference/command-line-interface.md | 36 +- docs/website/docs/reference/installation.md | 18 +- docs/website/docs/reference/performance.md | 12 +- docs/website/docs/reference/telemetry.md | 6 +- .../docs/running-in-production/alerting.md | 2 +- .../docs/running-in-production/monitoring.md | 10 +- .../docs/running-in-production/running.md | 32 +- .../docs/running-in-production/tracing.md | 2 +- .../docs/tutorial/grouping-resources.md | 22 +- .../docs/tutorial/load-data-from-an-api.md | 8 +- .../walkthroughs/add-a-verified-source.md | 20 +- .../docs/walkthroughs/add_credentials.md | 2 +- .../docs/walkthroughs/adjust-a-schema.md | 6 +- .../docs/walkthroughs/create-a-pipeline.md | 20 +- .../walkthroughs/create-new-destination.md | 6 +- .../deploy-gcp-cloud-function-as-webhook.md | 6 +- .../deploy-with-airflow-composer.md | 48 +- .../deploy-with-github-actions.md | 10 +- .../deploy-with-google-cloud-functions.md | 6 +- .../dispatch-to-multiple-tables.md | 8 +- .../docs/walkthroughs/run-a-pipeline.md | 22 +- .../docs/walkthroughs/share-a-dataset.md | 18 +- .../docs/walkthroughs/zendesk-weaviate.md | 20 +- poetry.lock | 8310 +++++++++-------- pyproject.toml | 6 +- 112 files changed, 5660 insertions(+), 4995 deletions(-) create mode 100644 docs/tools/README.md create mode 100644 docs/tools/__init__.py create mode 100644 docs/tools/check_embedded_snippets.py create mode 100644 docs/tools/lint_setup/.gitignore create mode 100644 docs/tools/lint_setup/__init__.py create mode 100644 docs/tools/lint_setup/template.py create mode 100644 docs/tools/mypy.ini create mode 100644 docs/tools/ruff.toml diff --git a/Makefile b/Makefile index 5aa2b2786c..4cc19f1ae5 100644 --- a/Makefile +++ b/Makefile @@ -60,8 +60,9 @@ format: # poetry run isort ./ test-and-lint-snippets: - poetry run mypy --config-file mypy.ini docs/website docs/examples - poetry run flake8 --max-line-length=200 docs/website docs/examples + cd docs/tools && poetry run python check_embedded_snippets.py full + poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools --exclude 
docs/tools/lint_setup + poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools cd docs/website/docs && poetry run pytest --ignore=node_modules lint-security: diff --git a/docs/tools/README.md b/docs/tools/README.md new file mode 100644 index 0000000000..78fd0aff43 --- /dev/null +++ b/docs/tools/README.md @@ -0,0 +1,37 @@ +# DLT docs tools + +## `check_embedded_snippets.py` +This script find's all embedded snippets in our docs, extracts them and performs the following check: + +* Snippet must have a valid language set, e.g. ```py +* Snippet must be parseable (works for py, toml, yaml and json snippets) +* Snippet must pass linting (works for py) +* Coming soon: snippet must pass type checking + +This script is run on CI to ensure code quality in our docs. + +### Usage + +```sh +# Run a full check on all snippets +python check_embedded_snippets.py full + +# Show all available commands and arguments for this script +python check_embedded_snippets.py --help + +# Only run the linting stage +python check_embedded_snippets.py lint + +# Run all stages but only for snippets in files that have the string "walkthrough" in the filepath +# you will probably be using this a lot when working on one doc page +python check_embedded_snippets.py full -f walkthrough + +# Run the parsing stage, but only on snippets 49, 345 and 789 +python check_embedded_snippets.py parse -s 49,345,789 + +# run all checks but with a bit more output to the terminal +python check_embedded_snippets.py full -v +``` + +### Snippet numbers +Each snippet will be assigned an index in the order it is encountered. This is useful during creation of new snippets in the docs to selectively only run a few snippets. These numbers will change as snippets are inserted into the docs. diff --git a/docs/tools/__init__.py b/docs/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py new file mode 100644 index 0000000000..663166d0c0 --- /dev/null +++ b/docs/tools/check_embedded_snippets.py @@ -0,0 +1,332 @@ +""" +Walks through all markdown files, finds all code snippets, and checks wether they are parseable. +""" +from typing import List, Dict, Optional + +import os, ast, json, yaml, tomlkit, subprocess, argparse # noqa: I251 +from dataclasses import dataclass +from textwrap import dedent + +import dlt.cli.echo as fmt + +DOCS_DIR = "../website/docs" + +SNIPPET_MARKER = "```" +ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"] + +LINT_TEMPLATE = "./lint_setup/template.py" +LINT_FILE = "./lint_setup/lint_me.py" + +ENABLE_MYPY = False + + +@dataclass +class Snippet: + index: int + language: str + code: str + file: str + line: int + + def __str__(self) -> str: + return ( + f"Snippet No. {self.index} in {self.file} at line {self.line} with language" + f" {self.language}" + ) + + +def collect_markdown_files(verbose: bool) -> List[str]: + """ + Discovers all docs markdown files + """ + markdown_files: List[str] = [] + for path, _, files in os.walk(DOCS_DIR): + if "api_reference" in path: + continue + if "jaffle_shop" in path: + continue + for file in files: + if file.endswith(".md"): + markdown_files.append(os.path.join(path, file)) + if verbose: + fmt.echo(f"Discovered {os.path.join(path, file)}") + + if len(markdown_files) < 50: # sanity check + fmt.error("Found too few files. 
Something went wrong.") + exit(1) + + fmt.note(f"Discovered {len(markdown_files)} markdown files") + + return markdown_files + + +def collect_snippets(markdown_files: List[str], verbose: bool) -> List[Snippet]: + """ + Extract all snippets from markdown files + """ + snippets: List[Snippet] = [] + index = 0 + for file in markdown_files: + # go line by line and find all code blocks + with open(file, "r", encoding="utf-8") as f: + current_snippet: Snippet = None + lint_count = 0 + for line in f.readlines(): + lint_count += 1 + if line.strip().startswith(SNIPPET_MARKER): + if current_snippet: + # process snippet + snippets.append(current_snippet) + current_snippet.code = dedent(current_snippet.code) + current_snippet = None + else: + # start new snippet + index += 1 + current_snippet = Snippet( + index=index, + language=line.strip().split(SNIPPET_MARKER)[1] or "unknown", + code="", + file=file, + line=lint_count, + ) + elif current_snippet: + current_snippet.code += line + assert not current_snippet, ( + "It seems that the last snippet in the file was not closed. Please check the file " + + file + ) + + fmt.note(f"Discovered {len(snippets)} snippets") + if verbose: + for lang in ALLOWED_LANGUAGES: + lang_count = len([s for s in snippets if s.language == lang]) + fmt.echo(f"Found {lang_count} snippets marked as {lang}") + if len(snippets) < 100: # sanity check + fmt.error("Found too few snippets. Something went wrong.") + exit(1) + return snippets + + +def filter_snippets(snippets: List[Snippet], files: str, snippet_numbers: str) -> List[Snippet]: + """ + Filter out snippets based on file or snippet number + """ + fmt.secho(fmt.bold("Filtering Snippets")) + filtered_snippets: List[Snippet] = [] + filtered_count = 0 + for snippet in snippets: + if files and (files not in snippet.file): + filtered_count += 1 + continue + elif snippet_numbers and (str(snippet.index) not in snippet_numbers): + filtered_count += 1 + continue + filtered_snippets.append(snippet) + if filtered_count: + fmt.note( + f"{filtered_count} Snippets skipped based on file and snippet number settings." + f" {len(filtered_snippets)} snippets remaining." + ) + else: + fmt.note("0 Snippets skipped based on file and snippet number settings") + + if len(filtered_snippets) == 0: # sanity check + fmt.error("No snippets remaining after filter, nothing to do.") + exit(1) + return filtered_snippets + + +def check_language(snippets: List[Snippet]) -> None: + """ + Check if the language is allowed + """ + fmt.secho(fmt.bold("Checking snippets language settings")) + failed_count = 0 + for snippet in snippets: + if snippet.language not in ALLOWED_LANGUAGES: + fmt.warning(f"{str(snippet)} has an invalid language {snippet.language} setting.") + failed_count += 1 + + if failed_count: + fmt.error(f"""\ +Found {failed_count} snippets with invalid language settings. +* Please choose the correct language for your snippets: {ALLOWED_LANGUAGES}" +* All sh commands, except for windows (bat), should be marked as sh. 
+* All code blocks that are not a specific (markup-) language should be marked as text.\ +""") + exit(1) + else: + fmt.note("All snippets have valid language settings") + + +def clear(): + fmt.echo("\r" + " " * 200 + "\r", nl=False) + + +def parse_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + Parse all snippets with the respective parser library + """ + fmt.secho(fmt.bold("Parsing snippets")) + failed_count = 0 + for snippet in snippets: + # parse snippet by type + clear() + fmt.echo(f"\rParsing {snippet}", nl=False) + try: + if snippet.language == "py": + ast.parse(snippet.code) + elif snippet.language == "toml": + tomlkit.loads(snippet.code) + elif snippet.language == "json": + json.loads(snippet.code) + elif snippet.language == "yaml": + yaml.safe_load(snippet.code) + # ignore text and sh scripts + elif snippet.language in ["text", "sh", "bat", "sql"]: + pass + else: + raise ValueError(f"Unknown language {snippet.language}") + except Exception as exc: + clear() + fmt.warning(f"Failed to parse {str(snippet)}") + fmt.echo(exc) + failed_count += 1 + + clear() + if failed_count: + fmt.error(f"Failed to parse {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets could be parsed") + + +def prepare_for_linting(snippet: Snippet) -> None: + """ + Prepare the lintme file with the snippet code and the template header + """ + with open(LINT_TEMPLATE, "r", encoding="utf-8") as f: + lint_template = f.read() + with open(LINT_FILE, "w", encoding="utf-8") as f: + f.write(lint_template) + f.write("# Snippet start\n\n") + f.write(snippet.code) + + +def lint_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + Lint all python snippets with ruff + """ + fmt.secho(fmt.bold("Linting Python snippets")) + failed_count = 0 + count = 0 + for snippet in snippets: + count += 1 + prepare_for_linting(snippet) + result = subprocess.run(["ruff", "check", LINT_FILE], capture_output=True, text=True) + clear() + fmt.echo(f"\rLinting {snippet} ({count} of {len(snippets)})", nl=False) + if "error" in result.stdout.lower(): + failed_count += 1 + clear() + fmt.warning(f"Failed to lint {str(snippet)}") + fmt.echo(result.stdout.strip()) + + clear() + if failed_count: + fmt.error(f"Failed to lint {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets could be linted") + + +def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + TODO: Type check all python snippets with mypy + """ + fmt.secho(fmt.bold("Type checking Python snippets")) + failed_count = 0 + count = 0 + for snippet in snippets: + count += 1 + clear() + fmt.echo(f"\rType checking {snippet} ({count} of {len(snippets)})", nl=False) + prepare_for_linting(snippet) + result = subprocess.run(["mypy", LINT_FILE], capture_output=True, text=True) + if "no issues found" not in result.stdout.lower(): + failed_count += 1 + clear() + fmt.warning(f"Failed to type check {str(snippet)}") + fmt.echo(result.stdout.strip()) + + clear() + if failed_count: + fmt.error(f"Failed to type check {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets passed type checking") + + +if __name__ == "__main__": + fmt.note( + "Welcome to Snippet Checker 3000, run 'python check_embedded_snippets.py --help' for help." + ) + + # setup cli + parser = argparse.ArgumentParser( + description=( + "Check embedded snippets. Discover, parse, lint, and type check all code snippets in" + " the docs." 
+ ), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "command", + help=( + 'Which checks to run. "full" will run all checks, parse, lint or typecheck will only' + " run that specific step" + ), + choices=["full", "parse", "lint", "typecheck"], + default="full", + ) + parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_true") + parser.add_argument( + "-f", + "--files", + help="Filter .md files to files containing this string in filename", + type=str, + ) + parser.add_argument( + "-s", + "--snippetnumbers", + help=( + "Filter checked snippets to snippetnumbers contained in this string, example:" + ' "13,412,345"' + ), + type=lambda i: i.split(","), + default=None, + ) + + args = parser.parse_args() + + fmt.secho(fmt.bold("Discovering snippets")) + + # find all markdown files and collect all snippets + markdown_files = collect_markdown_files(args.verbose) + snippets = collect_snippets(markdown_files, args.verbose) + + # check language settings + check_language(snippets) + + # filter snippets + filtered_snippets = filter_snippets(snippets, args.files, args.snippetnumbers) + + if args.command in ["parse", "full"]: + parse_snippets(filtered_snippets, args.verbose) + + # these stages are python only + python_snippets = [s for s in filtered_snippets if s.language == "py"] + if args.command in ["lint", "full"]: + lint_snippets(python_snippets, args.verbose) + if ENABLE_MYPY and args.command in ["typecheck", "full"]: + typecheck_snippets(python_snippets, args.verbose) diff --git a/docs/tools/lint_setup/.gitignore b/docs/tools/lint_setup/.gitignore new file mode 100644 index 0000000000..27479bdb04 --- /dev/null +++ b/docs/tools/lint_setup/.gitignore @@ -0,0 +1 @@ +lint_me.py \ No newline at end of file diff --git a/docs/tools/lint_setup/__init__.py b/docs/tools/lint_setup/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py new file mode 100644 index 0000000000..dcfada63f6 --- /dev/null +++ b/docs/tools/lint_setup/template.py @@ -0,0 +1,35 @@ +# This section is imported before linting + +# mypy: disable-error-code="name-defined,import-not-found,import-untyped,empty-body,no-redef" + +# some universal imports +from typing import Optional, Dict, List, Any, Iterable, Iterator, Tuple, Sequence, Callable + +import os + +import pendulum +from pendulum import DateTime +from datetime import datetime # noqa: I251 + +import dlt +from dlt.common import json +from dlt.common.typing import TimedeltaSeconds, TAnyDateTime, TDataItem, TDataItems +from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns + +from dlt.common.pipeline import LoadInfo +from dlt.sources.helpers import requests +from dlt.extract import DltResource, DltSource +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + ConnectionStringCredentials, + OAuth2Credentials, + BaseConfiguration, +) +from dlt.common.storages.configuration import FileSystemCredentials + +# some universal variables +pipeline: dlt.Pipeline = None # type: ignore[assignment] +p: dlt.Pipeline = None # type: ignore[assignment] +ex: Exception = None # type: ignore[assignment] +load_info: LoadInfo = None # type: ignore[assignment] +url: str = None # type: ignore[assignment] diff --git a/docs/tools/mypy.ini b/docs/tools/mypy.ini new file mode 100644 index 0000000000..167ad5b30e --- /dev/null +++ b/docs/tools/mypy.ini @@ -0,0 +1,4 @@ +[mypy] +ignore_missing_imports = True 
+no_implicit_optional = False +strict_optional = False \ No newline at end of file diff --git a/docs/tools/ruff.toml b/docs/tools/ruff.toml new file mode 100644 index 0000000000..96f9432ecc --- /dev/null +++ b/docs/tools/ruff.toml @@ -0,0 +1,2 @@ +[lint] +ignore = ["F811", "F821", "F401", "F841", "E402"] diff --git a/docs/website/blog/2023-10-09-dlt-ops-startups.md b/docs/website/blog/2023-10-09-dlt-ops-startups.md index c48fd9ed95..94c1ff662b 100644 --- a/docs/website/blog/2023-10-09-dlt-ops-startups.md +++ b/docs/website/blog/2023-10-09-dlt-ops-startups.md @@ -112,7 +112,7 @@ Customize the INVOICE_QUERIES dictionary in the `unstructured_data/settings.py` And now the magic happens. Use the following command to run the pipeline: -```shell +```sh python unstructured_data_pipeline.py ``` diff --git a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md index 227c466d37..b36748aed9 100644 --- a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md +++ b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md @@ -132,7 +132,7 @@ We recommend setting up and testing dbt-core locally before using it in cloud fu 1. Finally, you can deploy the function using gcloud CLI as: - ```shell + ```sh gcloud functions deploy YOUR_FUNCTION_NAME \ --gen2 \ --region=YOUR_REGION \ @@ -313,7 +313,7 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; hereā€™s ho 1. Finally, you can deploy the function using gcloud CLI as: - ```shell + ```sh gcloud functions deploy YOUR_FUNCTION_NAME \ --gen2 \ --region=YOUR_REGION \ diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index 90a175777f..1522761609 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -36,7 +36,7 @@ scalable extraction via micro-batching and parallelism. ## The simplest pipeline: 1 liner to load data with schema evolution -```python +```py import dlt dlt.pipeline(destination='duckdb', dataset_name='mydata').run([{'id': 1, 'name': 'John'}], table_name="users") @@ -52,7 +52,7 @@ named "three". With `dlt`, you can create a pipeline and run it with just a few 1. [Create a pipeline](walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). 1. Give this pipeline data and [run it](walkthroughs/run-a-pipeline.md). -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="country_data") @@ -84,7 +84,7 @@ In this example, we also run a dbt package and then load the outcomes of the loa This will enable us to log when schema changes occurred and match them to the loaded data for lineage, granting us both column and row level lineage. We also alert the schema change to a Slack channel where hopefully the producer and consumer are subscribed. -```python +```py import dlt # have data? dlt likes data @@ -105,7 +105,7 @@ load_info = pipeline.run( ) ``` Add dbt runner, optionally with venv: -```python +```py venv = dlt.dbt.get_venv(pipeline) dbt = dlt.dbt.package( pipeline, @@ -122,7 +122,7 @@ pipeline.run([models_info], table_name="transform_status", write_disposition='ap ``` Let's alert any schema changes: -```python +```py from dlt.common.runtime.slack import send_slack_message slack_hook = "https://hooks.slack.com/services/xxx/xxx/xxx" @@ -211,7 +211,7 @@ that only one instance of each event is present. 
You can use the merge write disposition as follows: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -260,7 +260,7 @@ into DAGs, providing cross-database compatibility and various features such as t backfills, testing, and troubleshooting. You can use the dbt runner in `dlt` to seamlessly integrate dbt into your pipeline. Here's an example of running a dbt package after loading the data: -```python +```py import dlt from pipedrive import pipedrive_source @@ -275,7 +275,7 @@ load_info = pipeline.run(pipedrive_source()) print(load_info) ``` Now transform from loaded data to dbt dataset: -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', @@ -306,7 +306,7 @@ transformations using SQL statements. You can execute SQL statements that change or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: @@ -324,7 +324,7 @@ You can fetch query results as Pandas data frames and perform transformations us functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index b376337e77..26be75869b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -10,7 +10,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e ## Install dlt with Athena **To install the DLT library with Athena dependencies:** -``` +```sh pip install dlt[athena] ``` @@ -18,7 +18,7 @@ pip install dlt[athena] ### 1. Initialize the dlt project Let's start by initializing a new `dlt` project as follows: - ```bash + ```sh dlt init chess athena ``` > šŸ’” This command will initialize your pipeline with chess as the source and AWS Athena as the destination using the filesystem staging destination. @@ -27,7 +27,7 @@ Let's start by initializing a new `dlt` project as follows: ### 2. Setup bucket storage and Athena credentials First, install dependencies by running: -``` +```sh pip install -r requirements.txt ``` or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. @@ -122,7 +122,7 @@ If you decide to change the [filename layout](./filesystem#data-loading) from th ### Iceberg data tables You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the iceberg table format, supply the table_format argument like this: -```python +```py @dlt.resource(table_format="iceberg") def data() -> Iterable[TDataItem]: ... 
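
A minimal usage sketch for the iceberg-format resource shown above, assuming an Athena destination staged on the filesystem destination; the pipeline name, dataset name, and the yielded rows are placeholders, and the staging bucket URL is expected to come from configuration rather than code:

```py
from typing import Iterable

import dlt
from dlt.common.typing import TDataItem


@dlt.resource(table_format="iceberg")
def data() -> Iterable[TDataItem]:
    # placeholder rows; a real resource would yield rows from an API or files
    yield from [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]


# the filesystem staging bucket_url is assumed to be set in .dlt/secrets.toml
pipeline = dlt.pipeline(
    pipeline_name="athena_iceberg_example",  # illustrative name
    destination="athena",
    staging="filesystem",
    dataset_name="iceberg_data",  # illustrative name
)
info = pipeline.run(data())
print(info)
```

Because the table is created in the Iceberg format, you can later delete or modify rows in it, which plain Athena external tables do not allow.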
diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index e852bfa9e5..4144707b03 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -10,7 +10,7 @@ keywords: [bigquery, destination, data warehouse] **To install the DLT library with BigQuery dependencies:** -``` +```sh pip install dlt[bigquery] ``` @@ -18,13 +18,13 @@ pip install dlt[bigquery] **1. Initialize a project with a pipeline that loads to BigQuery by running:** -``` +```sh dlt init chess bigquery ``` **2. Install the necessary dependencies for BigQuery by running:** -``` +```sh pip install -r requirements.txt ``` @@ -67,7 +67,7 @@ A `JSON` file that includes your service account private key will then be downlo Open your `dlt` credentials file: -``` +```sh open .dlt/secrets.toml ``` @@ -166,7 +166,7 @@ Alternatively to parquet files, you can specify jsonl as the staging file format ### BigQuery/GCS Staging Example -```python +```py # Create a dlt pipeline that will load # chess player data to the BigQuery destination # via a GCS bucket. @@ -217,7 +217,7 @@ The adapter updates the DltResource with metadata about the destination column a Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level: -```python +```py from datetime import date, timedelta import dlt @@ -258,7 +258,7 @@ Some things to note with the adapter's behavior: Note that `bigquery_adapter` updates the resource *inplace*, but returns the resource for convenience, i.e. both the following are valid: -```python +```py bigquery_adapter(my_resource, partition="partition_column_name") my_resource = bigquery_adapter(my_resource, partition="partition_column_name") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index d00c603c14..8078d2c64d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -11,7 +11,7 @@ keywords: [Databricks, destination, data warehouse] ## Install dlt with Databricks **To install the DLT library with Databricks dependencies:** -``` +```sh pip install dlt[databricks] ``` @@ -91,12 +91,12 @@ If you already have your Databricks workspace set up, you can skip to the [Loade ## Loader setup Guide **1. Initialize a project with a pipeline that loads to Databricks by running** -``` +```sh dlt init chess databricks ``` **2. Install the necessary dependencies for Databricks by running** -``` +```sh pip install -r requirements.txt ``` This will install dlt with **databricks** extra which contains Databricks Python dbapi client. 
@@ -148,7 +148,7 @@ Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on co Example to set up Databricks with S3 as a staging destination: -```python +```py import dlt # Create a dlt pipeline that will load @@ -168,7 +168,7 @@ Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure Example to set up Databricks with Azure as a staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Databricks destination # via staging on Azure Blob Storage diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index e00bbdfc38..174eaa7837 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -19,7 +19,7 @@ you can do this here too. ## Install dlt for Sink / reverse ETL ** To install the DLT without additional dependencies ** -``` +```sh pip install dlt ``` @@ -28,7 +28,7 @@ pip install dlt Let's start by initializing a new dlt project as follows: -```bash +```sh dlt init chess sink ``` > šŸ’” This command will initialize your pipeline with chess as the source and sink as the destination. @@ -42,7 +42,7 @@ With the `@dlt.destination` decorator you can convert A very simple dlt pipeline that pushes a list of items into a sink function might look like this: -```python +```py from dlt.common.typing import TDataItems from dlt.common.schema import TTableSchema @@ -68,7 +68,7 @@ the sink from your pipeline constructor. Now you can run your pipeline and see t The full signature of the destination decorator plus its function is the following: -```python +```py @dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink", naming="direct") def sink(items: TDataItems, table: TTableSchema) -> None: ... @@ -93,7 +93,7 @@ how table and column names are normalized. The default is `direct` which will ke ## Adding config variables and secrets The destination decorator supports settings and secrets variables. If you, for example, plan to connect to a service that requires an api secret or a login, you can do the following: -```python +```py @dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_sink") def my_sink(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None: ... @@ -124,7 +124,7 @@ reasons we recommend to keep the multithreaded approach and make sure that you, ## Referencing the sink function There are multiple ways to reference the sink function you want to use. These are: -```python +```py # file my_pipeline.py @dlt.destination(batch_size=10) diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 9452a80c50..63b4aecd80 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -8,24 +8,24 @@ keywords: [duckdb, destination, data warehouse] ## Install dlt with DuckDB **To install the DLT library with DuckDB dependencies, run:** -``` +```sh pip install dlt[duckdb] ``` ## Setup Guide **1. Initialize a project with a pipeline that loads to DuckDB by running:** -``` +```sh dlt init chess duckdb ``` **2. Install the necessary dependencies for DuckDB by running:** -``` +```sh pip install -r requirements.txt ``` **3. 
Run the pipeline:** -``` +```sh python3 chess_pipeline.py ``` @@ -47,7 +47,7 @@ naming="duck_case" ``` or via the env variable `SCHEMA__NAMING` or directly in the code: -```python +```py dlt.config["schema.naming"] = "duck_case" ``` :::caution @@ -73,7 +73,7 @@ You can configure the following file formats to load data to duckdb: By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. The `duckdb` credentials do not require any secret values. You are free to pass the configuration explicitly via the `credentials` parameter to `dlt.pipeline` or `pipeline.run` methods. For example: -```python +```py # will load data to files/data.db database file p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials="files/data.db") @@ -82,7 +82,7 @@ p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='ches ``` The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use. `:memory:` databases are supported. -```python +```py import duckdb db = duckdb.connect() p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials=db) @@ -92,7 +92,7 @@ This destination accepts database connection strings in the format used by [duck You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g., using a `secrets.toml` file) ```toml -destination.duckdb.credentials=duckdb:///_storage/test_quack.duckdb +destination.duckdb.credentials="duckdb:///_storage/test_quack.duckdb" ``` The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`. diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index ba323b3d7f..dbd54253b3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -7,7 +7,7 @@ Its primary role is to be used as a staging for other destinations, but you can ## Install dlt with filesystem **To install the DLT library with filesystem dependencies:** -``` +```sh pip install dlt[filesystem] ``` @@ -29,7 +29,7 @@ so pip does not fail on backtracking. ### 1. Initialise the dlt project Let's start by initialising a new dlt project as follows: - ```bash + ```sh dlt init chess filesystem ``` > šŸ’” This command will initialise your pipeline with chess as the source and the AWS S3 filesystem as the destination. @@ -38,7 +38,7 @@ Let's start by initialising a new dlt project as follows: #### AWS S3 The command above creates sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: -``` +```sh pip install -r requirements.txt ``` @@ -71,7 +71,7 @@ You need to create a S3 bucket and a user who can access that bucket. `dlt` is n 1. You can create the S3 bucket in the AWS console by clicking on "Create Bucket" in S3 and assigning the appropriate name and permissions to the bucket. 2. 
Once the bucket is created, you'll have the bucket URL. For example, If the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: - ``` + ```text s3://dlt-ci-test-bucket ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index 1288b9caac..de11ed5772 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -9,7 +9,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] ## Install dlt with MotherDuck **To install the DLT library with MotherDuck dependencies:** -``` +```sh pip install dlt[motherduck] ``` @@ -25,12 +25,12 @@ or export the **LOAD__WORKERS=3** env variable. See more in [performance](../../ ## Setup Guide **1. Initialize a project with a pipeline that loads to MotherDuck by running** -``` +```sh dlt init chess motherduck ``` **2. Install the necessary dependencies for MotherDuck by running** -``` +```sh pip install -r requirements.txt ``` @@ -51,7 +51,7 @@ motherduck.credentials="md:///dlt_data_3?token=" ``` **4. Run the pipeline** -``` +```sh python3 chess_pipeline.py ``` @@ -83,14 +83,14 @@ If your connection is of poor quality and you get a timeout when executing a DML ### I see some exception with home_dir missing when opening `md:` connection. Some internal component (HTTPS) requires the **HOME** env variable to be present. Export such a variable to the command line. Here is what we do in our tests: -```python +```py os.environ["HOME"] = "/tmp" ``` before opening the connection. ### I see some watchdog timeouts. We also see them. -``` +```text 'ATTACH_DATABASE': keepalive watchdog timeout ``` Our observation is that if you write a lot of data into the database, then close the connection and then open it again to write, there's a chance of such a timeout. A possible **WAL** file is being written to the remote duckdb database. diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 5ed4b69707..fc3eede075 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -8,7 +8,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] ## Install dlt with MS SQL **To install the DLT library with MS SQL dependencies, use:** -``` +```sh pip install dlt[mssql] ``` @@ -28,16 +28,16 @@ You can also [configure the driver name](#additional-destination-options) explic ### Create a pipeline **1. Initialize a project with a pipeline that loads to MS SQL by running:** -``` +```sh dlt init chess mssql ``` **2. Install the necessary dependencies for MS SQL by running:** -``` +```sh pip install -r requirements.txt ``` or run: -``` +```sh pip install dlt[mssql] ``` This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. @@ -62,7 +62,7 @@ destination.mssql.credentials="mssql://loader:@loader.database.windows ``` To pass credentials directly, you can use the `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. 
-```python +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 10b935c083..ddf4aae9f8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -8,39 +8,39 @@ keywords: [postgres, destination, data warehouse] ## Install dlt with PostgreSQL **To install the DLT library with PostgreSQL dependencies, run:** -``` +```sh pip install dlt[postgres] ``` ## Setup Guide **1. Initialize a project with a pipeline that loads to Postgres by running:** -``` +```sh dlt init chess postgres ``` **2. Install the necessary dependencies for Postgres by running:** -``` +```sh pip install -r requirements.txt ``` This will install dlt with the `postgres` extra, which contains the `psycopg2` client. **3. After setting up a Postgres instance and `psql` / query editor, create a new database by running:** -``` +```sql CREATE DATABASE dlt_data; ``` Add the `dlt_data` database to `.dlt/secrets.toml`. **4. Create a new user by running:** -``` +```sql CREATE USER loader WITH PASSWORD ''; ``` Add the `loader` user and `` password to `.dlt/secrets.toml`. **5. Give the `loader` user owner permissions by running:** -``` +```sql ALTER DATABASE dlt_data OWNER TO loader; ``` @@ -66,7 +66,7 @@ destination.postgres.credentials="postgresql://loader:@localhost/dlt_d ``` To pass credentials directly, you can use the `credentials` argument passed to the `dlt.pipeline` or `pipeline.run` methods. -```python +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="postgresql://loader:@localhost/dlt_data") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index ff37252852..40d85a43a5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -13,7 +13,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: -```bash +```sh pip install dlt[qdrant] ``` @@ -31,7 +31,7 @@ If no configuration options are provided, the default fallback will be `http://l 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import qdrant_adapter @@ -53,7 +53,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", @@ -63,7 +63,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -74,7 +74,7 @@ info = pipeline.run( 6. 
Check the results: -```python +```py print(info) ``` @@ -86,7 +86,7 @@ To use vector search after the data has been loaded, you must specify which fiel The `qdrant_adapter` is a helper function that configures the resource for the Qdrant destination: -```python +```py qdrant_adapter(data, embed) ``` @@ -99,7 +99,7 @@ Returns: [DLT resource](../../general-usage/resource.md) object that you can pas Example: -```python +```py qdrant_adapter( resource, embed=["title", "description"], @@ -122,7 +122,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -137,7 +137,7 @@ info = pipeline.run( The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination. For the `merge` disposition, you need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -170,7 +170,7 @@ However, if you prefer to have class names without the dataset prefix, skip the For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index bc03dbbbeb..7b56377f3b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -8,7 +8,7 @@ keywords: [redshift, destination, data warehouse] ## Install dlt with Redshift **To install the DLT library with Redshift dependencies:** -``` +```sh pip install dlt[redshift] ``` @@ -17,13 +17,13 @@ pip install dlt[redshift] Let's start by initializing a new dlt project as follows: -```bash +```sh dlt init chess redshift ``` > šŸ’” This command will initialize your pipeline with chess as the source and Redshift as the destination. The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for Redshift. You can install the necessary dependencies specified in the requirements file by executing it as follows: -```bash +```sh pip install -r requirements.txt ``` or withĀ `pip install dlt[redshift]`,Ā which installs theĀ `dlt`Ā library and the necessary dependencies for working with Amazon Redshift as a destination. @@ -52,7 +52,7 @@ To load data into Redshift, you need to create a Redshift cluster and enable acc 2. The "host" is derived from the cluster endpoint specified in the ā€œGeneral Configuration.ā€ For example: - ```bash + ```sh # If the endpoint is: redshift-cluster-1.cv3cmsy7t4il.us-east-1.redshift.amazonaws.com:5439/your_database_name # Then the host is: @@ -108,7 +108,7 @@ staging_iam_role="arn:aws:iam::..." 
### Redshift/S3 staging example code -```python +```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index a6058a255e..a65eaec267 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -8,19 +8,19 @@ keywords: [Snowflake, destination, data warehouse] ## Install dlt with Snowflake **To install the DLT library with Snowflake dependencies, run:** -``` +```sh pip install dlt[snowflake] ``` ## Setup Guide **1. Initialize a project with a pipeline that loads to Snowflake by running:** -``` +```sh dlt init chess snowflake ``` **2. Install the necessary dependencies for Snowflake by running:** -``` +```sh pip install -r requirements.txt ``` This will install `dlt` with the `snowflake` extra, which contains the Snowflake Python dbapi client. @@ -162,12 +162,12 @@ To prevent dlt from forwarding the S3 bucket credentials on every command, and s ```toml [destination] -stage_name=PUBLIC.my_s3_stage +stage_name="PUBLIC.my_s3_stage" ``` To run Snowflake with S3 as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on S3 @@ -191,12 +191,12 @@ Please refer to the [Google Storage filesystem documentation](./filesystem.md#go ```toml [destination] -stage_name=PUBLIC.my_gcs_stage +stage_name="PUBLIC.my_gcs_stage" ``` To run Snowflake with GCS as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on GCS @@ -222,12 +222,12 @@ Please consult the Snowflake Documentation on [how to create a stage for your Az ```toml [destination] -stage_name=PUBLIC.my_azure_stage +stage_name="PUBLIC.my_azure_stage" ``` To run Snowflake with Azure as the staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Snowflake destination # via staging on Azure diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index bac184fd41..d803b88a2c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -8,7 +8,7 @@ keywords: [synapse, destination, data warehouse] ## Install dlt with Synapse **To install the DLT library with Synapse dependencies:** -``` +```sh pip install dlt[synapse] ``` @@ -32,12 +32,12 @@ pip install dlt[synapse] ### Steps **1. Initialize a project with a pipeline that loads to Synapse by running** -``` +```sh dlt init chess synapse ``` **2. Install the necessary dependencies for Synapse by running** -``` +```sh pip install -r requirements.txt ``` This will install `dlt` with the **synapse** extra that contains all dependencies required for the Synapse destination. @@ -86,7 +86,7 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy ``` To pass credentials directly you can use the `credentials` argument of `dlt.destinations.synapse(...)`: -```python +```py pipeline = dlt.pipeline( pipeline_name='chess', destination=dlt.destinations.synapse( @@ -117,7 +117,7 @@ Data is loaded via `INSERT` statements by default. 
## Table index type The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: -```python +```py info = pipeline.run( synapse_adapter( data=your_resource, @@ -156,7 +156,7 @@ Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.m To run Synapse with staging on Azure Blob Storage: -```python +```py # Create a dlt pipeline that will load # chess player data to the snowflake destination # via staging on Azure Blob Storage diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 6bd52acd35..fb87ccfa6f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -13,7 +13,7 @@ This destination helps you load data into Weaviate from [dlt resources](../../ge 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: -```bash +```sh pip install dlt[weaviate] ``` @@ -41,7 +41,7 @@ The `url` will default to **http://localhost:8080** and `api_key` is not defined 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import weaviate_adapter @@ -63,7 +63,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", @@ -73,7 +73,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -84,7 +84,7 @@ info = pipeline.run( 6. Check the results: -```python +```py print(info) ``` @@ -96,7 +96,7 @@ Weaviate destination is different from other [dlt destinations](../destinations/ The `weaviate_adapter` is a helper function that configures the resource for the Weaviate destination: -```python +```py weaviate_adapter(data, vectorize, tokenization) ``` @@ -109,7 +109,7 @@ Returns: a [dlt resource](../../general-usage/resource.md) object that you can p Example: -```python +```py weaviate_adapter( resource, vectorize=["title", "description"], @@ -133,7 +133,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -148,7 +148,7 @@ info = pipeline.run( The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data in the destination. For the `merge` disposition, you would need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -203,7 +203,7 @@ However, if you prefer to have class names without the dataset prefix, skip the For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", @@ -246,7 +246,7 @@ You can configure an alternative naming convention which will lowercase all prop {"camelCase": 1, "CamelCase": 2} ``` it will be normalized to: -``` +```json {"camelcase": 2} ``` so your best course of action is to clean up the data yourself before loading and use the default naming convention. 
Nevertheless, you can configure the alternative in `config.toml`: diff --git a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md index ff73e3741e..641be9a106 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md @@ -25,6 +25,6 @@ It is also supported by: **filesystem**. By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline will store your data in the INSERT format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="insert_values") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md index 130464578e..7467c6f639 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md @@ -27,6 +27,6 @@ This format is used by default by: **BigQuery**, **Snowflake**, **filesystem**. By setting the `loader_file_format` argument to `jsonl` in the run command, the pipeline will store your data in the jsonl format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="jsonl") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index cc2fcfb200..94aaaf4884 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -20,7 +20,7 @@ Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **filesystem**, **Athena* By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will store your data in the parquet format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="parquet") ``` @@ -53,7 +53,7 @@ timestamp_timezone="Europe/Berlin" Or using environment variables: -``` +```sh NORMALIZE__DATA_WRITER__FLAVOR NORMALIZE__DATA_WRITER__VERSION NORMALIZE__DATA_WRITER__DATA_PAGE_SIZE diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index d2ed03a2a2..e3a60dfa51 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -48,7 +48,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 4. **Chain staging to destination and request `parquet` file format.** Pass the `staging` argument to `dlt.pipeline`. It works like the destination `argument`: - ```python + ```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 @@ -60,7 +60,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ) ``` `dlt` will automatically select an appropriate loader file format for the staging files. 
Below we explicitly specify `parquet` file format (just to demonstrate how to do it): - ```python + ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 1cf7a91bfb..42f31d4875 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -33,7 +33,7 @@ Included below is another example where we run a `dlt` pipeline and then a dbt p > šŸ’” Docstrings are available to read in your IDE. -```python +```py # load all pipedrive endpoints to pipedrive_raw dataset pipeline = dlt.pipeline( pipeline_name='pipedrive', diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md index 43321aab97..d15c4eb84c 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md @@ -11,7 +11,7 @@ keywords: [transform, sql] The DBT Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run statuses. -```python +```py from dlt.helpers.dbt_cloud import DBTCloudClientV2 # Initialize the client @@ -36,7 +36,7 @@ They simplify the process of triggering and monitoring job runs in dbt Cloud. This function triggers a job run in dbt Cloud using the specified configuration. It supports various customization options and allows for monitoring the job's status. -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with default configuration @@ -58,7 +58,7 @@ If you have already started a job run and have a run ID, then you can use the `g This function retrieves the full information about a specific dbt Cloud job run. It also supports options for waiting until the run is complete. -```python +```py from dlt.helpers.dbt_cloud import get_dbt_cloud_run_status # Retrieve status for a specific run @@ -96,7 +96,7 @@ For environment variables, all names are capitalized and sections are separated For example, for the above secrets, we would need to put into the environment: -``` +```sh DBT_CLOUD__API_TOKEN DBT_CLOUD__ACCOUNT_ID DBT_CLOUD__JOB_ID diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index dc2fc6d40a..5a82d8be66 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -11,7 +11,7 @@ natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to th dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and counts the reaction types. -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index 6131cac85a..ad37c61bd8 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -12,22 +12,24 @@ including statements that change the database schema or data in the tables. In t insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi` connection. 
-```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") try: with pipeline.sql_client() as client: client.sql_client.execute_sql( - f"INSERT INTO customers VALUES (%s, %s, %s)", + "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com" ) +except Exception: + ... ``` In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row corresponding to selected columns. -```python +```py try: with pipeline.sql_client() as client: res = client.execute_sql( @@ -36,6 +38,8 @@ try: ) # prints column values of the first row print(res[0]) +except Exception: + ... ``` ## Other transforming tools diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index 0baf1917d1..a920b21a03 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -45,7 +45,7 @@ Sources and resources that can be loaded using this verified source are: Upon logging into Airtable and accessing your base or table, you'll notice a URL in your browser's address bar resembling: -```bash +```sh https://airtable.com/appve10kl227BIT4GV/tblOUnZVLFWbemTP1/viw3qtF76bRQC3wKx/rec9khXgeTotgCQ62?blocks=hide ``` @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init airtable duckdb ``` @@ -116,20 +116,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python airtable_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -147,13 +147,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves tables from given Airtable base. -```python +```py @dlt.source def airtable_source( base_id: str = dlt.config.value, table_names: Optional[List[str]] = None, access_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` `base_id`: The base's unique identifier. @@ -167,12 +168,13 @@ tables in the schema are loaded. This function retrieves data from a single Airtable table. -```python +```py def airtable_resource( api: pyairtable.Api, base_id: str, table: Dict[str, Any], ) -> DltResource: + ... ``` `table`: Airtable metadata, excluding actual records. @@ -186,7 +188,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="airtable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,16 +198,16 @@ verified source. 1. To load the entire base: - ```python + ```py base_id = "Please set me up!" # The id of the base. - airtables = airtable_source(base_id=base_id)) + airtables = airtable_source(base_id=base_id) load_info = pipeline.run(load_data, write_disposition="replace") ``` 1. To load selected tables from a base table: - ```python + ```py base_id = "Please set me up!" # The id of the base. 
table_names = ["Table1","Table2"] # A list of table IDs or table names to load. @@ -221,7 +223,7 @@ verified source. 1. To load data and apply hints to a specific column: - ```python + ```py base_id = "Please set me up!" # The id of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. resource_name = "Please set me up!" # The table name we want to apply hints. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 4118902a6c..2894c15b5e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init kinesis duckdb ``` @@ -110,16 +110,16 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python kinesis_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`. You may @@ -138,7 +138,7 @@ This resource reads a Kinesis stream and yields messages. It supports [incremental loading](../../general-usage/incremental-loading) and parses messages as json by default. -```python +```py @dlt.resource( name=lambda args: args["stream_name"], primary_key="_kinesis_msg_id", @@ -156,6 +156,7 @@ def kinesis_stream( parse_json: bool = True, chunk_size: int = 1000, ) -> Iterable[TDataItem]: + ... ``` `stream_name`: Name of the Kinesis stream. Defaults to config/secrets if unspecified. @@ -212,7 +213,7 @@ verified source. 1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kinesis_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -222,7 +223,7 @@ verified source. 1. To load messages from a stream from the last one hour: - ```python + ```py # the resource below will take its name from the stream name, # it can be used multiple times by default it assumes that Data is json and parses it, # here we disable that to just get bytes in data elements of the message @@ -237,7 +238,7 @@ verified source. 1. For incremental Kinesis streams, to fetch only new messages: - ```python + ```py #running pipeline will get only new messages info = pipeline.run(kinesis_stream_data) message_counts = pipeline.last_trace.last_normalize_info.row_counts @@ -249,7 +250,7 @@ verified source. 1. To parse json with a simple decoder: - ```python + ```py def _maybe_parse_json(item: TDataItem) -> TDataItem: try: item.update(json.loadb(item["data"])) @@ -263,7 +264,7 @@ verified source. 1. 
To read Kinesis messages and send them somewhere without using a pipeline: - ```python + ```py from dlt.common.configuration.container import Container from dlt.common.pipeline import StateInjectableContext diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index df968422d7..915a9d297a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -25,7 +25,7 @@ To write an Arrow source, pass any `pyarrow.Table`, `pyarrow.RecordBatch` or `pa This example loads a Pandas dataframe to a Snowflake table: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -45,7 +45,7 @@ pipeline.run(df, table_name="orders") A `pyarrow` table can be loaded in the same way: -```python +```py import pyarrow as pa # Create dataframe and pipeline same as above @@ -96,7 +96,7 @@ Usage is the same as without other dlt resources. Refer to the [incremental load Example: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -144,7 +144,7 @@ All struct types are represented as `complex` and will be loaded as JSON (if des even if they are present in the destination. If you want to represent nested data as separated tables, you must yield panda frames and arrow tables as records. In the examples above: -```python +```py # yield panda frame as records pipeline.run(df.to_dict(orient='records'), table_name="orders") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index 8554cdd376..9e3ee9c8fe 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init asana_dlt duckdb ``` @@ -94,16 +94,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python asana_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `asana`, you may also use any @@ -127,7 +127,7 @@ it is important to note the complete list of the default endpoints given in This is a `dlt.source` function, which returns a list of DltResource objects: "workspaces", "projects", "sections","tags","tasks","stories", "teams", and "users". -```python +```py @dlt.source def asana_source(access_token: str = dlt.secrets.value) -> Any: return [ @@ -142,7 +142,7 @@ def asana_source(access_token: str = dlt.secrets.value) -> Any: This is a `dlt.resource` function, which returns collections of tasks and related information. -```python +```py @dlt.resource(write_disposition="replace") def workspaces( access_token: str = dlt.secrets.value, @@ -171,7 +171,7 @@ transformer functions transform or process data from one or more resources. The transformer function `projects` process data from the `workspaces` resource. 
It fetches and returns a list of projects for a given workspace from Asana. -```python +```py @dlt.transformer( data_from=workspaces, write_disposition="replace", @@ -200,7 +200,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This [incremental](../../general-usage/incremental-loading.md) resource-transformer fetches all tasks for a given project from Asana. -```python +```py @dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid") def tasks( project_array: t.List[TDataItem], @@ -235,7 +235,7 @@ these steps: 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="asana_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -248,13 +248,13 @@ these steps: 1. To load the data from all the fields, you can utilise the `asana_source` method as follows: - ```python + ```py load_data = asana_source() ``` 1. Use the method `pipeline.run()` to execute the pipeline. - ```python + ```py load_info = pipeline.run(load_data) # print the information on data that was loaded print(load_info) @@ -263,7 +263,7 @@ these steps: 1. To use the method `pipeline.run()` to load custom endpoints ā€œworkspacesā€ and ā€œprojectsā€, the above script may be modified as: - ```python + ```py load_info = pipeline.run(load_data.with_resources("workspaces", "projects")) # print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 7f01b83f08..2341680d97 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -36,7 +36,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init chess duckdb ``` @@ -66,20 +66,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python chess_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -98,7 +98,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This is a `dlt.source` function for the Chess.com API named "chess", which returns a sequence of DltResource objects. That we'll discuss in subsequent sections as resources. -```python +```py dlt.source(name="chess") def source( players: List[str], start_month: str = None, end_month: str = None @@ -120,7 +120,7 @@ to fetch game data (in "YYYY/MM" format). This is a `dlt.resource` function, which returns player profiles for a list of player usernames. -```python +```py @dlt.resource(write_disposition="replace") def players_profiles(players: List[str]) -> Iterator[TDataItem]: @@ -138,7 +138,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This is a `dlt.resource` function, which returns url to game archives for specified players. 
-```python +```py @dlt.resource(write_disposition="replace", selected=False) def players_archives(players: List[str]) -> Iterator[List[TDataItem]]: ... @@ -154,7 +154,7 @@ runs. This incremental resource takes data from players and returns games for the last month if not specified otherwise. -```python +```py @dlt.resource(write_disposition="append") def players_games( players: List[str], start_month: str = None, end_month: str = None @@ -186,7 +186,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="chess_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -199,7 +199,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. To load the data from all the resources for specific players (e.g. for November), you can utilise the `source` method as follows: - ```python + ```py # Loads games for Nov 2022 data = source( ["magnuscarlsen", "vincentkeymer", "dommarajugukesh", "rpragchess"], @@ -210,7 +210,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Use the method `pipeline.run()` to execute the pipeline. - ```python + ```py info = pipeline.run(data) # print the information on data that was loaded print(info) @@ -219,7 +219,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. To load data from specific resources like "players_games" and "player_profiles", modify the above code as: - ```python + ```py info = pipeline.run(data.with_resources("players_games", "players_profiles")) # print the information on data that was loaded print(info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md index dea97921b4..0a0c64fb30 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md @@ -66,9 +66,9 @@ By default, Facebook access tokens have a short lifespan of one hour. To exchang Facebook access token for a long-lived token, update the `.dlt/secrets.toml` with client_id, and client_secret and execute the provided Python code. -```python +```py from facebook_ads import get_long_lived_token -print(get_long_lived_token("your short-lived token") +print(get_long_lived_token("your short-lived token")) ``` Replace the `access_token` in the `.dlt/secrets.toml` file with the long-lived token obtained from @@ -77,7 +77,7 @@ the above code snippet. To retrieve the expiry date and the associated scopes of the token, you can use the following command: -```python +```py from facebook_ads import debug_access_token debug_access_token() ``` @@ -88,7 +88,7 @@ level. In `config.toml` / `secrets.toml`: ```toml [sources.facebook_ads] -access_token_expires_at=1688821881... +access_token_expires_at=1688821881 ``` > Note: The Facebook UI, which is described here, might change. @@ -101,7 +101,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init facebook_ads duckdb ``` @@ -158,16 +158,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. 
Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python facebook_ads_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also @@ -191,7 +191,7 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load campaigns, ad sets, ads, creatives, and ad leads data from Facebook Marketing API. -```python +```py @dlt.source(name="facebook_ads") def facebook_ads_source( account_id: str = dlt.config.value, @@ -200,6 +200,7 @@ def facebook_ads_source( request_timeout: float = 300.0, app_api_version: str = None, ) -> Sequence[DltResource]: + ... ``` `account_id`: Account id associated with add manager, configured in "config.toml". @@ -220,7 +221,7 @@ were issued i.e. 'v17.0'. Defaults to the _facebook_business_ library default ve The ads function fetches ad data. It retrieves ads from a specified account with specific fields and states. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def ads( fields: Sequence[str] = DEFAULT_AD_FIELDS, @@ -254,7 +255,7 @@ The default fields are defined in This function returns a list of resources to load facebook_insights. -```python +```py @dlt.source(name="facebook_ads") def facebook_insights_source( account_id: str = dlt.config.value, @@ -271,6 +272,7 @@ def facebook_insights_source( request_timeout: int = 300, app_api_version: str = None, ) -> DltResource: + ... ``` `account_id`: Account id associated with ads manager, configured in _config.toml_. @@ -315,13 +317,14 @@ were issued i.e. 'v17.0'. Defaults to the facebook_business library default vers This function fetches Facebook insights data incrementally from a specified start date until the current date, in day steps. -```python +```py @dlt.resource(primary_key=INSIGHTS_PRIMARY_KEY, write_disposition="merge") def facebook_insights( date_start: dlt.sources.incremental[str] = dlt.sources.incremental( "date_start", initial_value=initial_load_start_date_str ) ) -> Iterator[TDataItems]: + ... ``` `date_start`: Parameter sets the initial value for the "date_start" parameter in @@ -337,7 +340,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="facebook_ads", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -350,7 +353,7 @@ verified source. 1. To load all the data from, campaigns, ad sets, ads, ad creatives and leads. - ```python + ```py load_data = facebook_ads_source() load_info = pipeline.run(load_data) print(load_info) @@ -359,7 +362,7 @@ verified source. 1. To merge the Facebook Ads with the state ā€œDISAPPROVEDā€ and with ads state ā€œPAUSEDā€ you can do the following: - ```python + ```py load_data = facebook_ads_source() # It is recommended to enable root key propagation on a source that is not a merge one by default. this is not required if you always use merge but below we start with replace load_data.root_key = True @@ -382,7 +385,7 @@ verified source. 1. 
To load data with a custom field, for example, to load only ā€œidā€ from Facebook ads, you can do the following: - ```python + ```py load_data = facebook_ads_source() # Only loads add ids, works the same for campaigns, leads etc. load_data.ads.bind(fields=("id",)) @@ -395,7 +398,7 @@ verified source. demonstrates how to enrich objects by adding an enrichment transformation that includes additional fields. - ```python + ```py # You can reduce the chunk size for smaller requests load_data = facebook_ads_source(chunk_size=2) @@ -429,7 +432,7 @@ verified source. breakdowns, etc. As defined in the `facebook_insights_source`. This function generates daily reports for a specified number of past days. - ```python + ```py load_data = facebook_insights_source( initial_load_past_days=30, attribution_window_days_lag= 7, diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index aed19838ef..bf3d23d0a3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -81,7 +81,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init filesystem duckdb ``` @@ -150,32 +150,32 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. Install optional modules: - For AWS S3: - ```bash + ```sh pip install s3fs ``` - For Azure blob: - ```bash + ```sh pip install adlfs>=2023.9.0 ``` - GCS storage: No separate module needed. 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python filesystem_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -197,13 +197,14 @@ This source offers chunked file readers as resources, which can be optionally cu - `read_jsonl()` - `read_parquet()` -```python +```py @dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource) def readers( bucket_url: str = dlt.secrets.value, credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value, file_glob: Optional[str] = "*", ) -> Tuple[DltResource, ...]: + ... ``` - `bucket_url`: The url to the bucket. @@ -225,7 +226,7 @@ This resource lists files in `bucket_url` based on the `file_glob` pattern, retu [FileItem](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/storages/fsspec_filesystem.py#L22) with data access methods. These can be paired with transformers for enhanced processing. -```python +```py @dlt.resource( primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True ) @@ -236,6 +237,7 @@ def filesystem( files_per_page: int = DEFAULT_CHUNK_SIZE, extract_content: bool = False, ) -> Iterator[List[FileItem]]: + ... ``` - `bucket_url`: URL of the bucket. @@ -256,9 +258,9 @@ in bucket URL. To load data into a specific table (instead of the default filesystem table), see the snippet below: -```python +```py @dlt.transformer(standalone=True) -def read_csv(items, chunksize: int = 15) ->: +def read_csv(items, chunksize: int = 15): """Reads csv file with Pandas chunk by chunk.""" ... 
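For instance, the `read_csv` transformer above can be piped into the standalone `filesystem` resource and the result loaded under a custom table name. A minimal sketch, assuming the import path of this verified source and reusing the bucket path from the examples below as a placeholder:

```py
import dlt

# assumed import path; adjust to where this verified source defines these helpers
from filesystem import filesystem, read_csv

pipeline = dlt.pipeline(
    pipeline_name="standard_filesystem",
    destination="duckdb",
    dataset_name="filesystem_data",
)

# list the CSV files and pipe them through the chunked reader
csv_files = filesystem(bucket_url="s3://my_bucket/data", file_glob="csv_folder/*.csv")
csv_pipe = csv_files | read_csv(chunksize=1000)

# load everything into a table named "custom_table" instead of the default filesystem table
load_info = pipeline.run(csv_pipe.with_name("custom_table"))
print(load_info)
```

Binding `chunksize` here is optional; if omitted, the default shown in the transformer signature is used.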
@@ -275,7 +277,7 @@ Use the [standalone filesystem](../../general-usage/resource#declare-a-standalone-resource) resource to list files in s3, GCS, and Azure buckets. This allows you to customize file readers or manage files using [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html). -```python +```py files = filesystem(bucket_url="s3://my_bucket/data", file_glob="csv_folder/*.csv") pipeline.run(files) ``` @@ -327,7 +329,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_filesystem", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -337,17 +339,17 @@ verified source. 1. To read and load CSV files: - ```python + ```py BUCKET_URL = "YOUR_BUCKET_PATH_HERE" # path of the bucket url or local destination met_files = readers( bucket_url=BUCKET_URL, file_glob="directory/*.csv" - ).read_csv() - # tell dlt to merge on date - met_files.apply_hints(write_disposition="merge", merge_key="date") - # We load the data into the met_csv table - load_info = pipeline.run(met_files.with_name("table_name")) - print(load_info) - print(pipeline.last_trace.last_normalize_info) + ).read_csv() + # tell dlt to merge on date + met_files.apply_hints(write_disposition="merge", merge_key="date") + # We load the data into the met_csv table + load_info = pipeline.run(met_files.with_name("table_name")) + print(load_info) + print(pipeline.last_trace.last_normalize_info) ``` - The `file_glob` parameter targets all CSVs in the "met_csv/A801" directory. @@ -358,7 +360,7 @@ verified source. ::: 1. To load only new CSV files with [incremental loading](../../general-usage/incremental-loading): - ```python + ```py # This configuration will only consider new csv files new_files = filesystem(bucket_url=BUCKET_URL, file_glob="directory/*.csv") # add incremental on modification time @@ -369,7 +371,7 @@ verified source. ``` 1. To read and load Parquet and JSONL from a bucket: - ```python + ```py jsonl_reader = readers(BUCKET_URL, file_glob="**/*.jsonl").read_jsonl( chunksize=10000 ) @@ -391,7 +393,7 @@ verified source. 1. To set up a pipeline that reads from an Excel file using a standalone transformer: - ```python + ```py # Define a standalone transformer to read data from an Excel file. @dlt.transformer(standalone=True) def read_excel( @@ -427,7 +429,7 @@ verified source. 1. To copy files locally, add a step in the filesystem resource and then load the listing to the database: - ```python + ```py def _copy(item: FileItemDict) -> FileItemDict: # instantiate fsspec and copy file dest_file = os.path.join(local_folder, item["file_name"]) @@ -459,7 +461,7 @@ verified source. You can get a fsspec client from filesystem resource after it was extracted i.e. in order to delete processed files etc. The filesystem module contains a convenient method `fsspec_from_resource` that can be used as follows: - ```python + ```py from filesystem import filesystem, fsspec_from_resource # get filesystem source gs_resource = filesystem("gs://ci-test-bucket/") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 2fd0277500..4c9a322760 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init github duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python github_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may @@ -137,7 +137,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This `dlt.source` function uses GraphQL to fetch DltResource objects: issues and pull requests along with associated reactions, comments, and reactions to comments. -```python +```py @dlt.source def github_reactions( owner: str, @@ -147,6 +147,7 @@ def github_reactions( max_items: int = None, max_item_age_seconds: float = None, ) -> Sequence[DltResource]: + ... ``` `owner`: Refers to the owner of the repository. @@ -169,7 +170,7 @@ yet to be implemented. Defaults to None. The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, their associated comments, and subsequent reactions. -```python +```py dlt.resource( _get_reactions_data( "issues", @@ -193,11 +194,12 @@ on event type. It loads new events only and appends them to tables. > Note: Github allows retrieving up to 300 events for public repositories, so frequent updates are > recommended for active repos. -```python +```py @dlt.source(max_table_nesting=2) def github_repo_events( owner: str, name: str, access_token: str = None ) -> DltResource: + ... ``` `owner`: Refers to the owner of the repository. @@ -216,13 +218,14 @@ Read more about [nesting levels](../../general-usage/source#reduce-the-nesting-l This `dlt.resource` function serves as the resource for the `github_repo_events` source. It yields repository events as data items. -```python +```py dlt.resource(primary_key="id", table_name=lambda i: i["type"]) # type: ignore def repo_events( last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental( "created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max ) ) -> Iterator[TDataItems]: + ... ``` `primary_key`: Serves as the primary key, instrumental in preventing data duplication. @@ -244,7 +247,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -258,7 +261,7 @@ verified source. 1. To load all the data from repo on issues, pull requests, their comments and reactions, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb") load_info = pipeline.run(load_data) print(load_info) @@ -267,7 +270,7 @@ verified source. 1. To load only the first 100 issues, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb", max_items=100) load_info = pipeline.run(load_data.with_resources("issues")) print(load_info) @@ -276,7 +279,7 @@ verified source. 1. You can use fetch and process repo events data incrementally. 
It loads all data during the first run and incrementally in subsequent runs. - ```python + ```py load_data = github_repo_events( "duckdb", "duckdb", access_token=os.getenv(ACCESS_TOKEN) ) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index b6a3a0a5a8..2d8be0b15d 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -84,7 +84,7 @@ follow these steps: 1. Add the following scope: - ``` + ```text "https://www.googleapis.com/auth/analytics.readonly" ``` @@ -93,7 +93,7 @@ follow these steps: After configuring "client_id", "client_secret", and "project_id" in "secrets.toml", to generate the refresh token, run the following script from the root folder: -```bash +```sh python google_analytics/setup_script_gcp_oauth.py ``` @@ -128,7 +128,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init google_analytics duckdb ``` @@ -214,16 +214,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python google_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -241,7 +241,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources including metadata, metrics, and dimensions data from the Google Analytics API. -```python +```py @dlt.source(max_table_nesting=2) def google_analytics( credentials: Union[ GcpOAuthCredentials, GcpServiceAccountCredential ] = dlt.secrets.value, @@ -250,6 +250,7 @@ def google_analytics( start_date: Optional[str] = START_DATE, rows_per_page: int = 1000, ) -> List[DltResource]: + ... ``` `credentials`: GCP OAuth or service account credentials. @@ -269,9 +270,10 @@ set to 1000. This function retrieves all the metrics and dimensions for a report from a Google Analytics project. -```python +```py @dlt.resource(selected=False) def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: + ... ``` `client`: This is the Google Analytics client used to make requests. @@ -284,7 +286,7 @@ def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: This transformer function extracts data using metadata and populates a table called "metrics" with the data from each metric. -```python +```py @dlt.transformer(data_from=get_metadata, write_disposition="replace", name="metrics") def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: for metric in metadata.metrics: @@ -304,7 +306,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_analytics", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -317,7 +319,7 @@ verified source. 1. 
To load all the data from metrics and dimensions: - ```python + ```py load_data = google_analytics() load_info = pipeline.run(load_data) print(load_info) @@ -328,7 +330,7 @@ verified source. 1. To load data from a specific start date: - ```python + ```py load_data = google_analytics(start_date='2023-01-01') load_info = pipeline.run(load_data) print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 2a5d4b03ab..be12f5aea4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -87,7 +87,7 @@ follow these steps: 1. Add the following scope: - ``` + ```text "https://www.googleapis.com/auth/spreadsheets.readonly" ``` @@ -98,7 +98,7 @@ follow these steps: After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". To generate the refresh token, run the following script from the root folder: - ```bash + ```sh python google_sheets/setup_script_gcp_oauth.py ``` @@ -128,13 +128,13 @@ following: When setting up the pipeline, you can use either the browser-copied URL of your spreadsheet: -```bash +```sh https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing ``` or spreadsheet id (which is a part of the url) -```bash +```sh 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4 ``` @@ -183,7 +183,7 @@ converted into tables, named after them and stored in the destination. 1. In range_names, you can enter as follows: - ``` + ```text range_names = ["Range_1","Range_2","Sheet1!A1:D10"] ``` @@ -214,7 +214,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init google_sheets duckdb ``` @@ -296,20 +296,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python google_sheets_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -328,7 +328,7 @@ Also, since recently `dlt`'s no longer recognizing date and time types, so you h Use the `apply_hints` method on the resource to achieve this. Here's how you can do it: -```python +```py for resource in resources: resource.apply_hints(columns={ "total_amount": {"data_type": "double"}, @@ -340,7 +340,7 @@ This will ensure that all values in the `total_amount` column are treated as `do And `date` column will be represented as dates, not integers. For a single resource (e.g. `Sheet1`), you can simply use: -```python +```py source.Sheet1.apply_hints(columns={ "total_amount": {"data_type": "double"}, "date": {"data_type": "timestamp"}, @@ -348,7 +348,7 @@ source.Sheet1.apply_hints(columns={ ``` To get the name of resources, you can use: -```python +```py print(source.resources.keys()) ``` @@ -371,7 +371,7 @@ or set `full_refresh=True`. This function loads data from a Google Spreadsheet. It retrieves data from all specified ranges, whether explicitly defined or named, and obtains metadata for the first two rows within each range. 
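Before the full signature of `google_spreadsheet` below, here is a minimal end-to-end sketch that combines the source with the `apply_hints` calls described above. The spreadsheet URL and range name are placeholders and the import path is assumed:

```py
import dlt

# assumed import path for this verified source
from google_sheets import google_spreadsheet

pipeline = dlt.pipeline(
    pipeline_name="google_sheets",
    destination="duckdb",
    dataset_name="sample_google_sheet_data",
)

source = google_spreadsheet(
    "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0",  # placeholder URL
    range_names=["Sheet1!A1:D100"],  # placeholder range
    get_named_ranges=False,
)

# ensure amounts load as doubles and dates as timestamps in every selected range
for resource in source.resources.values():
    resource.apply_hints(columns={
        "total_amount": {"data_type": "double"},
        "date": {"data_type": "timestamp"},
    })

load_info = pipeline.run(source)
print(load_info)
```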
-```python +```py def google_spreadsheet( spreadsheet_url_or_id: str = dlt.config.value, range_names: Sequence[str] = dlt.config.value, @@ -381,6 +381,7 @@ def google_spreadsheet( get_sheets: bool = False, get_named_ranges: bool = True, ) -> Iterable[DltResource]: + ... ``` `spreadsheet_url_or_id`: ID or URL of the Google Spreadsheet. @@ -399,7 +400,7 @@ def google_spreadsheet( This function processes each range name provided by the source function, loading its data into separate tables in the destination. -```python +```py dlt.resource( process_range(rows_data, headers=headers, data_types=data_types), name=name, @@ -429,7 +430,7 @@ This table refreshes after each load, storing information on loaded ranges: - Range name as given to the source. - String and parsed representation of the loaded range. -```python +```py dlt.resource( metadata_table, write_disposition="merge", @@ -457,7 +458,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_sheets", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -467,7 +468,7 @@ verified source. 1. To load data from explicit range names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["range_name1", "range_name2"], # Range names @@ -483,7 +484,7 @@ verified source. 1. To load all the range_names from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=False, @@ -497,7 +498,7 @@ verified source. 1. To load all the sheets from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -511,7 +512,7 @@ verified source. 1. To load all the sheets and range_names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -525,7 +526,7 @@ verified source. 1. To load data from multiple spreadsheets: - ```python + ```py load_data1 = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -543,7 +544,7 @@ verified source. 1. To load with table rename: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -554,7 +555,6 @@ verified source. load_info = pipeline.run(load_data) print(load_info) - } ``` ### Using Airflow with Google Spreadsheets: @@ -583,7 +583,7 @@ Below is the correct way to set up an Airflow DAG for this purpose: - When adding the Google Spreadsheet task to the pipeline, avoid decomposing it; run it as a single task for efficiency. 
-```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md index 3a623c7b49..8a6e1d1bb3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md @@ -55,7 +55,7 @@ Follow these steps: - Read scopes for CMS, CRM, and Settings. - Permissions for: - ``` + ```text business-intelligence, actions, crm.export, e-commerce, oauth, tickets ``` @@ -74,7 +74,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init hubspot duckdb ``` @@ -115,16 +115,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python hubspot_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may @@ -148,12 +148,13 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load companies, contacts, deals, tickets, products, and web analytics events data into the destination. -```python +```py @dlt.source(name="hubspot") def hubspot( api_key: str = dlt.secrets.value, include_history: bool = False, ) -> Sequence[DltResource]: + ... ``` `api_key`: The key used to authenticate with the HubSpot API. Configured in "secrets.toml". @@ -166,7 +167,7 @@ specified entities. This resource function fetches data from the "companies" endpoint and loads it to the destination, replacing any existing data. -```python +```py @dlt.resource(name="companies", write_disposition="replace") def companies( api_key: str = api_key, @@ -195,7 +196,7 @@ in addition to the custom properties. Similar to this, resource functions "conta This function loads web analytics events for specific objects from Hubspot API into the destination. -```python +```py @dlt.resource def hubspot_events_for_objects( object_type: THubspotObjectType, @@ -203,6 +204,7 @@ def hubspot_events_for_objects( api_key: str = dlt.secrets.value, start_date: pendulum.DateTime = STARTDATE, ) -> DltResource: + ... ``` `object_type`: One of the Hubspot object types as defined in @@ -225,7 +227,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="hubspot", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -238,7 +240,7 @@ verified source. 1. To load all the data from contacts, companies, deals, products, tickets, and quotes into the destination. - ```python + ```py load_data = hubspot() load_info = pipeline.run(load_data) print(load_info) @@ -246,7 +248,7 @@ verified source. 1. To load data from contacts and companies, with time history using "with_resources" method. 
- ```python + ```py load_data = hubspot(include_history=True).with_resources("companies","contacts") load_info = pipeline.run(load_data) print(load_info) @@ -256,7 +258,7 @@ verified source. 1. By default, all the custom properties of a CRM object are extracted. If you want only particular fields, set the flag `include_custom_props=False` and add a list of properties with the `props` arg. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["date_of_birth", "degree"], include_custom_props=False) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -264,7 +266,7 @@ verified source. 1. If you want to read all the custom properties of CRM objects and some additional (e.g. Hubspot driven) properties. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["hs_content_membership_email", "hs_content_membership_email_confirmed"]) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -273,7 +275,7 @@ verified source. 1. To load the web analytics events of a given object type. - ```python + ```py resource = hubspot_events_for_objects("company", ["7086461639", "7086464459"]) # Here, object type : company, and object ids : 7086461639 and 7086464459 load_info = pipeline.run([resource]) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index 75106df609..668d1ec470 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init inbox duckdb ``` @@ -112,7 +112,7 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` @@ -128,7 +128,7 @@ For more information, read the 2. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `standard_inbox`, you may also @@ -145,7 +145,7 @@ For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs This function fetches inbox emails, saves attachments locally, and returns uids, messages, and attachments as resources. -```python +```py @dlt.source def inbox_source( host: str = dlt.secrets.value, @@ -158,6 +158,7 @@ def inbox_source( filter_by_mime_type: Sequence[str] = None, chunksize: int = DEFAULT_CHUNK_SIZE, ) -> Sequence[DltResource]: + ... ``` `host` : IMAP server hostname. Default: 'dlt.secrets.value'. @@ -182,13 +183,14 @@ def inbox_source( This resource collects email message UIDs (Unique IDs) from the mailbox. -```python +```py @dlt.resource(name="uids") def get_messages_uids( initial_message_num: Optional[ dlt.sources.incremental[int] ] = dlt.sources.incremental("message_uid", initial_value=1), ) -> TDataItem: + ... ``` `initial_message_num`: provides incremental loading on UID. @@ -197,12 +199,13 @@ def get_messages_uids( This resource retrieves emails by UID (Unique IDs), yielding a dictionary with metadata like UID, ID, sender, subject, dates, content type, and body. -```python +```py @dlt.transformer(name="messages", primary_key="message_uid") def get_messages( items: TDataItems, include_body: bool = True, ) -> TDataItem: + ... 
``` `items`: An iterable containing dictionaries with 'message_uid' representing the email message UIDs. @@ -214,7 +217,7 @@ def get_messages( Similar to the previous resources, resource `get_attachments` extracts email attachments by UID from the IMAP server. It yields file items with attachments in the file_content field and the original email in the message field. -```python +```py @dlt.transformer( name="attachments", primary_key="file_hash", @@ -222,6 +225,7 @@ It yields file items with attachments in the file_content field and the original def get_attachments( items: TDataItems, ) -> Iterable[List[FileItem]]: + ... ``` `items`: An iterable containing dictionaries with 'message_uid' representing the email message UIDs. @@ -236,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_inbox", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -250,7 +254,7 @@ verified source. - Set `DEFAULT_START_DATE = pendulum.datetime(2023, 10, 1)` in `./inbox/settings.py`. - Use the following code: - ```python + ```py # Retrieve messages from the specified email address. messages = inbox_source(filter_emails=("mycreditcard@bank.com",)).messages # Configure messages to exclude body and name the result "my_inbox". @@ -263,7 +267,7 @@ verified source. > Please refer to inbox_source() docstring for email filtering options by sender, date, or mime type. 3. To load messages from multiple emails, including "community@dlthub.com": - ```python + ```py messages = inbox_source( filter_emails=("mycreditcard@bank.com", "community@dlthub.com.") ).messages @@ -272,7 +276,7 @@ verified source. 4. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. Using the `pdf_to_text` function to load parsed pdfs from mail to the database: - ```python + ```py filter_emails = ["mycreditcard@bank.com", "community@dlthub.com."] # Email senders attachments = inbox_source( filter_emails=filter_emails, filter_by_mime_type=["application/pdf"] diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index c796014835..068251a927 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -51,7 +51,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init jira duckdb ``` @@ -102,16 +102,16 @@ For more information, read [General Usage: Credentials.](../../general-usage/cre 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python jira_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also @@ -134,13 +134,14 @@ it is important to note the complete list of the default endpoints given in This source function creates a list of resources to load data into the destination. 
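The full signature is shown below. The credentials are normally read from `.dlt/secrets.toml`, but as a hedged sketch they can also be passed explicitly (all values here are placeholders, and the import path is assumed):

```py
import dlt

# assumed import path for this verified source
from jira import jira

pipeline = dlt.pipeline(
    pipeline_name="jira_pipeline",
    destination="duckdb",
    dataset_name="jira",
)

# placeholder credentials; prefer keeping them in .dlt/secrets.toml
source = jira(
    subdomain="my-company",
    email="user@example.com",
    api_token="YOUR_API_TOKEN",
)

load_info = pipeline.run(source.with_resources("issues"))
print(load_info)
```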
-```python +```py @dlt.source def jira( subdomain: str = dlt.secrets.value, email: str = dlt.secrets.value, api_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` - `subdomain`: The subdomain of the Jira account. Configured in ".dlt/secrets.toml". @@ -152,13 +153,14 @@ def jira( This function returns a resource for querying issues using JQL [(Jira Query Language)](https://support.atlassian.com/jira-service-management-cloud/docs/use-advanced-search-with-jira-query-language-jql/). -```python +```py @dlt.source def jira_search( subdomain: str = dlt.secrets.value, email: str = dlt.secrets.value, api_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` The above function uses the same arguments `subdomain`, `email`, and `api_token` as described above @@ -168,7 +170,7 @@ for the [jira source](jira.md#source-jira). The resource function searches issues using JQL queries and then loads them to the destination. -```python +```py @dlt.resource(write_disposition="replace") def issues(jql_queries: List[str]) -> Iterable[TDataItem]: api_path = "rest/api/3/search" @@ -186,7 +188,7 @@ above. about pipeline configuration, please refer to our documentation [here](https://dlthub.com/docs/general-usage/pipeline): - ```python + ```py pipeline = dlt.pipeline( pipeline_name="jira_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,7 +198,7 @@ above. 2. To load custom endpoints such as ā€œissuesā€ and ā€œusersā€ using the jira source function: - ```python + ```py #Run the pipeline load_info = pipeline.run(jira().with_resources("issues","users")) print(f"Load Information: {load_info}") @@ -205,11 +207,11 @@ above. 3. To load the custom issues using JQL queries, you can use custom queries. Here is an example below: - ```python + ```py # Define the JQL queries as follows queries = [ "created >= -30d order by created DESC", - "created >= -30d AND project = DEV AND issuetype = Epic AND status = "In Progress" order by created DESC", + 'created >= -30d AND project = DEV AND issuetype = Epic AND status = "In Progress" order by created DESC', ] # Run the pipeline load_info = pipeline.run(jira_search().issues(jql_queries=queries)) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index 5bff03e357..87fa2d6927 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -38,7 +38,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init kafka duckdb ``` @@ -80,20 +80,20 @@ sasl_password="example_secret" 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 2. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python kafka_pipeline.py ``` 3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -108,7 +108,7 @@ For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/ This function retrieves messages from the given Kafka topics. 
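The full signature follows below. As a quick sketch, the batching behaviour can be tuned when calling the resource; the topic name is a placeholder, the import path is assumed, and the broker credentials are expected to come from `.dlt/secrets.toml`:

```py
import dlt

# assumed import path for this verified source
from kafka import kafka_consumer

pipeline = dlt.pipeline(
    pipeline_name="kafka",
    destination="duckdb",
    dataset_name="kafka_messages",
)

# read larger batches and wait up to 10 seconds for each batch to fill up
data = kafka_consumer("purchases", batch_size=5000, batch_timeout=10)

load_info = pipeline.run(data)
print(load_info)
```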
-```python +```py @dlt.resource(name="kafka_messages", table_name=lambda msg: msg["_kafka"]["topic"], standalone=True) def kafka_consumer( topics: Union[str, List[str]], @@ -118,6 +118,7 @@ def kafka_consumer( batch_timeout: Optional[int] = 3, start_from: Optional[TAnyDateTime] = None, ) -> Iterable[TDataItem]: + ... ``` `topics`: A list of Kafka topics to be extracted. @@ -151,7 +152,7 @@ this offset. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kafka", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -161,7 +162,7 @@ this offset. 2. To extract several topics: - ```python + ```py topics = ["topic1", "topic2", "topic3"] source = kafka_consumer(topics) @@ -170,7 +171,7 @@ this offset. 3. To extract messages and process them in a custom way: - ```python + ```py def custom_msg_processor(msg: confluent_kafka.Message) -> Dict[str, Any]: return { "_kafka": { @@ -187,7 +188,7 @@ this offset. 4. To extract messages, starting from a timestamp: - ```python + ```py data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15)) pipeline.run(data) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md index 45841850c6..8be748b1a3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md @@ -44,7 +44,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init matomo duckdb ``` @@ -102,16 +102,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python matomo_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also @@ -128,7 +128,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function executes and loads a set of reports defined in "queries" for a specific Matomo site identified by "site_id". -```python +```py @dlt.source(max_table_nesting=2) def matomo_reports( api_token: str = dlt.secrets.value, @@ -136,6 +136,7 @@ def matomo_reports( queries: List[DictStrAny] = dlt.config.value, site_id: int = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `api_token`: API access token for Matomo server authentication, defaults to "./dlt/secrets.toml" @@ -152,7 +153,7 @@ def matomo_reports( The function loads visits from current day and the past `initial_load_past_days` in first run. In subsequent runs it continues from last load and skips active visits until closed. -```python +```py def matomo_visits( api_token: str = dlt.secrets.value, url: str = dlt.config.value, @@ -162,6 +163,7 @@ def matomo_visits( visit_max_duration_seconds: int = 3600, get_live_event_visitors: bool = False, ) -> List[DltResource]: + ... ``` `api_token`: API token for authentication, defaulting to "./dlt/secrets.toml". 
@@ -184,7 +186,7 @@ def matomo_visits( This function retrieves site visits within a specified timeframe. If a start date is given, it begins from that date. If not, it retrieves all visits up until now. -```python +```py @dlt.resource( name="visits", write_disposition="append", primary_key="idVisit", selected=True ) @@ -196,6 +198,7 @@ def get_last_visits( visit_max_duration_seconds: int = 3600, rows_per_page: int = 2000, ) -> Iterator[TDataItem]: + ... ``` `site_id`: Unique ID for each Matomo site. @@ -215,7 +218,7 @@ def get_last_visits( This function, retrieves unique visit information from get_last_visits. -```python +```py @dlt.transformer( data_from=get_last_visits, write_disposition="merge", @@ -225,6 +228,7 @@ This function, retrieves unique visit information from get_last_visits. def get_unique_visitors( visits: List[DictStrAny], client: MatomoAPIClient, site_id: int ) -> Iterator[TDataItem]: + ... ``` `visits`: Recent visit data within the specified timeframe. @@ -242,7 +246,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="matomo", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -255,7 +259,7 @@ verified source. 1. To load the data from reports. - ```python + ```py data_reports = matomo_reports() load_info = pipeline_reports.run(data_reports) print(load_info) @@ -264,7 +268,7 @@ verified source. 1. To load custom data from reports using queries. - ```python + ```py queries = [ { "resource_name": "custom_report_name", @@ -285,7 +289,7 @@ verified source. 1. To load data from reports and visits. - ```python + ```py data_reports = matomo_reports() data_events = matomo_visits() load_info = pipeline_reports.run([data_reports, data_events]) @@ -294,7 +298,7 @@ verified source. 1. To load data on live visits and visitors, and only retrieve data from today. - ```python + ```py load_data = matomo_visits(initial_load_past_days=1, get_live_event_visitors=True) load_info = pipeline_events.run(load_data) print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index 9178d2ab6d..a30eb3f248 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -66,30 +66,30 @@ Here are the typical ways to configure MongoDB and their connection URLs: 1. Connect to MongoDB: - ```bash + ```sh mongo "mongodb://dbuser:passwd@your_host:27017" ``` 1. List all Databases: - ```bash + ```sh show dbs ``` 1. View Collections in a Database: 1. Switch to Database: - ```bash + ```sh use your_database_name ``` 1. Display its Collections: - ```bash + ```sh show collections ``` 1. Disconnect: - ```bash + ```sh exit ``` @@ -115,7 +115,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init mongodb duckdb ``` @@ -174,16 +174,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python mongodb_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `local_mongo`, you may also @@ -200,7 +200,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads data from a MongoDB database, yielding one or multiple collections to be retrieved. -```python +```py @dlt.source def mongodb( connection_url: str = dlt.secrets.value, @@ -209,6 +209,7 @@ def mongodb( incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg] write_disposition: Optional[str] = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `connection_url`: MongoDB connection URL. @@ -226,7 +227,7 @@ def mongodb( This function fetches a single collection from a MongoDB database using PyMongo. -```python +```py def mongodb_collection( connection_url: str = dlt.secrets.value, database: Optional[str] = dlt.config.value, @@ -234,6 +235,7 @@ def mongodb_collection( incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg] write_disposition: Optional[str] = dlt.config.value, ) -> Any: + ... ``` `collection`: Name of the collection to load. @@ -247,7 +249,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mongodb_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -257,7 +259,7 @@ verified source. 1. To load all the collections in a database: - ```python + ```py load_data = mongodb() load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -265,7 +267,7 @@ verified source. 1. To load a specific collections from the database: - ```python + ```py load_data = mongodb().with_resources("collection_1", "collection_2") load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -273,7 +275,7 @@ verified source. 1. To load specific collections from the source incrementally: - ```python + ```py load_data = mongodb(incremental=dlt.sources.incremental("date")).with_resources("collection_1") load_info = pipeline.run(load_data, write_disposition = "merge") print(load_info) @@ -282,7 +284,7 @@ verified source. 1. To load data from a particular collection say "movies" incrementally: - ```python + ```py load_data = mongodb_collection( collection="movies", incremental=dlt.sources.incremental( @@ -300,7 +302,7 @@ verified source. 1. To incrementally load a table with an append-only disposition using hints: - ```python + ```py # Suitable for tables where new rows are added, but existing rows aren't updated. # Load data from the 'listingsAndReviews' collection in MongoDB, using 'last_scraped' for incremental addition. airbnb = mongodb().with_resources("listingsAndReviews") @@ -317,7 +319,7 @@ verified source. 1. 
To load a selected collection and rename it in the destination: - ```python + ```py # Create the MongoDB source and select the "collection_1" collection source = mongodb().with_resources("collection_1") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index a713121f29..338611e657 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -46,7 +46,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init mux duckdb ``` @@ -88,16 +88,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python mux_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -115,7 +115,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function yields resources "asset_resource" and "views_resource" to load video assets and views. -```python +```py @dlt.source def mux_source() -> Iterable[DltResource]: yield assets_resource @@ -126,13 +126,14 @@ def mux_source() -> Iterable[DltResource]: The assets_resource function fetches metadata about video assets from the Mux API's "assets" endpoint. -```python +```py @dlt.resource(write_disposition="merge") def assets_resource( mux_api_access_token: str = dlt.secrets.value, mux_api_secret_key: str = dlt.secrets.value, limit: int = DEFAULT_LIMIT, ) -> Iterable[TDataItem]: + ... ``` `mux_api_access_token`: Mux API token for authentication, defaults to ".dlt/secrets.toml". @@ -145,13 +146,14 @@ def assets_resource( This function yields data about every video view from yesterday to be loaded. -```python +```py @dlt.resource(write_disposition="append") def views_resource( mux_api_access_token: str = dlt.secrets.value, mux_api_secret_key: str = dlt.secrets.value, limit: int = DEFAULT_LIMIT, ) -> Iterable[DltResource]: + ... ``` The arguments `mux_api_access_token`, `mux_api_secret_key` and `limit` are the same as described [above](#resource-assets_resource) in "asset_resource". @@ -165,7 +167,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mux_pipeline", # Use a custom name if desired destination="bigquery", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -175,21 +177,21 @@ verified source. 1. To load metadata about every asset to be loaded: - ```python - load_info = pipeline.run(mux_source().with_resources("assets_resource") + ```py + load_info = pipeline.run(mux_source().with_resources("assets_resource")) print(load_info) ``` 1. To load data for each video view from yesterday: - ```python - load_info = pipeline.run(mux_source().with_resources("views_resource") + ```py + load_info = pipeline.run(mux_source().with_resources("views_resource")) print(load_info) ``` 1. 
To load both metadata about assets and video views from yesterday: - ```python + ```py load_info = pipeline.run(mux_source()) print(load_info) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index ffb0becfbb..650fc10fde 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init notion duckdb ``` @@ -93,16 +93,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python notion_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `notion`, you may also use any @@ -119,12 +119,13 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads notion databases from notion into the destination. -```python +```py @dlt.source def notion_databases( database_ids: Optional[List[Dict[str, str]]] = None, api_key: str = dlt.secrets.value, ) -> Iterator[DltResource]: + ... ``` `database_ids`: A list of dictionaries each containing a database id and a name. @@ -146,7 +147,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="notion", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -159,7 +160,7 @@ verified source. 1. To load all the integrated databases: - ```python + ```py load_data = notion_databases() load_info = pipeline.run(load_data) print(load_info) @@ -167,7 +168,7 @@ verified source. 1. To load the custom databases: - ```python + ```py selected_database_ids = [{"id": "0517dae9409845cba7d","use_name":"db_one"}, {"id": "d8ee2d159ac34cfc"}] load_data = notion_databases(database_ids=selected_database_ids) load_info = pipeline.run(load_data) @@ -176,7 +177,7 @@ verified source. The Database ID can be retrieved from the URL. For example if the URL is: - ```shell + ```sh https://www.notion.so/d8ee2d159ac34cfc85827ba5a0a8ae71?v=c714dec3742440cc91a8c38914f83b6b ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md index 6fae36d0ec..af951bd21a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init personio duckdb ``` @@ -102,16 +102,16 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! 
To get started, run the following command: - ```bash + ```sh python personio_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `personio`, you may also use @@ -127,7 +127,7 @@ For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) ### Source `personio_source` This `dlt` source returns data resources like `employees`, `absences`, `absence_types`, etc. -```python +```py @dlt.source(name="personio") def personio_source( client_id: str = dlt.secrets.value, @@ -158,8 +158,8 @@ def personio_source( This resource retrieves data on all the employees in a company. -```python - @dlt.resource(primary_key="id", write_disposition="merge") +```py +@dlt.resource(primary_key="id", write_disposition="merge") def employees( updated_at: dlt.sources.incremental[ pendulum.DateTime @@ -185,9 +185,10 @@ data incrementally from the Personio API to your preferred destination. ### Resource `absence_types` Simple resource, which retrieves a list of various types of employee absences. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]: + ... ... ``` @@ -209,7 +210,7 @@ The transformer functions transform or process data from resources. The transformer function `employees_absences_balance` process data from the `employees` resource. It fetches and returns a list of the absence balances for each employee. -```python +```py @dlt.transformer( data_from=employees, write_disposition="merge", @@ -232,7 +233,7 @@ verified source. 1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="personio", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -242,14 +243,14 @@ verified source. 1. To load employee data: - ```python + ```py load_data = personio_source().with_resources("employees") print(pipeline.run(load_data)) ``` 1. To load data from all supported endpoints: - ```python + ```py load_data = personio_source() print(pipeline.run(load_data)) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 1da5205471..9b2c8a640f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -53,7 +53,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init pipedrive duckdb ``` @@ -93,16 +93,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python pipedrive_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `pipedrive`, but you may also use @@ -138,12 +138,13 @@ Pipedrive API. 
This function returns a list of resources including activities, deals, custom_fields_mapping and other resources data from Pipedrive API. -```python +```py @dlt.source(name="pipedrive") def pipedrive_source( pipedrive_api_key: str = dlt.secrets.value, since_timestamp: Optional[Union[pendulum.DateTime, str]] = dlt.config.value, ) -> Iterator[DltResource]: + ... ``` `pipedrive_api_key`: Authentication token for Pipedrive, configured in ".dlt/secrets.toml". @@ -159,7 +160,7 @@ This code generates resources for each entity in [RECENTS_ENTITIES](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive/settings.py), stores them in endpoints_resources, and then loads data from each endpoint to the destination. -```python +```py endpoints_resources = {} for entity, resource_name in RECENTS_ENTITIES.items(): endpoints_resources[resource_name] = dlt.resource( @@ -186,7 +187,7 @@ for entity, resource_name in RECENTS_ENTITIES.items(): This function gets the participants of deals from the Pipedrive API and yields the result. -```python +```py def pipedrive_source(args): # Rest of function yield endpoints_resources["deals"] | dlt.transformer( @@ -209,12 +210,13 @@ further processing or loading. This function preserves the mapping of custom fields across different pipeline runs. It is used to create and store a mapping of custom fields for different entities in the source state. -```python +```py @dlt.resource(selected=False) def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]: def _get_pages_for_rename( entity: str, fields_entity: str, pipedrive_api_key: str ) -> Dict[str, Any]: + ... ``` It processes each entity in ENTITY_MAPPINGS, updating the custom fields mapping if a related fields @@ -238,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="pipedrive", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -251,7 +253,7 @@ verified source. 1. To print source info: - ```python + ```py pipedrive_data = pipedrive_source() #print source info print(pipedrive_data) @@ -263,7 +265,7 @@ verified source. 1. To load all the data in Pipedrive: - ```python + ```py load_data = pipedrive_source() # calls the source function load_info = pipeline.run(load_data) #runs the pipeline with selected source configuration print(load_info) @@ -271,7 +273,7 @@ verified source. 1. To load data from selected resources: - ```python + ```py #To load custom fields, include custom_fields_mapping for hash to name mapping. load_data = pipedrive_source().with_resources("products", "deals", "deals_participants", "custom_fields_mapping") load_info = pipeline.run(load_data) #runs the pipeline loading selected data @@ -280,7 +282,7 @@ verified source. 1. To load data from a start date: - ```python + ```py # Configure a source for 'activities' starting from the specified date. # The 'custom_fields_mapping' is incorporated to convert custom field hashes into their respective names. activities_source = pipedrive_source( diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index aa8fbe10d4..7d6b6e036a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -63,7 +63,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init salesforce duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python salesforce_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `salesforce`, you may also use @@ -137,13 +137,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load users, user_role, opportunity, opportunity_line_item, account etc. data from Salesforce API. -```python +```py @dlt.source(name="salesforce") def salesforce_source( user_name: str = dlt.secrets.value, password: str = dlt.secrets.value, security_token: str = dlt.secrets.value, ) ->Iterable[DltResource]: + ... ``` - `user_name`: Your Salesforce account username. @@ -156,7 +157,7 @@ def salesforce_source( This resource function retrieves records from the Salesforce "User" endpoint. -```python +```py @dlt.resource(write_disposition="replace") def sf_user() -> Iterator[Dict[str, Any]]: yield from get_records(client, "User") @@ -176,7 +177,7 @@ the "user_role" endpoint. This resource function retrieves records from the Salesforce "Opportunity" endpoint in incremental mode. -```python +```py @dlt.resource(write_disposition="merge") def opportunity( last_timestamp: Incremental[str] = dlt.sources.incremental( @@ -215,7 +216,7 @@ To create your data pipeline using single loading and 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="salesforce_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -228,7 +229,7 @@ To create your data pipeline using single loading and 1. To load data from all the endpoints, use the `salesforce_source` method as follows: - ```python + ```py load_data = salesforce_source() source.schema.merge_hints({"not_null": ["id"]}) # Hint for id field not null load_info = pipeline.run(load_data) @@ -241,7 +242,7 @@ To create your data pipeline using single loading and 1. To use the method `pipeline.run()` to load custom endpoints ā€œcandidatesā€ and ā€œmembersā€: - ```python + ```py load_info = pipeline.run(load_data.with_resources("opportunity", "contact")) # print the information on data that was loaded print(load_info) @@ -260,7 +261,7 @@ To create your data pipeline using single loading and 1. 
To load data from the ā€œcontactā€ in replace mode and ā€œtaskā€ incrementally merge mode endpoints: - ```python + ```py load_info = pipeline.run(load_data.with_resources("contact", "task")) # pretty print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index 09dc392c87..af00b17703 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -61,7 +61,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init shopify_dlt duckdb ``` @@ -125,16 +125,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python shopify_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also @@ -152,7 +152,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load products, orders, and customers data from Shopify API. -```python +```py def shopify_source( private_app_password: str = dlt.secrets.value, api_version: str = DEFAULT_API_VERSION, @@ -163,6 +163,7 @@ def shopify_source( items_per_page: int = DEFAULT_ITEMS_PER_PAGE, order_status: TOrderStatus = "any", ) -> Iterable[DltResource]: + ... ``` `private_app_password`: App's password for your shop. @@ -188,7 +189,7 @@ incremental loading if unspecified. This resource loads products from your Shopify shop into the destination. It supports incremental loading and pagination. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def products( updated_at: dlt.sources.incremental[ @@ -202,6 +203,7 @@ def products( created_at_min: pendulum.DateTime = created_at_min_obj, items_per_page: int = items_per_page, ) -> Iterable[TDataItem]: + ... ``` `updated_at`: The saved [state](../../general-usage/state) of the last 'updated_at' value. @@ -212,7 +214,7 @@ support incremental loading and pagination. ### Resource `shopify_partner_query`: This resource can be used to run custom GraphQL queries to load paginated data. -```python +```py @dlt.resource def shopify_partner_query( query: str, @@ -224,6 +226,7 @@ def shopify_partner_query( organization_id: str = dlt.config.value, api_version: str = DEFAULT_PARTNER_API_VERSION, ) -> Iterable[TDataItem]: + ... ``` `query`: The GraphQL query for execution. @@ -251,7 +254,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="shopify", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -264,7 +267,7 @@ verified source. 1. To load data from "products", "orders" and "customers" from 1st Jan 2023. - ```python + ```py # Add your desired resources to the list... 
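      # (only the resources named here are selected for loading via with_resources below)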
resources = ["products", "orders", "customers"] start_date="2023-01-01" @@ -278,7 +281,7 @@ verified source. minimizes potential failure during large data loads. Running chunks and incremental loads in parallel accelerates the initial load. - ```python + ```py # Load all orders from 2023-01-01 to now min_start_date = current_start_date = pendulum.datetime(2023, 1, 1) max_end_date = pendulum.now() @@ -310,7 +313,7 @@ verified source. print(load_info) ``` 1. To load the first 10 transactions via GraphQL query from the Shopify Partner API. - ```python + ```py # Construct query to load transactions 100 per page, the `$after` variable is used to paginate query = """query Transactions($after: String) { transactions(after: $after, first: 10) { diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 647e39a427..104eeff388 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init slack duckdb ``` @@ -107,20 +107,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python slack_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -138,7 +138,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage It retrieves data from Slack's API and fetches the Slack data such as channels, messages for selected channels, users, logs. -```python +```py @dlt.source(name="slack", max_table_nesting=2) def slack_source( page_size: int = MAX_PAGE_SIZE, @@ -147,6 +147,7 @@ def slack_source( end_date: Optional[TAnyDateTime] = None, selected_channels: Optional[List[str]] = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `page_size`: Maximum items per page (default: 1000). @@ -163,25 +164,27 @@ def slack_source( This function yields all the channels data as a `dlt` resource. -```python +```py @dlt.resource(name="channels", primary_key="id", write_disposition="replace") def channels_resource() -> Iterable[TDataItem]: + ... ``` ### Resource `users` This function yields all the users data as a `dlt` resource. -```python +```py @dlt.resource(name="users", primary_key="id", write_disposition="replace") def users_resource() -> Iterable[TDataItem]: + ... ``` ### Resource `get_messages_resource` This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with the channel's name. -```python +```py def get_messages_resource( channel_data: Dict[str, Any], created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental( @@ -191,6 +194,7 @@ def get_messages_resource( allow_external_schedulers=True, ), ) -> Iterable[TDataItem]: + ... ``` `channel_data`: A dictionary detailing a specific channel to determine where messages are fetched from. @@ -209,7 +213,7 @@ def get_messages_resource( This method retrieves access logs from the Slack API. 
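Because this resource is declared with `selected=False` (see the decorator below), it is not loaded unless you request it explicitly. A minimal sketch of doing so, with illustrative pipeline settings and dates; the import assumes the verified source files sit next to your script:

```py
import dlt
from datetime import datetime

from slack import slack_source  # assumes the verified source files are in your working directory

pipeline = dlt.pipeline(
    pipeline_name="slack", destination="duckdb", dataset_name="slack_data"
)

# access_logs is not selected by default (and is available for paid accounts only),
# so request it explicitly
source = slack_source(start_date=datetime(2023, 9, 1)).with_resources("access_logs")
print(pipeline.run(source))
```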
-```python +```py @dlt.resource( name="access_logs", selected=False, @@ -218,6 +222,7 @@ This method retrieves access logs from the Slack API. ) # it is not an incremental resource it just has a end_date filter def logs_resource() -> Iterable[TDataItem]: + ... ``` `selected`: A boolean set to False, indicating the resource isn't loaded by default. @@ -235,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="slack", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -244,7 +249,7 @@ verified source. ``` 1. To load Slack resources from the specified start date: - ```python + ```py source = slack_source(page_size=1000, start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8)) # Enable below to load only 'access_logs', available for paid accounts only. @@ -258,7 +263,7 @@ verified source. 1. To load data from selected Slack channels from the specified start date: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. @@ -275,7 +280,7 @@ verified source. 1. To load only messages from selected Slack resources: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 67965863ce..56fc826ce8 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -58,8 +58,8 @@ The database above doesn't require a password. The connection URL can be broken down into: -```python -connection_url = "connection_string = f"{drivername}://{username}:{password}@{host}:{port}/{database}" +```py +connection_url = connection_string = f"{drivername}://{username}:{password}@{host}:{port}{database}" ``` `drivername`: Indicates both the database system and driver used. @@ -116,7 +116,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init sql_database duckdb ``` @@ -158,7 +158,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. You can also pass credentials in the pipeline script the following way: - ```python + ```py credentials = ConnectionStringCredentials( "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ) @@ -176,19 +176,19 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Install the necessary dependencies by running the following command: - ```bash + ```sh pip install -r requirements.txt ``` 1. Run the verified source by entering: - ```bash + ```sh python sql_database_pipeline.py ``` 1. Make sure that everything is loaded as expected with: - ```bash + ```sh dlt pipeline show ``` @@ -208,7 +208,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage This function loads data from an SQL database via SQLAlchemy and auto-creates resources for each table or from a specified list of tables. 
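Before the full signature below, here is a rough usage sketch; the connection string is the public Rfam database used elsewhere on this page, the table selection and pipeline settings are illustrative, and the import assumes the verified source files sit next to your script:

```py
import dlt

from sql_database import sql_database  # assumes the verified source files are in your working directory

pipeline = dlt.pipeline(
    pipeline_name="rfam", destination="duckdb", dataset_name="sql_data"
)

# pass credentials explicitly as a connection string and pick a selected table only
source = sql_database(
    "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
).with_resources("family")

print(pipeline.run(source, write_disposition="replace"))
```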
-```python +```py @dlt.source def sql_database( credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, @@ -220,6 +220,7 @@ def sql_database( defer_table_reflect: Optional[bool] = dlt.config.value, table_adapter_callback: Callable[[Table], None] = None, ) -> Iterable[DltResource]: + ... ``` `credentials`: Database details or an 'sqlalchemy.Engine' instance. @@ -244,7 +245,7 @@ remove certain columns to be selected. This function loads data from specific database tables. -```python +```py @dlt.common.configuration.with_config( sections=("sources", "sql_database"), spec=SqlTableResourceConfiguration ) @@ -259,6 +260,7 @@ def sql_table( defer_table_reflect: Optional[bool] = dlt.config.value, table_adapter_callback: Callable[[Table], None] = None, ) -> DltResource: + ... ``` `incremental`: Optional, enables incremental loading. @@ -284,7 +286,7 @@ certain range. 1. Consider a table with a `last_modified` timestamp column. By setting this column as your cursor and specifying an initial value, the loader generates a SQL query filtering rows with `last_modified` values greater than the specified initial value. - ```python + ```py from sql_database import sql_table from datetime import datetime @@ -303,7 +305,7 @@ certain range. 1. To incrementally load the "family" table using the sql_database source method: - ```python + ```py source = sql_database().with_resources("family") #using the "updated" field as an incremental field using initial value of January 1, 2022, at midnight source.family.apply_hints(incremental=dlt.sources.incremental("updated"),initial_value=pendulum.DateTime(2022, 1, 1, 0, 0, 0)) @@ -315,7 +317,7 @@ certain range. 1. To incrementally load the "family" table using the 'sql_table' resource. - ```python + ```py family = sql_table( table="family", incremental=dlt.sources.incremental( @@ -342,7 +344,7 @@ When running on Airflow ### Parallel extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. -```python +```py database = sql_database().parallelize() table = sql_table().parallelize() ``` @@ -358,7 +360,7 @@ To create your own pipeline, use source and resource methods from this verified 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="rfam", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -370,7 +372,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To load the entire database, use the `sql_database` source as: - ```python + ```py source = sql_database() info = pipeline.run(source, write_disposition="replace") print(info) @@ -378,7 +380,7 @@ To create your own pipeline, use source and resource methods from this verified 1. If you just need the "family" table, use: - ```python + ```py source = sql_database().with_resources("family") #running the pipeline info = pipeline.run(source, write_disposition="replace") @@ -389,7 +391,7 @@ To create your own pipeline, use source and resource methods from this verified [documentation](https://dlthub.com/docs/general-usage/customising-pipelines/pseudonymizing_columns). 
As an example, here's how to pseudonymize the "rfam_acc" column in the "family" table: - ```python + ```py import hashlib def pseudonymize_name(doc): @@ -421,7 +423,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To exclude columns, such as the "rfam_id" column from the "family" table before loading: - ```python + ```py def remove_columns(doc): del doc["rfam_id"] return doc diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index 4ddf20aa78..0ac1fe7acf 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init strapi duckdb ``` @@ -73,7 +73,7 @@ For more information, read the guide on [how to add a verified source](../../wal information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py # put your secret values and credentials here. do not share this file and do not push it to github [sources.strapi] api_secret_key = "api_secret_key" # please set me up! @@ -96,13 +96,13 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python strapi_pipeline.py ``` @@ -113,7 +113,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -131,13 +131,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrives data from Strapi. -```python +```py @dlt.source def strapi_source( endpoints: List[str], api_secret_key: str = dlt.secrets.value, domain: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` `endpoints`: Collections to fetch data from. @@ -155,7 +156,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="strapi", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -165,7 +166,7 @@ verified source. 1. To load the specified endpoints: - ```python + ```py endpoints = ["athletes"] load_data = strapi_source(endpoints=endpoints) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 0b172dc3be..118c0e6511 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init stripe_analytics duckdb ``` @@ -96,20 +96,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. 
You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python stripe_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -127,7 +127,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug You can write your own pipelines to load data to a destination using this verified source. However, it is important to note is how the `ENDPOINTS` and `INCREMENTAL_ENDPOINTS` tuples are defined in `stripe_analytics/settings.py`. -```python +```py # The most popular Stripe API's endpoints ENDPOINTS = ("Subscription", "Account", "Coupon", "Customer", "Product", "Price") # Possible incremental endpoints @@ -140,7 +140,7 @@ INCREMENTAL_ENDPOINTS = ("Event", "Invoice", "BalanceTransaction") This function retrieves data from the Stripe API for the specified endpoint: -```python +```py @dlt.source def stripe_source( endpoints: Tuple[str, ...] = ENDPOINTS, @@ -148,6 +148,7 @@ def stripe_source( start_date: Optional[DateTime] = None, end_date: Optional[DateTime] = None, ) -> Iterable[DltResource]: + ... ``` - `endpoints`: Tuple containing endpoint names. @@ -159,7 +160,7 @@ def stripe_source( This source loads data in 'append' mode from incremental endpoints. -```python +```py @dlt.source def incremental_stripe_source( endpoints: Tuple[str, ...] = INCREMENTAL_ENDPOINTS, @@ -167,6 +168,7 @@ def incremental_stripe_source( initial_start_date: Optional[DateTime] = None, end_date: Optional[DateTime] = None, ) -> Iterable[DltResource]: + ... ``` `endpoints`: Tuple containing incremental endpoint names. @@ -183,9 +185,10 @@ For more information, read the [General Usage: Incremental loading](../../genera This function loads a dictionary with calculated metrics, including MRR and Churn rate, along with the current timestamp. -```python +```py @dlt.resource(name="Metrics", write_disposition="append", primary_key="created") def metrics_resource() -> Iterable[TDataItem]: + ... ``` Abrevations MRR and Churn rate are as follows: @@ -203,7 +206,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="stripe_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -213,7 +216,7 @@ verified source. 1. To load endpoints like "Plan" and "Charge" in replace mode, retrieve all data for the year 2022: - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 1, 1), @@ -225,7 +228,7 @@ verified source. 1. To load data from the "Invoice" endpoint, which has static data, using incremental loading: - ```python + ```py # Load all data on the first run that was created after start_date and before end_date source_incremental = incremental_stripe_source( endpoints=("Invoice", ), @@ -239,7 +242,7 @@ verified source. 1. To load data created after December 31, 2022, adjust the data range for stripe_source to prevent redundant loading. For incremental_stripe_source, the initial_start_date will auto-update to the last loaded date from the previous run. - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 12, 31), @@ -254,7 +257,7 @@ verified source. 1. 
To load important metrics and store them in database: - ```python + ```py # Event is an endpoint with uneditable data, so we can use 'incremental_stripe_source'. source_event = incremental_stripe_source(endpoints=("Event",)) # Subscription is an endpoint with editable data, use stripe_source. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 8701db7db8..dc4c1936f9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -65,7 +65,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init workable duckdb ``` @@ -117,20 +117,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python workable_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -146,7 +146,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug Note the default definitions of DEFAULT_ENDPOINTS and DEFAULT_DETAILS in "workable/settings.py". -```python +```py DEFAULT_ENDPOINTS = ("members", "recruiters", "stages", "requisitions", "jobs", "custom_attributes","events") DEFAULT_DETAILS = { @@ -164,7 +164,7 @@ endpoints allow incremental 'merge' mode loading. This source returns a sequence of dltResources that correspond to the endpoints. -```python +```py @dlt.source(name="workable") def workable_source( access_token: str = dlt.secrets.value, @@ -172,6 +172,7 @@ def workable_source( start_date: Optional[DateTime] = None, load_details: bool = False, ) -> Iterable[DltResource]: + ... ``` `access_token`: Authenticate the Workable API using the token specified in ".dlt/secrets.toml". @@ -187,13 +188,14 @@ def workable_source( This function is used to retrieve "candidates" endpoints. -```python +```py @dlt.resource(name="candidates", write_disposition="merge", primary_key="id") def candidates_resource( updated_at: Optional[Any] = dlt.sources.incremental( "updated_at", initial_value=workable.start_date_iso ) ) -> Iterable[TDataItem]: + ... ``` `updated_at`: Uses the dlt.sources.incremental method. Defaults to the function's start_date or Jan @@ -211,7 +213,7 @@ To create your data pipeline using single loading and 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="workable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -221,7 +223,7 @@ To create your data pipeline using single loading and 1. To load all data: - ```python + ```py load_data = workable_source() load_info = pipeline.run(load_data) print(load_info) @@ -232,7 +234,7 @@ To create your data pipeline using single loading and 1. 
To load data from a specific date, including dependent endpoints: - ```python + ```py load_data = workable_source(start_date=datetime(2022, 1, 1), load_details=True) load_info = pipeline.run(load_data) print(load_info) @@ -244,8 +246,8 @@ To create your data pipeline using single loading and 1. To load custom endpoints ā€œcandidatesā€ and ā€œmembersā€: - ```python - load_info = pipeline.run(load_data.with_resources("candidates", "members") + ```py + load_info = pipeline.run(load_data.with_resources("candidates", "members")) # print the information on data that was loaded print(load_info) ``` @@ -255,7 +257,7 @@ To create your data pipeline using single loading and 1. To load data from the ā€œjobsā€ endpoint and its dependent endpoints like "activities" and "application_form": - ```python + ```py load_data = workable_source(start_date=datetime(2022, 2, 1), load_details=True) # Set the load_details as True to load all the dependent endpoints. load_info = pipeline.run(load_data.with_resources("jobs","jobs_activities","jobs_application_form")) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index 234483dca0..11567306d9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -84,7 +84,7 @@ Here's a summarized version: 1. To get full token using the client id obtained above, you can follow the [instructions here.](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#creating-the-access-token) - ```curl + ```sh curl https://{subdomain}.zendesk.com/api/v2/oauth/tokens.json \ -X POST \ -v -u {email_address}:{password} \ @@ -129,7 +129,7 @@ To generate Zendesk chat OAuth token, please refer to this 1. Record the "CLIENT_ID" and "SUBDOMAIN". 1. Format the below URL with your own CLIENT_ID and SUBDOMAIN, paste it into a new browser tab, and press Enter. - ```bash + ```sh https://www.zopim.com/oauth2/authorizations/new?response_type=token&client_id=CLIENT_ID&scope=read%20write&subdomain=SUBDOMAIN ``` 1. The call will be made, possibly asking you to log in and select 'Allow' to generate the token. @@ -160,7 +160,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init zendesk duckdb ``` @@ -183,7 +183,7 @@ For more information, read the guide on [how to add a verified source.](../../wa information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py #Zendesk support credentials [sources.zendesk.credentials] subdomain = "subdomain" # Zendesk subdomain @@ -215,20 +215,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python zendesk_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -246,13 +246,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves data from Zendesk Talk for phone calls and voicemails. 
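For orientation before the signature below, a minimal sketch that loads only the Talk data; pipeline settings and the start date are illustrative, and the import assumes the verified source files sit next to your script:

```py
import dlt
import pendulum

from zendesk import zendesk_talk  # assumes the verified source files are in your working directory

pipeline = dlt.pipeline(
    pipeline_name="dlt_zendesk_pipeline", destination="duckdb", dataset_name="zendesk_data"
)

# loads calls and voicemails created after the given start date
info = pipeline.run(zendesk_talk(start_date=pendulum.datetime(2023, 1, 1)))
print(info)
```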
-```python +```py @dlt.source(max_table_nesting=2) def zendesk_talk( credentials: TZendeskCredentials = dlt.secrets.value, start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE, end_date: Optional[TAnyDateTime] = None, ) -> Iterable[DltResource]: + ... ``` `credentials`: Authentication credentials. @@ -266,13 +267,14 @@ run. This function loads data from Zendesk talk endpoint. -```python +```py def talk_resource( zendesk_client: ZendeskAPIClient, talk_endpoint_name: str, talk_endpoint: str, pagination_type: PaginationType, ) -> Iterator[TDataItem]: + ... ``` `zendesk_client`: An instance of ZendeskAPIClient for making API calls to Zendesk Talk. @@ -305,7 +307,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -315,7 +317,7 @@ verified source. 1. To load data related to support, talk and chat: - ```python + ```py #zendesk support source function data_support = zendesk_support(load_all=True) # zendesk chat source function @@ -324,23 +326,23 @@ verified source. data_talk = zendesk_talk() # run pipeline with all 3 sources info = pipeline.run([data_support,data_chat,data_talk]) - return info + print(info) ``` 1. To load data related to support, chat and talk in incremental mode: - ```python - pipeline = dlt.pipeline( - pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired - destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) - full_refresh = Fasle - dataset_name="sample_zendesk_data" # Use a custom name if desired + ```py + pipeline = dlt.pipeline( + pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired + destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) + full_refresh = False, + dataset_name="sample_zendesk_data" # Use a custom name if desired ) - data = zendesk_support(load_all=True, start_date=start_date) - data_chat = zendesk_chat(start_date=start_date) - data_talk = zendesk_talk(start_date=start_date) - info = pipeline.run(data=[data, data_chat, data_talk]) - return info + data = zendesk_support(load_all=True, start_date=start_date) + data_chat = zendesk_chat(start_date=start_date) + data_talk = zendesk_talk(start_date=start_date) + info = pipeline.run(data=[data, data_chat, data_talk]) + print(info) ``` > Supports incremental loading for Support, Chat, and Talk Endpoints. By default, it fetches data @@ -350,7 +352,7 @@ verified source. 1. To load historical data in weekly ranges from Jan 1st, 2023, then switch to incremental loading for new tickets. - ```python + ```py # Load ranges of dates to load between January 1st 2023 and today min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") max_end_date = pendulum.today() diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index c61805423b..ffe0abd082 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -12,7 +12,7 @@ To do so, run the [cli command](../../reference/command-line-interface.md#show-t below with your pipeline name. 
The pipeline name is the name of the Python file where your pipeline is defined and also displayed in your terminal when loading: -```bash +```sh dlt pipeline {pipeline_name} show ``` @@ -33,7 +33,7 @@ pipeline and hide many intricacies of correctly setting up the connection to you Execute any SQL query and get results following the Python [dbapi](https://peps.python.org/pep-0249/) spec. Below we fetch data from the customers table: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: with client.execute_query( @@ -54,7 +54,7 @@ natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to tha frames may be really fast! The example below reads GitHub reactions data from the `issues` table and counts reaction types. -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", @@ -79,14 +79,14 @@ The native connection to your destination like BigQuery `Client` or DuckDB `Duck available in case you want to do anything special. Below we take the native connection to `duckdb` to get `DuckDBPyRelation` from a query: -```python +```py import dlt import duckdb pipeline = dlt.pipeline(destination="duckdb", dataset_name="github_reactions") with pipeline.sql_client() as client: conn = client.native_connection - rel = conn.sql('SELECT * FROM issues'); + rel = conn.sql('SELECT * FROM issues') rel.limit(3).show() ``` diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index d80558e745..ac305e943b 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -179,7 +179,7 @@ def load_data_with_retry(pipeline, data): :::warning To run this example you need to provide Slack incoming hook in `.dlt/secrets.toml`: -```python +```py [runtime] slack_incoming_hook="https://hooks.slack.com/services/***" ``` diff --git a/docs/website/docs/examples/google_sheets/index.md b/docs/website/docs/examples/google_sheets/index.md index 4af35f6dac..3bf3f858d8 100644 --- a/docs/website/docs/examples/google_sheets/index.md +++ b/docs/website/docs/examples/google_sheets/index.md @@ -27,7 +27,7 @@ This example is for educational purposes. For best practices, we recommend using ### Install Google client library -```shell +```sh pip install google-api-python-client ``` diff --git a/docs/website/docs/examples/nested_data/index.md b/docs/website/docs/examples/nested_data/index.md index b2b5ee2792..8a5c17604c 100644 --- a/docs/website/docs/examples/nested_data/index.md +++ b/docs/website/docs/examples/nested_data/index.md @@ -26,7 +26,7 @@ We'll learn how to: ### Install pymongo -```shell +```sh pip install pymongo>=4.3.3 ``` diff --git a/docs/website/docs/examples/pdf_to_weaviate/index.md b/docs/website/docs/examples/pdf_to_weaviate/index.md index cc2ef01e33..5b889b858d 100644 --- a/docs/website/docs/examples/pdf_to_weaviate/index.md +++ b/docs/website/docs/examples/pdf_to_weaviate/index.md @@ -14,7 +14,7 @@ import Header from '../_examples-header.md'; Additionally we'll use PyPDF2 to extract text from PDFs. 
Make sure you have it installed: -```shell +```sh pip install PyPDF2 ``` diff --git a/docs/website/docs/examples/qdrant_zendesk/index.md b/docs/website/docs/examples/qdrant_zendesk/index.md index 7920619b26..b71840073b 100644 --- a/docs/website/docs/examples/qdrant_zendesk/index.md +++ b/docs/website/docs/examples/qdrant_zendesk/index.md @@ -28,7 +28,7 @@ First, configure the destination credentials for [Qdrant](https://dlthub.com/doc Next, make sure you have the following dependencies installed: -```commandline +```sh pip install qdrant-client>=1.6.9 pip install fastembed>=0.1.1 ``` @@ -170,13 +170,13 @@ response = qdrant_client.query( The query above gives stores the following results in the `response` variable: -```json +```py [QueryResponse(id='6aeacd21-b3d0-5174-97ef-5aaa59486414', embedding=None, metadata={'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-09-01T11:19:25+00:00', 'custom_status_id': 12765028278545, 'description': 'I have been trying to cancel my subscription but the system wonā€™t let me do it. Can you please help?', 'from_messaging_channel': False, 'generated_timestamp': 1693567167, 'group_id': 12765036328465, 'has_incidents': False, 'id': 12, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'Unable to Cancel Subscription', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'Unable to Cancel Subscription', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-09-01T11:19:25+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/12.json', 'via__channel': 'web'}, document='', score=0.89545774), QueryResponse(id='a22189c1-70ab-5421-938b-1caae3e7d6d8', embedding=None, metadata={'_dlt_id': 'bc/xloksL89EUg', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-07-18T17:23:42+00:00', 'custom_status_id': 12765028278545, 'description': 'ABCDEF', 'from_messaging_channel': False, 'generated_timestamp': 1689701023, 'group_id': 12765036328465, 'has_incidents': False, 'id': 4, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'What is this ticket', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'What is this ticket', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-07-18T17:23:42+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/4.json', 'via__channel': 'web'}, document='', score=0.8643349), QueryResponse(id='ce2f1c5c-41c3-56c3-a31d-2399a7a9239d', embedding=None, metadata={'_dlt_id': 'ZMuFJZo0AJxV4A', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-03-14T10:52:28+00:00', 'custom_status_id': 12765028278545, 'description': 'X', 'from_messaging_channel': False, 'generated_timestamp': 1696163084, 'group_id': 12765036328465, 'has_incidents': False, 'id': 2, 'is_public': True, 'priority': 'high', 'raw_subject': 'SCRUBBED', 'requester_id': 13726460510097, 'status': 'deleted', 'subject': 'SCRUBBED', 'submitter_id': 12765072569105, 'tags': [], 'ticket_form_id': 13726337882769, 'type': 'question', 'updated_at': '2023-09-01T12:10:35+00:00', 'url': 
'https://d3v-dlthub.zendesk.com/api/v2/tickets/2.json', 'via__channel': 'web'}, document='', score=0.8467072)] ``` To get a closer look at what the Zendesk ticket was, and how dlt dealt with it, we can index into the metadata of the first `QueryResponse` object: -```json lines +```py {'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, diff --git a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index 860370d38a..cf23b5d5dc 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -38,7 +38,7 @@ providers. ### Example -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, @@ -133,7 +133,7 @@ current Working Directory**. Example: If your working directory is `my_dlt_project` and your project has the following structure: -``` +```text my_dlt_project: | pipelines/ diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index 07e56b3e14..e93e1c466a 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -21,7 +21,7 @@ service account credentials, while `ConnectionStringCredentials` handles databas As an example, let's use `ConnectionStringCredentials` which represents a database connection string. -```python +```py from dlt.sources.credentials import ConnectionStringCredentials @dlt.source @@ -60,17 +60,17 @@ dsn.password="loader" You can explicitly provide credentials in various forms: -```python +```py query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") # or -query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"...}) +query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"}) ``` ## Built in credentials We have some ready-made credentials you can reuse: -```python +```py from dlt.sources.credentials import ConnectionStringCredentials from dlt.sources.credentials import OAuth2Credentials from dlt.sources.credentials import GcpServiceAccountCredentials, GcpOAuthCredentials @@ -87,7 +87,7 @@ and additional query parameters. This class provides methods for parsing and generating connection strings. #### Usage -```python +```py credentials = ConnectionStringCredentials() # Set the necessary attributes @@ -117,7 +117,7 @@ client secret, refresh token, and access token. It also allows for the addition of scopes and provides methods for client authentication. Usage: -```python +```py credentials = OAuth2Credentials( client_id="CLIENT_ID", client_secret="CLIENT_SECRET", @@ -153,7 +153,7 @@ This class provides methods to retrieve native credentials for Google clients. - You may just pass the `service.json` as string or dictionary (in code and via config providers). - Or default credentials will be used. -```python +```py credentials = GcpServiceAccountCredentials() # Parse a native value (ServiceAccountCredentials) # Accepts a native value, which can be either an instance of ServiceAccountCredentials @@ -163,7 +163,7 @@ native_value = {"private_key": ".."} # or "path/to/services.json" credentials.parse_native_representation(native_value) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpServiceAccountCredentials @@ -204,7 +204,7 @@ serialized OAuth client secrets JSON. 
This class provides methods for authentication and obtaining access tokens. ##### Usage -```python +```py oauth_credentials = GcpOAuthCredentials() # Accepts a native value, which can be either an instance of GoogleOAuth2Credentials @@ -214,7 +214,7 @@ native_value_oauth = {"client_secret": ...} oauth_credentials.parse_native_representation(native_value_oauth) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpOAuthCredentials @@ -277,7 +277,7 @@ It inherits the ability to manage default credentials and extends it with method for handling partial credentials and converting credentials to a botocore session. #### Usage -```python +```py credentials = AwsCredentials() # Set the necessary attributes credentials.aws_access_key_id = "ACCESS_KEY_ID" @@ -285,7 +285,7 @@ credentials.aws_secret_access_key = "SECRET_ACCESS_KEY" credentials.region_name = "us-east-1" ``` or -```python +```py # Imports an external boto3 session and sets the credentials properties accordingly. import botocore.session @@ -295,7 +295,7 @@ credentials.parse_native_representation(session) print(credentials.aws_access_key_id) ``` or more preferred use: -```python +```py @dlt.source def aws_readers( bucket_url: str = dlt.config.value, @@ -340,14 +340,14 @@ handling partial credentials and converting credentials to a format suitable for interacting with Azure Blob Storage using the adlfs library. #### Usage -```python +```py credentials = AzureCredentials() # Set the necessary attributes credentials.azure_storage_account_name = "ACCOUNT_NAME" credentials.azure_storage_account_key = "ACCOUNT_KEY" ``` or more preferred use: -```python +```py @dlt.source def azure_readers( bucket_url: str = dlt.config.value, @@ -388,7 +388,7 @@ decorated function. Example: -```python +```py @dlt.source def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in `credentials` argument @@ -432,7 +432,7 @@ This is used a lot in the `dlt` core and may become useful for complicated sourc In fact, for each decorated function a spec is synthesized. In case of `google_sheets` following class is created: -```python +```py from dlt.sources.config import configspec, with_config @configspec diff --git a/docs/website/docs/general-usage/credentials/configuration.md b/docs/website/docs/general-usage/credentials/configuration.md index 9b2d392883..ec8e5fe32a 100644 --- a/docs/website/docs/general-usage/credentials/configuration.md +++ b/docs/website/docs/general-usage/credentials/configuration.md @@ -25,7 +25,7 @@ When done right you'll be able to run the same pipeline script during developmen In the example below, the `google_sheets` source function is used to read selected tabs from Google Sheets. It takes several arguments that specify the spreadsheet, the tab names and the Google credentials to be used when extracting data. -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, @@ -68,14 +68,14 @@ You are free to call the function above as usual and pass all the arguments in t Instead let `dlt` to do the work and leave it to [injection mechanism](#injection-mechanism) that looks for function arguments in the config files or environment variables and adds them to your explicit arguments during a function call. Below are two most typical examples: 1. 
Pass spreadsheet id and tab names in the code, inject credentials from the secrets: - ```python + ```py data_source = google_sheets("23029402349032049", ["tab1", "tab2"]) ``` `credentials` value will be injected by the `@source` decorator (e.g. from `secrets.toml`). `spreadsheet_id` and `tab_names` take values from the call arguments. 2. Inject all the arguments from config / secrets - ```python + ```py data_source = google_sheets() ``` `credentials` value will be injected by the `@source` decorator (e.g. from **secrets.toml**). @@ -97,16 +97,16 @@ Where do the configs and secrets come from? By default, `dlt` looks in two **con Secrets in **.dlt/secrets.toml**. `dlt` will look for `credentials`, ```toml [credentials] - client_email = - private_key = - project_id = + client_email = "" + private_key = "" + project_id = "" ``` Note that **credentials** will be evaluated as dictionary containing **client_email**, **private_key** and **project_id** as keys. It is standard TOML behavior. - [Environment Variables](config_providers#environment-provider): - ```python - CREDENTIALS= - SPREADSHEET_ID=1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580 - TAB_NAMES=tab1,tab2 + ```toml + CREDENTIALS="" + SPREADSHEET_ID="1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" + TAB_NAMES=["tab1", "tab2"] ``` We pass the JSON contents of `service.json` file to `CREDENTIALS` and we specify tab names as comma-delimited values. Environment variables are always in **upper case**. @@ -123,7 +123,7 @@ There are many ways you can organize your configs and secrets. The example above ### Do not hardcode secrets You should never do that. Sooner or later your private key will leak. -```python +```py # WRONG!: # provide all values directly - wrong but possible. # secret values should never be present in the code! @@ -137,7 +137,7 @@ data_source = google_sheets( ### Pass secrets in code from external providers You can get the secret values from your own providers. Below we take **credentials** for our `google_sheets` source from Airflow base hook: -```python +```py from airflow.hooks.base_hook import BaseHook # get it from airflow connections or other credential store @@ -163,7 +163,7 @@ Doing so provides several benefits: 1. You can request [built-in and custom credentials](config_specs.md) (i.e. connection strings, AWS / GCP / Azure credentials). 1. You can specify a set of possible types via `Union` i.e. OAuth or API Key authorization. -```python +```py @dlt.source def google_sheets( spreadsheet_id: str = dlt.config.value, @@ -171,7 +171,7 @@ def google_sheets( credentials: GcpServiceAccountCredentials = dlt.secrets.value, only_strings: bool = False ): - ... + ... ``` Now: @@ -189,7 +189,7 @@ In case of `GcpServiceAccountCredentials`: ## Read configs and secrets yourself `dlt.secrets` and `dlt.config` provide dictionary-like access to configuration values and secrets, respectively. -```python +```py # use `dlt.secrets` and `dlt.config` to explicitly take # those values from providers from the explicit keys data_source = google_sheets( @@ -202,14 +202,14 @@ data_source.run(destination="bigquery") ``` `dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](#injection-mechanism) - TOML files, env variables etc. just like it does with the standard section layout. You can also use `dlt.config.get()` or `dlt.secrets.get()` to request value cast to a desired type. 
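A small, hypothetical illustration of the typed getter; the `my_section.chunk_size` key is made up for this sketch:

```py
import dlt

# looks up my_section.chunk_size across all config providers and casts the value to int
chunk_size = dlt.config.get("my_section.chunk_size", int)
```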
For example: -```python +```py credentials = dlt.secrets.get("my_section.gcp_credentials", GcpServiceAccountCredentials) ``` Creates `GcpServiceAccountCredentials` instance out of values (typically a dictionary) under **my_section.gcp_credentials** key. ### Write configs and secrets in code **dlt.config** and **dlt.secrets** can be also used as setters. For example: -```python +```py dlt.config["sheet_id"] = "23029402349032049" dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra ``` @@ -263,9 +263,9 @@ Here is the simplest default layout for our `google_sheets` example. ```toml [credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` **config.toml** @@ -284,9 +284,9 @@ This makes sure that `google_sheets` source does not share any secrets and confi ```toml [sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` **config.toml** @@ -305,9 +305,9 @@ Use this if you want to read and pass the config/secrets yourself ```toml [my_section] - [my_section.gcp_credentials] - client_email = - private_key = +[my_section.gcp_credentials] +client_email = "" +private_key = "" ``` **config.toml** @@ -316,9 +316,9 @@ Use this if you want to read and pass the config/secrets yourself [my_section] tabs=["tab1", "tab2"] - [my_section.gcp_credentials] - # I prefer to keep my project id in config file and private key in secrets - project_id = +[my_section.gcp_credentials] +# I prefer to keep my project id in config file and private key in secrets +project_id = "" ``` ### Default layout and default key lookup during injection @@ -328,7 +328,7 @@ makes it easy to configure simple cases but also provides a room for more explic complex cases i.e. having several sources with different credentials or even hosting several pipelines in the same project sharing the same config and credentials. -``` +```text pipeline_name | |-sources @@ -368,15 +368,15 @@ Example: We use the `bigquery` destination and the `google_sheets` source. They ```toml # google sheet credentials [sources.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # bigquery credentials [destination.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` Now when `dlt` looks for destination credentials, it will start with `destination.bigquery.credentials`, eliminate `bigquery` and stop at `destination.credentials`. @@ -388,21 +388,21 @@ Example: let's be even more explicit and use a full section path possible. ```toml # google sheet credentials [sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # google analytics credentials [sources.google_analytics.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # bigquery credentials [destination.bigquery.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` Now we can separate credentials for different sources as well. @@ -418,18 +418,18 @@ Example: the pipeline is named `ML_sheets`. 
```toml [ML_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` or maximum path: ```toml [ML_sheets.sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` ### The `sources` section @@ -455,7 +455,7 @@ Now we can finally understand the `ConfigFieldMissingException`. Let's run `chess.py` example without providing the password: -``` +```sh $ CREDENTIALS="postgres://loader@localhost:5432/dlt_data" python chess.py ... dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['password'] in configuration with spec PostgresCredentials diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md index 3f665bd0fb..ba0b13636b 100644 --- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md @@ -11,7 +11,7 @@ consistently achieve the same mapping. If instead you wish to anonymize, you can replace it with a constant. In the example below, we create a dummy source with a PII column called "name", which we replace with deterministic hashes (i.e. replacing the German umlaut). -```python +```py import dlt import hashlib diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md index 8493ffaec5..3163062ced 100644 --- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -14,7 +14,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Create a source function that creates dummy data as follows: - ```python + ```py import dlt # This function creates a dummy data source. @@ -31,7 +31,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, create a function to filter out columns from the data before loading it into a database as follows: - ```python + ```py from typing import Dict, List, Optional def remove_columns(doc: Dict, remove_columns: Optional[List[str]] = None) -> Dict: @@ -53,7 +53,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, declare the columns to be removed from the table, and then modify the source as follows: - ```python + ```py # Example columns to remove: remove_columns_list = ["country_code"] @@ -67,7 +67,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. ``` 1. You can optionally inspect the result: - ```python + ```py for row in data_source: print(row) #{'id': 0, 'name': 'Jane Washington 0'} @@ -77,7 +77,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. 
At last, create a pipeline: - ```python + ```py # Integrating with a DLT pipeline pipeline = dlt.pipeline( pipeline_name='example', diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md index e58dae6d9d..04e4d33b13 100644 --- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md @@ -12,7 +12,7 @@ In the example below, we create a dummy source with special characters in the na function that we intend to apply to the resource to modify its output (i.e. replacing the German umlaut): `replace_umlauts_in_dict_keys`. -```python +```py import dlt # create a dummy source with umlauts (special characters) in key names (um) diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index 6b09510f68..f8bd179422 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -77,7 +77,7 @@ currency_conversion_enrichment/ 1. Here's the resource that yields the sample data as discussed above: - ```python + ```py @dlt.resource() def enriched_data_part_two(): data_enrichment_part_one = [ @@ -113,14 +113,14 @@ API token. information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #ExchangeRate-API key ``` 1. Create the `converted_amount` function as follows: - ```python + ```py # @transformer(data_from=enriched_data_part_two) def converted_amount(record): """ @@ -210,7 +210,7 @@ API token. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_two", @@ -229,7 +229,7 @@ API token. To do so, you need to add the transformer decorator at the top of the `converted_amount` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( enriched_data_part_two | converted_amount, @@ -246,19 +246,19 @@ API token. 1. Install necessary dependencies for the preferred [destination](../../dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python currency_enrichment_pipeline.py ``` 1. To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index f4578d065f..ab71d3d1d0 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -29,7 +29,7 @@ you can use any API you prefer. 
By default the URL Parse API will return a JSON response like: -```text +```json { "authority": "urlparse.com", "domain": "urlparse.com", @@ -73,7 +73,7 @@ understanding, you may explore all three enrichments sequentially in the noteboo Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```text url_parser_enrichment/ ā”œā”€ā”€ .dlt/ ā”‚ ā””ā”€ā”€ secrets.toml @@ -100,41 +100,41 @@ Let's examine a synthetic dataset created for this article. It includes: Here's the resource that yields the sample data as discussed above: -```python - import dlt +```py + import dlt - @dlt.resource(write_disposition="append") - def tracked_data(): - """ - A generator function that yields a series of dictionaries, each representing - user tracking data. + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. - This function is decorated with `dlt.resource` to integrate into the DLT (Data - Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to - ensure that data from this generator is appended to the existing data in the - destination table. + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. - Yields: - dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', - representing the user's tracking data including their device and the page - they were referred from. - """ + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. + """ - # Sample data representing tracked user data - sample_data = [ + # Sample data representing tracked user data + sample_data = [ { "user_id": 1, "device_name": "Sony Experia XZ", "page_referer": "https://b2venture.lightning.force.com/" }, - """ - Data for other users - """ - ] - - # Yielding each user's data as a dictionary - for user_data in sample_data: - yield user_data + """ + Data for other users + """ + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data ``` ### 2. Create `url_parser` function @@ -143,7 +143,7 @@ We use a free service called [URL Parse API](https://urlparse.com/), to parse th need to register to use this service neither get an API key. 1. Create a `url_parser` function as follows: - ```python + ```py # @dlt.transformer(data_from=tracked_data) def url_parser(record): """ @@ -195,7 +195,7 @@ need to register to use this service neither get an API key. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_three", @@ -214,7 +214,7 @@ need to register to use this service neither get an API key. do so, you need to add the transformer decorator at the top of the `url_parser` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | url_parser, @@ -230,19 +230,19 @@ need to register to use this service neither get an API key. 1. 
Install necessary dependencies for the preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python url_enrichment_pipeline.py ``` 1. To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 8b33a852a8..6b07845689 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -41,7 +41,7 @@ Here's the link to the notebook: ### B. Create a pipeline Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```text user_device_enrichment/ ā”œā”€ā”€ .dlt/ ā”‚ ā””ā”€ā”€ secrets.toml @@ -67,42 +67,42 @@ user_device_enrichment/ Here's the resource that yields the sample data as discussed above: - ```python - import dlt - - @dlt.resource(write_disposition="append") - def tracked_data(): - """ - A generator function that yields a series of dictionaries, each representing - user tracking data. - - This function is decorated with `dlt.resource` to integrate into the DLT (Data - Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to - ensure that data from this generator is appended to the existing data in the - destination table. - - Yields: - dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', - representing the user's tracking data including their device and the page - they were referred from. - """ - - # Sample data representing tracked user data - sample_data = [ - {"user_id": 1, "device_name": "Sony Experia XZ", "page_referer": - "https://b2venture.lightning.force.com/"}, - {"user_id": 2, "device_name": "Samsung Galaxy S23 Ultra 5G", - "page_referer": "https://techcrunch.com/2023/07/20/can-dlthub-solve-the-python-library-problem-for-ai-dig-ventures-thinks-so/"}, - {"user_id": 3, "device_name": "Apple iPhone 14 Pro Max", - "page_referer": "https://dlthub.com/success-stories/freelancers-perspective/"}, - {"user_id": 4, "device_name": "OnePlus 11R", - "page_referer": "https://www.reddit.com/r/dataengineering/comments/173kp9o/ideas_for_data_validation_on_data_ingestion/"}, - {"user_id": 5, "device_name": "Google Pixel 7 Pro", "page_referer": "https://pypi.org/"}, - ] - - # Yielding each user's data as a dictionary - for user_data in sample_data: - yield user_data + ```py + import dlt + + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. + + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. + + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. 
+ """ + + # Sample data representing tracked user data + sample_data = [ + {"user_id": 1, "device_name": "Sony Experia XZ", "page_referer": + "https://b2venture.lightning.force.com/"}, + {"user_id": 2, "device_name": "Samsung Galaxy S23 Ultra 5G", + "page_referer": "https://techcrunch.com/2023/07/20/can-dlthub-solve-the-python-library-problem-for-ai-dig-ventures-thinks-so/"}, + {"user_id": 3, "device_name": "Apple iPhone 14 Pro Max", + "page_referer": "https://dlthub.com/success-stories/freelancers-perspective/"}, + {"user_id": 4, "device_name": "OnePlus 11R", + "page_referer": "https://www.reddit.com/r/dataengineering/comments/173kp9o/ideas_for_data_validation_on_data_ingestion/"}, + {"user_id": 5, "device_name": "Google Pixel 7 Pro", "page_referer": "https://pypi.org/"}, + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data ``` ### 2. Create `fetch_average_price` function @@ -118,7 +118,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #Serp Api key. ``` @@ -126,7 +126,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Replace the value of the `api_key`. 1. Create `fetch_average_price()` function as follows: - ```python + ```py import datetime import requests @@ -247,7 +247,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_one", @@ -266,7 +266,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the do so, you need to add the transformer decorator at the top of the `fetch_average_price` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | fetch_average_price, @@ -283,19 +283,19 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Install necessary dependencies for the preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python device_enrichment_pipeline.py ``` 1. 
To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index c20aa62d16..b45ef39f3f 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -75,7 +75,7 @@ azure_storage_account_key="storage key" ``` or via environment variables: -``` +```sh DESTINATION__FILESYSTEM__BUCKET_URL=az://dlt-azure-bucket DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME=dltdata DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY="storage key" diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 4651d156f0..320d0664f5 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -13,7 +13,7 @@ that are not selected while performing a full load will not replace any data in To perform a full load on one or more of your resources, choose the `write_disposition='replace'` for this resource: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 144b176332..b815512070 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -64,7 +64,7 @@ child tables. Example below loads all the GitHub events and updates them in the destination using "id" as primary key, making sure that only a single copy of event is present in `github_repo_events` table: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -72,26 +72,28 @@ def github_repo_events(): You can use compound primary keys: -```python +```py @dlt.resource(primary_key=("id", "url"), write_disposition="merge") -... +def resource(): + ... ``` By default, `primary_key` deduplication is arbitrary. You can pass the `dedup_sort` column hint with a value of `desc` or `asc` to influence which record remains after deduplication. Using `desc`, the records sharing the same `primary_key` are sorted in descending order before deduplication, making sure the record with the highest value for the column with the `dedup_sort` hint remains. `asc` has the opposite behavior. -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", columns={"created_at": {"dedup_sort": "desc"}} # select "latest" record ) -... +def resource(): + ... ``` Example below merges on a column `batch_day` that holds the day for which given record is valid. Merge keys also can be compound: -```python +```py @dlt.resource(merge_key="batch_day", write_disposition="merge") def get_daily_batch(day): yield _get_batch_from_bucket(day) @@ -101,7 +103,7 @@ As with any other write disposition you can use it to load data ad hoc. Below we top reactions for `duckdb` repo. The lists have, obviously, many overlapping issues, but we want to keep just one instance of each. 
-```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] @@ -117,7 +119,7 @@ Example below dispatches GitHub events to several tables by event type, keeps on by "id" and skips loading of past records using "last value" incremental. As you can see, all of this we can just declare in our resource. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) def github_repo_events(last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): """A resource taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. Loads incrementally by 'created_at' """ @@ -134,7 +136,7 @@ Each record in the destination table with the same `primary_key` or `merge_key` Deletes are propagated to any child table that might exist. For each record that gets deleted in the root table, all corresponding records in the child table(s) will also be deleted. Records in parent and child tables are linked through the `root key` that is explained in the next section. #### Example: with primary key and boolean delete column -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", @@ -157,11 +159,11 @@ def resource(): ``` #### Example: with merge key and non-boolean delete column -```python +```py @dlt.resource( merge_key="id", write_disposition="merge", - columns={"deleted_at_ts": {"hard_delete": True}}} + columns={"deleted_at_ts": {"hard_delete": True}}) def resource(): # this will insert two records yield [ @@ -175,11 +177,11 @@ def resource(): ``` #### Example: with primary key and "dedup_sort" hint -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", - columns={"deleted_flag": {"hard_delete": True}, "lsn": {"dedup_sort": "desc"}} + columns={"deleted_flag": {"hard_delete": True}, "lsn": {"dedup_sort": "desc"}}) def resource(): # this will insert one record (the one with lsn = 3) yield [ @@ -204,7 +206,7 @@ tables. This concept is similar to foreign key which references a parent table, set. We do not enable it everywhere because it takes storage space. Nevertheless, is some cases you may want to permanently enable root key propagation. -```python +```py pipeline = dlt.pipeline( pipeline_name='facebook_insights', destination='duckdb', @@ -243,7 +245,7 @@ Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor duplicates and managing the state with last values of cursor. Take a look at GitHub example below, where we request recently created issues. -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -280,7 +282,7 @@ In the example below we incrementally load the GitHub events, where API does not let us filter for the newest events - it always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the duplicates and past issues. 
-```python +```py # use naming function in table name to generate separate tables for each event @dlt.resource(primary_key="id", table_name=lambda i: i['type']) # type: ignore def repo_events( @@ -309,7 +311,7 @@ and lets you select nested and complex data (including the whole data item when Example below creates last value which is a dictionary holding a max `created_at` value for each created table name: -```python +```py def by_event_type(event): last_value = None if len(event) == 1: @@ -333,7 +335,7 @@ def get_events(last_created_at = dlt.sources.incremental("$", last_value_func=by ### Using `end_value` for backfill You can specify both initial and end dates when defining incremental loading. Let's go back to our Github example: -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -354,7 +356,7 @@ Please note that when `end_date` is specified, `dlt` **will not modify the exist To define specific ranges to load, you can simply override the incremental argument in the resource, for example: -```python +```py july_issues = repo_issues( created_at=dlt.sources.incremental( initial_value='2022-07-01T00:00:00Z', end_value='2022-08-01T00:00:00Z' @@ -399,7 +401,7 @@ The github events example is exactly such case. The results are ordered on curso In the same fashion the `row_order` can be used to **optimize backfill** so we don't continue making unnecessary API requests after the end of range is reached. For example: -```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -432,7 +434,7 @@ incremental and exit yield loop when true. The `dlt.sources.incremental` instance provides `start_out_of_range` and `end_out_of_range` attributes which are set when the resource yields an element with a higher/lower cursor value than the initial or end values. If you do not want `dlt` to stop processing automatically and instead to handle such events yourself, do not specify `row_order`: -```python +```py @dlt.transformer(primary_key="id") def tickets( zendesk_client, @@ -472,7 +474,7 @@ deduplicate and which does not become a table hint. The same setting lets you di deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to `incremental` to disable deduplication. That overrides `delta` primary_key set in the resource: -```python +```py @dlt.resource(primary_key="delta") # disable the unique value check by passing () as primary key to incremental def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())): @@ -485,7 +487,7 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) When resources are [created dynamically](source.md#create-resources-dynamically) it is possible to use `dlt.sources.incremental` definition as well. -```python +```py @dlt.source def stripe(): # declare a generator function @@ -521,7 +523,7 @@ result in `IncrementalUnboundError` exception. ### Using Airflow schedule for backfill and incremental loading When [running in Airflow task](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file), you can opt-in your resource to get the `initial_value`/`start_value` and `end_value` from Airflow schedule associated with your DAG. Let's assume that **Zendesk tickets** resource contains a year of data with thousands of tickets. We want to backfill the last year of data week by week and then continue incremental loading daily. 
-```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -540,7 +542,7 @@ We opt-in to Airflow scheduler by setting `allow_external_schedulers` to `True`: 2. In all other environments, the `incremental` behaves as usual, maintaining `dlt` state. Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the dag: -```python +```py @dag( schedule_interval='@weekly', start_date=pendulum.datetime(2023, 2, 1), @@ -577,7 +579,7 @@ When you enable the DAG in Airflow, it will generate several runs and start exec subsequent weekly intervals starting with `2023-02-12, 00:00:00 UTC` to `2023-02-19, 00:00:00 UTC`. You can repurpose the DAG above to start loading new data incrementally after (or during) the backfill: -```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), @@ -624,7 +626,7 @@ You may force a full refresh of a `merge` and `append` pipelines: Example: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="dataset_name") # do a full refresh p.run(merge_source(), write_disposition="replace") @@ -655,7 +657,7 @@ is loaded, the yielded resource data will be loaded at the same time with the up In the two examples below you see how the `dlt.sources.incremental` is working under the hood. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. If not exist, get None @@ -670,7 +672,7 @@ def tweets(): If we keep a list or a dictionary in the state, we can modify the underlying values in the objects, and thus we do not need to set the state back explicitly. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. If not exist, get None @@ -708,7 +710,7 @@ data twice - even if the user makes a mistake and requests the same months range In the following example, we initialize a variable with an empty list as a default: -```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, players, start_month=None, end_month=None): loaded_archives_cache = dlt.current.resource_state().setdefault("archives", []) @@ -734,7 +736,7 @@ def players_games(chess_url, players, start_month=None, end_month=None): ### Advanced state usage: tracking the last value for all search terms in Twitter API -```python +```py @dlt.resource(write_disposition="append") def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, start_time=None, end_time=None, last_value=None): headers = _headers(twitter_bearer_token) diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index 095e03e96d..53eca2e59a 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -15,7 +15,7 @@ Example: This pipeline will load a list of objects into `duckdb` table with a name "three": -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence") @@ -53,7 +53,7 @@ Arguments: Example: This pipeline will load the data the generator `generate_rows(10)` produces: -```python +```py import dlt def generate_rows(nr): @@ -110,7 +110,7 @@ pipeline run is progressing. `dlt` supports 4 progress monitors out of the box: You pass the progress monitor in `progress` argument of the pipeline. 
You can use a name from the list above as in the following example: -```python +```py # create a pipeline loading chess data that dumps # progress to stdout each 10 seconds (the default) pipeline = dlt.pipeline( @@ -123,7 +123,7 @@ pipeline = dlt.pipeline( You can fully configure the progress monitor. See two examples below: -```python +```py # log each minute to Airflow task logger ti = get_current_context()["ti"] pipeline = dlt.pipeline( @@ -134,7 +134,7 @@ pipeline = dlt.pipeline( ) ``` -```python +```py # set tqdm bar color to yellow pipeline = dlt.pipeline( pipeline_name="chess_pipeline", diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 9b8d45982d..e2e95d937f 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -19,7 +19,7 @@ Commonly used arguments: Example: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(): for i in range(10): @@ -32,7 +32,7 @@ def source_name(): To get the data of a resource, we could do: -```python +```py for row in generate_rows(): print(row) @@ -57,7 +57,7 @@ accepts following arguments: `dlt` that column `tags` (containing a list of tags) in `user` table should have type `complex` which means that it will be loaded as JSON/struct and not as child table. - ```python + ```py @dlt.resource(name="user", columns={"tags": {"data_type": "complex"}}) def get_users(): ... @@ -82,7 +82,7 @@ You can alternatively use a [Pydantic](https://pydantic-docs.helpmanual.io/) mod For example: -```python +```py from pydantic import BaseModel @@ -119,7 +119,7 @@ Things to note: You can override this by configuring the Pydantic model -```python +```py from typing import ClassVar from dlt.common.libs.pydantic import DltConfig @@ -146,7 +146,7 @@ argument and the `table_name` string as a return value. For example, a resource that loads GitHub repository events wants to send `issue`, `pull request`, and `comment` events to separate tables. The type of the event is in the "type" field. -```python +```py # send item to a table with name item["type"] @dlt.resource(table_name=lambda event: event['type']) def repo_events() -> Iterator[TDataItems]: @@ -154,13 +154,13 @@ def repo_events() -> Iterator[TDataItems]: # the `table_schema` method gets table schema generated by a resource and takes optional # data item to evaluate dynamic hints -print(repo_events().table_schema({"type": "WatchEvent", id=...})) +print(repo_events().table_schema({"type": "WatchEvent", id:...})) ``` In more advanced cases, you can dispatch data to different tables directly in the code of the resource function: -```python +```py @dlt.resource def repo_events() -> Iterator[TDataItems]: # mark the "item" to be sent to table with name item["type"] @@ -172,7 +172,7 @@ def repo_events() -> Iterator[TDataItems]: You can add arguments to your resource functions like to any other. Below we parametrize our `generate_rows` resource to generate the number of rows we request: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -195,7 +195,7 @@ that returns a list of objects (i.e. users) in one endpoint and user details in with this by declaring a resource that obtains a list of users and another resource that receives items from the list and downloads the profiles. 
-```python +```py @dlt.resource(write_disposition="replace") def users(limit=None): for u in _get_users(limit): @@ -215,7 +215,7 @@ pipeline.run(user_details) ``` In the example above, `user_details` will receive data from default instance of `users` resource (with `limit` set to `None`). You can also use **pipe |** operator to bind resources dynamically -```python +```py # you can be more explicit and use a pipe operator. # with it you can create dynamic pipelines where the dependencies # are set at run time and resources are parametrized i.e. @@ -225,7 +225,7 @@ pipeline.run(users(limit=100) | user_details) :::tip Transformers are allowed not only to **yield** but also to **return** values and can decorate **async** functions and [**async generators**](../reference/performance.md#extract). Below we decorate an async function and request details on two pokemons. Http calls are made in parallel via httpx library. -```python +```py import dlt import httpx @@ -245,7 +245,7 @@ print(list([1,2] | pokemon())) A standalone resource is defined on a function that is top level in a module (not inner function) that accepts config and secrets values. Additionally if `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function and user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url=dlt.config.value): """list and yield files in `bucket_url`""" @@ -256,7 +256,7 @@ pipeline.run(filesystem("s3://my-bucket/reports"), table_name="reports") ``` Standalone may have dynamic name that depends on the arguments passed to the decorated function. For example:: -```python +```py @dlt.resource(standalone=True, name=lambda args: args["stream_name"]) def kinesis(stream_name: str): ... @@ -271,7 +271,7 @@ You can extract multiple resources in parallel threads or with async IO. To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: -```python +```py @dlt.resource(parallelized=True) def get_users(): for u in _get_users(): @@ -288,7 +288,7 @@ pipeline.run(get_users(), get_orders()) Async generators are automatically extracted concurrently with other resources: -```python +```py @dlt.resource async def get_users(): async for u in _get_users(): # Assuming _get_users is an async generator @@ -317,7 +317,7 @@ so: Here's our resource: -```python +```py import dlt @dlt.resource(write_disposition="replace") @@ -330,7 +330,7 @@ def users(): Here's our script that defines transformations and loads the data: -```python +```py from pipedrive import users def anonymize_user(user_data): @@ -351,7 +351,7 @@ example data and test your transformations etc. In order to do that, you limit h be yielded by a resource by calling `resource.add_limit` method. In the example below we load just 10 first items from and infinite counter - that would otherwise never end. -```python +```py r = dlt.resource(itertools.count(), name="infinity").add_limit(10) assert list(r) == list(range(10)) ``` @@ -375,7 +375,7 @@ that will keep just one updated record per `user_id`. 
It also adds ["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on `created_at` column to prevent requesting again the already loaded records: -```python +```py tables = sql_database() tables.users.apply_hints( write_disposition="merge", @@ -386,7 +386,7 @@ pipeline.run(tables) ``` To just change a name of a table to which resource will load data, do the following: -```python +```py tables = sql_database() tables.users.table_name = "other_users" ``` @@ -398,7 +398,7 @@ with the existing schema in the same way `apply_hints` method above works. There should avoid lengthy operations (ie. reflecting database tables) during creation of the DAG so it is better do do it when DAG executes. You may also emit partial hints (ie. precision and scale for decimal types) for column to help `dlt` type inference. -```python +```py @dlt.resource def sql_table(credentials, schema, table): # create sql alchemy engine @@ -432,7 +432,7 @@ You can emit columns as Pydantic model and use dynamic hints (ie. lambda for tab ### Duplicate and rename resources There are cases when you your resources are generic (ie. bucket filesystem) and you want to load several instances of it (ie. files from different folders) to separate tables. In example below we use `filesystem` source to load csvs from two different folders into separate tables: -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url): # list and yield files in bucket_url @@ -463,7 +463,7 @@ You can pass individual resources or list of resources to the `dlt.pipeline` obj loaded outside the source context, will be added to the [default schema](schema.md) of the pipeline. -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -485,6 +485,6 @@ To do a full refresh of an `append` or `merge` resources you temporarily change disposition to replace. You can use `apply_hints` method of a resource or just provide alternative write disposition when loading: -```python +```py p.run(merge_source(), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index 764b565beb..1b5e67357a 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -49,7 +49,7 @@ The `schema_contract` argument accepts two forms: 2. **shorthand** a contract mode (string) that will be applied to all schema entities. For example setting `schema_contract` to *freeze* will expand to the full form: -```python +```py {"tables": "freeze", "columns": "freeze", "data_type": "freeze"} ``` @@ -65,7 +65,7 @@ You can change the contract on the **source** instance via `schema_contract` pro Pydantic models can be used to [define table schemas and validate incoming data](resource.md#define-a-schema-with-pydantic). You can use any model you already have. `dlt` will internally synthesize (if necessary) new models that conform with the **schema contract** on the resource. Just passing a model in `column` argument of the [dlt.resource](resource.md#define-a-schema-with-pydantic) sets a schema contract that conforms to default Pydantic behavior: -```python +```py { "tables": "evolve", "columns": "discard_value", @@ -121,10 +121,10 @@ Here's how `dlt` deals with column modes: When contract is violated in freeze mode, `dlt` raises `DataValidationError` exception. 
This exception gives access to the full context and passes the evidence to the caller. As with any other exception coming from pipeline run, it will be re-raised via `PipelineStepFailed` exception which you should catch in except: -```python +```py try: pipeline.run() -except as pip_ex: +except Exception as pip_ex: if pip_ex.step == "normalize": if isinstance(pip_ex.__context__.__context__, DataValidationError): ... @@ -195,7 +195,7 @@ def items(): def other_items(): ... -@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}): +@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}) def source(): return [items(), other_items()] diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 7ce1d959c9..164814010d 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -149,7 +149,7 @@ Now imagine the data has changed and `id` field also contains strings ```py data = [ - {"id": 1, "human_name": "Alice"} + {"id": 1, "human_name": "Alice"}, {"id": "idx-nr-456", "human_name": "Bob"} ] ``` @@ -308,7 +308,7 @@ schema available via `dlt.current.source_schema()`. Example: -```python +```py @dlt.source def textual(nesting_level: int): # get the source schema from the `current` context diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 1b3d1ce0cc..bcdd137dce 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -26,7 +26,7 @@ You declare source by decorating an (optionally async) function that return or y You can create resources by using `dlt.resource` as a function. In an example below we reuse a single generator function to create a list of resources for several Hubspot endpoints. -```python +```py @dlt.source def hubspot(api_key=dlt.secrets.value): @@ -59,7 +59,7 @@ If this is impractical (for example you want to reflect a database to create res You can access resources present in a source and select which of them you want to load. In case of `hubspot` resource above we could select and load "companies", "deals" and "products" resources: -```python +```py from hubspot import hubspot source = hubspot() @@ -73,7 +73,7 @@ pipeline.run(source.with_resources("companies", "deals")) Resources can be individually accessed and selected: -```python +```py # resources are accessible as attributes of a source for c in source.companies: # enumerate all data in companies resource print(c) @@ -89,7 +89,7 @@ source.deals.selected = False You can modify and filter data in resources, for example if we want to keep only deals after certain date: -```python +```py source.deals.add_filter(lambda deal: deal["created_at"] > yesterday) ``` @@ -103,7 +103,7 @@ You can easily get your test dataset in a few minutes, when otherwise you'd need the full loading to complete. Below we limit the `pipedrive` source to just get 10 pages of data from each endpoint. Mind that the transformers will be evaluated fully: -```python +```py from pipedrive import pipedrive_source pipeline = dlt.pipeline(pipeline_name='pipedrive', destination='duckdb', dataset_name='pipedrive_data') @@ -121,7 +121,7 @@ declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. 
-```python +```py import dlt from hubspot import hubspot @@ -140,11 +140,11 @@ source.resources.add(source.deals | deal_scores) pipeline.run(source) ``` You can also set the resources in the source as follows -```python +```py source.deal_scores = source.deals | deal_scores ``` or -```python +```py source.resources["deal_scores"] = source.deals | deal_scores ``` :::note @@ -156,7 +156,7 @@ When adding resource to the source, `dlt` clones the resource so your existing i You can limit how deep `dlt` goes when generating child tables. By default, the library will descend and generate child tables for all nested lists, without limit. -```python +```py @dlt.source(max_table_nesting=1) def mongo_db(): ... @@ -172,7 +172,7 @@ tables of child tables). Typical settings: You can achieve the same effect after the source instance is created: -```python +```py from mongo_db import mongo_db source = mongo_db() @@ -202,7 +202,7 @@ You are also free to decompose a single source into several ones. For example, y down a 50 table copy job into an airflow dag with high parallelism to load the data faster. To do so, you could get the list of resources as: -```python +```py # get a list of resources' names resource_list = sql_source().resources.keys() @@ -216,12 +216,12 @@ for res in resource_list: You can temporarily change the "write disposition" to `replace` on all (or selected) resources within a source to force a full refresh: -```python +```py p.run(merge_source(), write_disposition="replace") ``` With selected resources: -```python +```py p.run(tables.with_resources("users"), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 23625db27c..0ab2b8a658 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -15,7 +15,7 @@ You read and write the state in your resources. Below we use the state to create game archives which we then use to [prevent requesting duplicates](incremental-loading.md#advanced-state-usage-storing-a-list-of-processed-entities). -```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, player, start_month=None, end_month=None): # create or request a list of archives from resource scoped state diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index cd121b0ad5..ecaa78c949 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -20,13 +20,13 @@ Let's get started! 
Install dlt using `pip`: -```bash +```sh pip install -U dlt ``` The command above installs (or upgrades) the library core, in the example below we use DuckDB as a destination so let's add a `duckdb` dependency: -```bash +```sh pip install "dlt[duckdb]" ``` @@ -63,13 +63,13 @@ When you look at the code above, you can see that we: Save this Python script with the name `quick_start_pipeline.py` and run the following command: -```bash +```sh python quick_start_pipeline.py ``` The output should look like: -```bash +```sh Pipeline quick_start completed in 0.59 seconds 1 load package(s) were loaded to destination duckdb and into dataset mydata The duckdb destination used duckdb:////home/user-name/quick_start/quick_start.duckdb location to store data @@ -82,13 +82,13 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): -```bash +```sh dlt pipeline quick_start show ``` **quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet do: -```bash +```sh pip install streamlit ``` diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index b37a3a118e..599ffd3ebd 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -8,7 +8,7 @@ keywords: [command line interface, cli, dlt init] ## `dlt init` -```shell +```sh dlt init ``` This command creates new dlt pipeline script that loads data from `source` to `destination` to it. When you run the command: @@ -26,7 +26,7 @@ version if run again with existing `source` name. You are warned if files will b You can use `--location ` option to specify your own repository with sources. Typically you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources ie. to use them for your team or organization. You can also specify a branch with `--branch ` ie. to test a version being developed. ### List all verified sources -```shell +```sh dlt init --list-verified-sources ``` Shows all available verified sources and their short descriptions. For each source, checks if your local `dlt` version requires update @@ -43,7 +43,7 @@ that will add additional packages to current environment. ### github-action -```shell +```sh dlt deploy