From 79c018c49b990ea7db97a8870926b840dfc4c31f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 14 Nov 2024 15:53:19 +0400 Subject: [PATCH 01/71] add pyiceberg dependency and upgrade mypy - mypy upgrade needed to solve this issue: https://github.com/apache/iceberg-python/issues/768 - uses <1.13.0 requirement on mypy because 1.13.0 gives error - new lint errors arising due to version upgrade are simply ignored --- dlt/cli/source_detection.py | 2 +- .../configuration/specs/base_configuration.py | 2 +- dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/reflection/utils.py | 14 +- dlt/common/schema/schema.py | 2 +- dlt/common/typing.py | 2 +- dlt/extract/incremental/lag.py | 2 +- poetry.lock | 160 ++++++++++++++---- pyproject.toml | 4 +- tests/libs/test_csv_writer.py | 4 +- .../sql_database/test_sql_database_source.py | 2 +- .../helpers/rest_client/test_client.py | 2 +- 15 files changed, 152 insertions(+), 52 deletions(-) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index f4e9b3e050..c3e24eca91 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -30,7 +30,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" + f"The pipeline script {init_script_name} must pass the {t_arg_name} as" # type: ignore[attr-defined] f" string to '{arg_name}' function in line {dn_node.lineno}", ) else: diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index e2b6c9a442..aa20aff760 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore[unused-ignore] self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 0bad5b152e..2036a668af 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore[unused-ignore] ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/logger.py b/dlt/common/logger.py index b163c15672..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index cbf38a7327..bfdd547d70 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -84,24 +84,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + 
script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(astunparse.unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 0dbeda93cf..5e014e1cde 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -524,7 +524,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 94edb57194..771f1fd59b 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -439,7 +439,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type # type: ignore diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment] diff --git a/poetry.lock b/poetry.lock index 3cb80ef6ed..b312b222a5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "about-time" @@ -5771,44 +5771,49 @@ files = [ [[package]] name = "mypy" -version = "1.10.0" +version = "1.12.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = 
"sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = "mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] @@ -7364,6 +7369,84 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.7.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" +files = [ + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:9e0cc837d41e100df81f1f5e580a89668aade694d8c616941d6e11c3a27e49cb"}, + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:71c053c2d484505d1beabd7d5167fe2e835ca865f52ad91ef4852f0d91fa4a25"}, + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0549ab1843bc07037a7d212c2db527ff1755f5d8f80420907952b5b080eb3663"}, + {file = "pyiceberg-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec4a8000f0bb6ce6ec47f3368ca99f3191e9105662eeef7be2fbb493363cba96"}, + {file = "pyiceberg-0.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ef6636d3cf370b796529f9a8dbd84e892a2151f0310a8015b9a1e702647ad90"}, + {file = "pyiceberg-0.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:9b49320f3e9624075879a4ddb4fa5ddff7d4a03f6561ad6fd73d514c63095367"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:27e9b4033691411ef7c49d93df7b3b7f3ed85fe8019cbf0dab5a5ba888b27f34"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = 
"sha256:7262ba4f95e05a1421567e24c0db57288dc59974c94676aba34afef121544694"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3eb1fc1d47085b16973999c2111d252fab2a394625c0f25da6515b8c3233c853"}, + {file = "pyiceberg-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1856c5d64197c9335817b8cf7081e490b601385623e5178cb094ee645d4fb24c"}, + {file = "pyiceberg-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b6b64006c361220ce103b5bb2f50381a3f851452668adf5a6c61d39f5611e832"}, + {file = "pyiceberg-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:57a0b1fb390d26a5b7155de011300300058343e5c2561f4839d69c1775df1d7e"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:84f2119705e31929aa75beb9a8ce97210e56f498e863e31dc499a2120c2842bd"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:f99ab8d71a2968af0b512fff1d3dcbd145705a95a26b05121c0df712683c9e0c"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:5dc17aa1f53f5b8be12eae35dbcb9885b2534138bdecd31a0088680651fbb98e"}, + {file = "pyiceberg-0.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:917fdfd372922f9534fe9b6652881a79f82f94d7d3645ddb1925688e3d9aaf4d"}, + {file = "pyiceberg-0.7.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:910fab27c039d62a1fe4a199aaea63d08ada30ead6fd27d56bf038c487837691"}, + {file = "pyiceberg-0.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:98db6d18dca335682c32b25406d7ab5afad8f1baea4fbdafda166cbc6557409c"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:c76ea44cc1b02c15b65e1b0cc81b5b3f813ba40a4e262416d7a1e84345f44cf1"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:57485e9331c7e8b1771ea1b2ecdc417dc7a13c7a9a538d74f3f00de98676958b"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:bbc79698292482360be86f8d728237b78ef8eb416e21aea9d53e4a1b4f429ce7"}, + {file = "pyiceberg-0.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f84d912fc12866f22882f5de157cbbfab3dcbad8e0a4378557e5b84a0c3f360"}, + {file = "pyiceberg-0.7.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f86c535735e57f1a0c76fd0f505e0b172cc212c96a3789f3845220695e792157"}, + {file = "pyiceberg-0.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:d8bee5aa4b34e6028f0465cf405bc4e963e160ac52efbe4bdbc499bb55bc2780"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_12_0_x86_64.whl", hash = "sha256:9ae56197db8570553491173adfd2e01a03ae116a1f9fa78ba5a1a1c4e2ad3dbf"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:e28adc58500ca72e45a07ee4dcd90b63699a8875f178001bd12ace37294c5814"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:1ae47f2d0e87dccd158ae8dafc47125f9739858068fc3add8940f5585ea40ead"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb94c3e11354f85daafb2b2f3e13a245bcb35848135b5ed4e8c83e61393c36ea"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4fe212b0594128d183711c6efb1a40ea5f17372e11595a84f4565eb9fe97c703"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_12_0_x86_64.whl", hash = "sha256:35ce27243b86f7057fbd4594dbe5d6b2a1ccd738ba6b65c2a4f3af249f1e8364"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_13_0_x86_64.whl", hash = "sha256:56e254623669ab03e779e4b696b7e36cd1c6973e8523200ccc232695742e269d"}, + {file = 
"pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_14_0_arm64.whl", hash = "sha256:e07b59a5998c6d4cac258763c6c160234e1e3362a2097808bd02e05e0c16208a"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde005aa075fc0e5ed0095438b0a4d39534e3cb84889b93d6aa265dd2e072eff"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:1950f2186f1c99e0d187ffee86e2f8d6bbbad9b0079573a7255b85ffaaa82e79"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_12_0_x86_64.whl", hash = "sha256:273b4b642168a5e64fedc0073e18fd481b11d6891f9e44ceb5ce27126fe418f7"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:9a2dbc621cdd4f0c92f5b2520f2b266b976317ff8a984aec2ce9240ee3d80471"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:34c2d6e9d027b66f8d531fcefeb5cda8b2a37e70170c01f6f1c977954d733c45"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e97fb65862db191685355e1eb8d97d41d00679a3df1fbd7a1c2560b9e3e6d8"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:98a0de3c2f194907b07522769facbcacdff0ec9577f9710273ba7e0aa8465652"}, + {file = "pyiceberg-0.7.1.tar.gz", hash = "sha256:2fd8f9717b02673cb9cabe7aed82fc38933241b2bd15cbdc1ff7371e70317a47"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0,<2025.1.0" +mmh3 = ">=4.0.0,<5.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = ">=2.20.0,<3.0.0" +rich = ">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<9.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0,<2024.8.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] +gcsfs = ["gcsfs (>=2023.1.0,<2024.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +pyarrow = ["numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +ray = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0,<2024.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9112,6 +9195,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -10379,6 +10476,7 @@ motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] +pyiceberg = ["pyiceberg"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10392,4 +10490,4 @@ 
weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "aefcc6fc97be767f995183ea8b5bf0c3ee2388a4ca884cfb2192311f696f24f4" +content-hash = "0307148666fb501f63dde9c7869411fb6fd11f24354d0eef27822b45330b6b08" diff --git a/pyproject.toml b/pyproject.toml index 4b718b316e..f18d178c0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } aiohttp = { version = ">=3.9", optional = true } +pyiceberg = { version = ">=0.7.1", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -119,6 +120,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +pyiceberg = ["pyiceberg"] [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" @@ -134,7 +136,7 @@ sqlfluff = "^2.3.2" types-deprecated = "^1.2.9.2" pytest-console-scripts = "^1.4.1" pytest = "^7.0.0" -mypy = "^1.10.0" +mypy = ">=1.11.0,<1.13.0" flake8 = "^5.0.0" bandit = "^1.7.0" black = "^23.7.0" diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index 3c30123e1c..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 069ebd7841..a1782343fd 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1103,7 +1103,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( + expected = cast( # type: ignore[redundant-cast] List[TColumnSchema], deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), ) diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 488d7ef525..16956d9bd6 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -400,7 +400,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] 
self._next_reference = response.json().get("next_page") def update_request(self, request): From 5014f88c6ec4f594a9a50e68daad0fe8949092bb Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 11:29:14 +0400 Subject: [PATCH 02/71] extend pyiceberg dependencies --- poetry.lock | 4 ++-- pyproject.toml | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b312b222a5..ec0476851a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10476,7 +10476,7 @@ motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] -pyiceberg = ["pyiceberg"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10490,4 +10490,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "0307148666fb501f63dde9c7869411fb6fd11f24354d0eef27822b45330b6b08" +content-hash = "002489e2418eb2c671e2380c11bf50a5cd246a4df729deb861357a6d00b50841" diff --git a/pyproject.toml b/pyproject.toml index f18d178c0b..ea13367f1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,11 @@ paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } aiohttp = { version = ">=3.9", optional = true } +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead pyiceberg = { version = ">=0.7.1", optional = true } [tool.poetry.extras] @@ -120,7 +125,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] -pyiceberg = ["pyiceberg"] +pyiceberg = ["pyiceberg", "pyarrow", "sqlalchemy"] [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" From c632dd7d1896557e811fad57e4be6d072c8f479a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 11:32:00 +0400 Subject: [PATCH 03/71] remove redundant delta annotation --- tests/load/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index 740dafba76..e7b07e63c1 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -609,7 +609,6 @@ def destinations_configs( DestinationTestConfiguration( destination_type="filesystem", bucket_url=bucket, - extra_info=bucket + "-delta", table_format="delta", supports_merge=True, env_vars=( From a3f6587c4c513cd6d356b057aa8fba4c8d102d3f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 11:54:43 +0400 Subject: [PATCH 04/71] add basic local filesystem iceberg support --- .github/workflows/test_local_destinations.yml | 5 +- dlt/common/libs/pyiceberg.py | 99 +++++++++++++++++++ dlt/destinations/impl/filesystem/factory.py | 4 +- .../impl/filesystem/filesystem.py | 65 +++++++++--- .../load/pipeline/test_filesystem_pipeline.py | 26 +++-- tests/load/utils.py | 7 ++ tests/pipeline/utils.py | 13 +++ 7 files changed, 194 insertions(+), 25 deletions(-) create mode 100644 dlt/common/libs/pyiceberg.py diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 
a4548f6529..a945d792e7 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py new file mode 100644 index 0000000000..3b136d099a --- /dev/null +++ b/dlt/common/libs/pyiceberg.py @@ -0,0 +1,99 @@ +from typing import Dict +import os + +from dlt import version, Pipeline +from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.utils import assert_min_pkg_version +from dlt.common.exceptions import MissingDependencyException +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + +assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg="`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination.", +) + +try: + from pyiceberg.table import Table as IcebergTable + from pyiceberg.catalog.sql import SqlCatalog + import pyarrow as pa +except ModuleNotFoundError: + raise MissingDependencyException( + "dlt pyiceberg helpers", + [f"{version.DLT_PKG_NAME}[pyiceberg]"], + "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.", + ) + + +DLT_ICEBERG_CATALOGS_DIR = "dlt_iceberg_catalogs" +DLT_ICEBERG_NAMESPACE = "dlt" + + +def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: + ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128 + } + return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP) + + +def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table: + schema = ensure_iceberg_compatible_arrow_schema(data.schema) + return data.cast(schema) + + +def write_iceberg_table( + table: IcebergTable, + data: pa.Table, + write_disposition: TWriteDisposition, +) -> None: + if write_disposition == "append": + table.append(ensure_iceberg_compatible_arrow_data(data)) + elif write_disposition == "replace": + table.overwrite(ensure_iceberg_compatible_arrow_data(data)) + + +def get_catalog( + client: FilesystemClient, + table_name: str, +) -> SqlCatalog: + catalogs_dir = client.dataset_path + "/" + DLT_ICEBERG_CATALOGS_DIR + os.makedirs(catalogs_dir, exist_ok=True) + return SqlCatalog( + "default", + uri=f"sqlite:///{catalogs_dir}/{table_name}_catalog.db", + ) + + +def get_iceberg_tables( + pipeline: Pipeline, *tables: str, schema_name: str = None +) -> Dict[str, IcebergTable]: + from dlt.common.schema.utils import get_table_format + + with pipeline.destination_client(schema_name=schema_name) as client: + assert isinstance( + client, FilesystemClient + ), "The `get_iceberg_tables` function requires a 
`filesystem` destination." + + schema_iceberg_tables = [ + t["name"] + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "iceberg" + ] + if len(tables) > 0: + invalid_tables = set(tables) - set(schema_iceberg_tables) + if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" + raise ValueError( + f"Schema {client.schema.name} does not contain Iceberg tables with these names:" + f" {', '.join(invalid_tables)}.{available_schemas}" + ) + schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] + + return { + name: get_catalog(client, name).load_table(f"{DLT_ICEBERG_NAMESPACE}.{name}") + for name in schema_iceberg_tables + } diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 2463da58fa..906bd157e4 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector( *, table_schema: TTableSchema, ) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": + if table_schema.get("table_format") in ("delta", "iceberg"): return ("parquet", ["parquet"]) return (preferred_loader_file_format, supported_loader_file_formats) @@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities( preferred_loader_file_format="jsonl", loader_file_format_selector=filesystem_loader_file_format_selector, - supported_table_formats=["delta"], + supported_table_formats=["delta", "iceberg"], supported_merge_strategies=["upsert"], merge_strategies_selector=filesystem_merge_strategies_selector, ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 0cf63b3ac9..ea03a85fb5 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -120,16 +120,23 @@ def metrics(self) -> Optional[LoadJobMetrics]: return m._replace(remote_url=self.make_remote_url()) -class DeltaLoadFilesystemJob(FilesystemLoadJob): +class TableFormatLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: super().__init__(file_path=file_path) self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) def make_remote_path(self) -> str: - # remote path is table dir - delta will create its file structure inside it return self._job_client.get_table_dir(self.load_table_name) + @property + def arrow_dataset(self) -> Any: + from dlt.common.libs.pyarrow import pyarrow + + return pyarrow.dataset.dataset(self.file_paths) + + +class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa @@ -139,7 +146,7 @@ def run(self) -> None: f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow" f" buffer: {pa.total_allocated_bytes()}]" ) - source_ds = pa.dataset.dataset(self.file_paths) + source_ds = self.arrow_dataset delta_table = self._delta_table() # explicitly check if there is data @@ -212,6 +219,28 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta return _evolve_delta_table_schema(delta_table, arrow_ds.schema) +class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): + def 
run(self) -> None: + from dlt.common.libs.pyiceberg import ( + DLT_ICEBERG_NAMESPACE, + ensure_iceberg_compatible_arrow_schema, + write_iceberg_table, + get_catalog, + ) + + arrow_table = self.arrow_dataset.to_table() + catalog = get_catalog(self._job_client, self.load_table_name) + catalog.create_namespace_if_not_exists(DLT_ICEBERG_NAMESPACE) + table = catalog.create_table_if_not_exists( + f"{DLT_ICEBERG_NAMESPACE}.{self.load_table_name}", + schema=ensure_iceberg_compatible_arrow_schema(arrow_table.schema), + location=self.make_remote_url(), + ) + write_iceberg_table( + table=table, data=arrow_table, write_disposition=self._load_table["write_disposition"] + ) + + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) @@ -373,9 +402,13 @@ def update_stored_schema( ) -> TSchemaTables: applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables - table_names = only_tables or self.schema.tables.keys() + tables = self.schema.tables + table_names = only_tables or tables.keys() dirs_to_create = self.get_table_dirs(table_names) for tables_name, directory in zip(table_names, dirs_to_create): + if tables[tables_name].get("table_format") in ("delta", "iceberg"): + # let table format libs manage table directory + continue self.fs_client.makedirs(directory, exist_ok=True) # we need to mark the folders of the data tables as initialized if tables_name in self.schema.dlt_table_names(): @@ -459,12 +492,20 @@ def create_load_job( # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination: return FinalizedLoadJob(file_path) - if table.get("table_format") == "delta": - import dlt.common.libs.deltalake # assert dependencies are installed + table_format = table.get("table_format") + if table_format in ("delta", "iceberg"): # a reference job for a delta table indicates a table chain followup job if ReferenceFollowupJobRequest.is_reference_job(file_path): - return DeltaLoadFilesystemJob(file_path) + if table_format == "delta": + import dlt.common.libs.deltalake + + return DeltaLoadFilesystemJob(file_path) + elif table_format == "iceberg": + import dlt.common.libs.pyiceberg + + return IcebergLoadFilesystemJob(file_path) + # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -495,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool: def should_truncate_table_before_load(self, table_name: str) -> bool: table = self.prepare_load_table(table_name) - return ( - table["write_disposition"] == "replace" - and not table.get("table_format") == "delta" # Delta can do a logical replace - ) + return table["write_disposition"] == "replace" and not table.get("table_format") in ( + "delta", + "iceberg", + ) # Delta/Iceberg can do a logical replace # # state stuff @@ -719,7 +760,7 @@ def create_table_chain_completed_followup_jobs( jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs ) - if table_chain[0].get("table_format") == "delta": + if table_chain[0].get("table_format") in ("delta", "iceberg"): for table in table_chain: table_job_paths = [ job.file_path diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8d890642ee..79acecac74 100644 --- 
a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -258,44 +258,49 @@ def foo(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_core( +def test_table_format_core( destination_config: DestinationTestConfiguration, ) -> None: - """Tests core functionality for `delta` table format. + """Tests core functionality for `delta` and `iceberg` table formats. Tests all data types, all filesystems. Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - - from dlt.common.libs.deltalake import get_delta_tables + if ( + destination_config.table_format == "iceberg" + and destination_config.bucket_url != FILE_BUCKET + ): + pytest.skip("remote filesystems not yet implemented for `iceberg`") + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables # create resource that yields rows with all data types column_schemas, row = table_update_and_row() - @dlt.resource(columns=column_schemas, table_format="delta") + @dlt.resource(columns=column_schemas, table_format=destination_config.table_format) def data_types(): nonlocal row yield [row] * 10 pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - # run pipeline, this should create Delta table + # run pipeline, this should create table info = pipeline.run(data_types()) assert_load_info(info) - # `delta` table format should use `parquet` file format + # table formats should use `parquet` file format completed_jobs = info.load_packages[0].jobs["completed_jobs"] data_types_jobs = [ job for job in completed_jobs if job.job_file_info.table_name == "data_types" ] assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) - # 10 rows should be loaded to the Delta table and the content of the first + # 10 rows should be loaded to the table and the content of the first # row should match expected values rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -322,7 +327,8 @@ def data_types(): # should do logical replace, increasing the table version info = pipeline.run(data_types(), write_disposition="replace") assert_load_info(info) - assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 + if destination_config.table_format == "delta": + assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 diff --git a/tests/load/utils.py b/tests/load/utils.py index e7b07e63c1..21fa2763a9 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -622,6 +622,13 @@ def destinations_configs( ), ) ] + destination_configs += [ + DestinationTestConfiguration( + destination_type="filesystem", + bucket_url=bucket, + table_format="iceberg", + ) + ] # filter out non active destinations destination_configs = [ diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0ae734f72e..e72a27c827 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -197,10 +197,23 @@ def _load_tables_to_dicts_fs( delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) + iceberg_table_names = [ + table_name + for table_name in table_names + if 
get_table_format(client.schema.tables, table_name) == "iceberg" + ] + if len(iceberg_table_names) > 0: + from dlt.common.libs.pyiceberg import get_iceberg_tables + + iceberg_tables = get_iceberg_tables(p, *table_names, schema_name=schema_name) + for table_name in table_names: if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() + elif table_name in client.schema.data_table_names() and table_name in iceberg_table_names: + it = iceberg_tables[table_name] + result[table_name] = it.scan().to_arrow().to_pylist() else: table_files = client.list_table_files(table_name) for file in table_files: From 87553a68dc6ae7600c11f5e8be1754edd06a32cf Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 11:56:04 +0400 Subject: [PATCH 05/71] add active table format setting --- tests/load/utils.py | 6 ++++++ tests/utils.py | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/tests/load/utils.py b/tests/load/utils.py index 21fa2763a9..9ea8f8f386 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -57,6 +57,7 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import ( ACTIVE_DESTINATIONS, + ACTIVE_TABLE_FORMATS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS, @@ -635,6 +636,11 @@ def destinations_configs( conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] + # filter out non active destinations + destination_configs = [ + conf for conf in destination_configs if conf.table_format in ACTIVE_TABLE_FORMATS + ] + # filter out destinations not in subset if subset: destination_configs = [ diff --git a/tests/utils.py b/tests/utils.py index 8ae301a4ab..e5b5ef1bc2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from dlt.common.runtime.run_context import DOT_DLT, RunContext from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry from dlt.common.schema import Schema +from dlt.common.schema.typing import TTableFormat from dlt.common.storages import FileStorage from dlt.common.storages.versioned_storage import VersionedStorage from dlt.common.typing import DictStrAny, StrAny, TDataItem @@ -88,6 +89,12 @@ ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) +# filter out active table formats for current tests +IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat)) +ACTIVE_TABLE_FORMATS = set( + dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS +) + # sanity checks assert len(ACTIVE_DESTINATIONS) >= 0, "No active destinations selected" From 10121be7d4649031fedfe63f2c6f0859efab1006 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 13:15:25 +0400 Subject: [PATCH 06/71] disable merge tests for iceberg table format --- tests/load/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/load/utils.py b/tests/load/utils.py index b038daf338..458826aea3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -624,6 +624,7 @@ def destinations_configs( destination_type="filesystem", bucket_url=bucket, table_format="iceberg", + supports_merge=False, ) ] From 23c4db3e3a68d8da0105b0f9f44d59d7b3dc0459 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 16:01:06 +0400 Subject: [PATCH 07/71] restore non-redundant extra info --- tests/load/utils.py | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/tests/load/utils.py b/tests/load/utils.py index 458826aea3..3de8dcd1fe 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -605,6 +605,7 @@ def destinations_configs( DestinationTestConfiguration( destination_type="filesystem", bucket_url=bucket, + extra_info=bucket, table_format="delta", supports_merge=True, file_format="parquet", @@ -623,6 +624,7 @@ def destinations_configs( DestinationTestConfiguration( destination_type="filesystem", bucket_url=bucket, + extra_info=bucket, table_format="iceberg", supports_merge=False, ) From 195ee4cae2233ede638b5ec2c272ab9a5c938bdf Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 16:36:24 +0400 Subject: [PATCH 08/71] refactor to in-memory iceberg catalog --- dlt/common/libs/pyiceberg.py | 37 ++++++++++++++++--- .../impl/filesystem/filesystem.py | 31 ++++++++-------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 3b136d099a..b7b5cb8307 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -2,7 +2,7 @@ import os from dlt import version, Pipeline -from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.libs.pyarrow import cast_arrow_schema_types, columns_to_arrow from dlt.common.schema.typing import TWriteDisposition from dlt.common.utils import assert_min_pkg_version from dlt.common.exceptions import MissingDependencyException @@ -58,12 +58,39 @@ def get_catalog( client: FilesystemClient, table_name: str, ) -> SqlCatalog: - catalogs_dir = client.dataset_path + "/" + DLT_ICEBERG_CATALOGS_DIR - os.makedirs(catalogs_dir, exist_ok=True) - return SqlCatalog( + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + warehouse_path = client.dataset_path + catalogs_path = warehouse_path + "/" + DLT_ICEBERG_CATALOGS_DIR + os.makedirs(catalogs_path, exist_ok=True) + + # create in-memory catalog + catalog = SqlCatalog( "default", - uri=f"sqlite:///{catalogs_dir}/{table_name}_catalog.db", + uri="sqlite:///:memory:", + warehouse=client.make_remote_url(warehouse_path), ) + catalog.create_namespace(DLT_ICEBERG_NAMESPACE) + + # add table to catalog + table_id = f"{DLT_ICEBERG_NAMESPACE}.{table_name}" + table_path = f"{warehouse_path}/{table_name}" + metadata_path = f"{table_path}/metadata" + if client.fs_client.exists(metadata_path): + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1]) + catalog.register_table(table_id, last_metadata_file) + else: + arrow_schema = columns_to_arrow( + columns=client.schema.get_table_columns(table_name), + caps=client.capabilities, + ) + catalog.create_table( + table_id, + schema=ensure_iceberg_compatible_arrow_schema(arrow_schema), + location=client.make_remote_url(table_path), + ) + + return catalog def get_iceberg_tables( diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index ea03a85fb5..111059eb41 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -221,25 +221,26 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: - from dlt.common.libs.pyiceberg import ( - DLT_ICEBERG_NAMESPACE, - ensure_iceberg_compatible_arrow_schema, - write_iceberg_table, - get_catalog, - ) + from 
dlt.common.libs.pyiceberg import write_iceberg_table - arrow_table = self.arrow_dataset.to_table() - catalog = get_catalog(self._job_client, self.load_table_name) - catalog.create_namespace_if_not_exists(DLT_ICEBERG_NAMESPACE) - table = catalog.create_table_if_not_exists( - f"{DLT_ICEBERG_NAMESPACE}.{self.load_table_name}", - schema=ensure_iceberg_compatible_arrow_schema(arrow_table.schema), - location=self.make_remote_url(), - ) write_iceberg_table( - table=table, data=arrow_table, write_disposition=self._load_table["write_disposition"] + table=self._iceberg_table(), + data=self.arrow_dataset.to_table(), + write_disposition=self._load_table["write_disposition"], ) + def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyiceberg import get_catalog + + catalog = get_catalog(self._job_client, self.load_table_name) + return catalog.load_table(self.table_identifier) + + @property + def table_identifier(self) -> str: + from dlt.common.libs.pyiceberg import DLT_ICEBERG_NAMESPACE + + return f"{DLT_ICEBERG_NAMESPACE}.{self.load_table_name}" + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: From ee6e22e372a913ffaddcb2c265aa57483b111980 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 15 Nov 2024 18:26:02 +0400 Subject: [PATCH 09/71] add s3 support for iceberg table format --- .github/workflows/test_destinations.yml | 5 ++++- .../configuration/specs/aws_credentials.py | 13 ++++++++++++- dlt/common/configuration/specs/mixins.py | 12 ++++++++++++ dlt/common/libs/pyiceberg.py | 18 +++++++++++------- .../load/pipeline/test_filesystem_pipeline.py | 9 +++++---- 5 files changed, 44 insertions(+), 13 deletions(-) create mode 100644 dlt/common/configuration/specs/mixins.py diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index df398e13ad..a784d25794 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,10 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 5f69be6a33..3a145b9a76 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -16,7 +17,7 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration): +class AwsCredentialsWithoutDefaults(CredentialsConfiguration, WithPyicebergConfig): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None 
@@ -77,6 +78,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "s3.access-key-id": self.aws_access_key_id, + "s3.secret-access-key": self.aws_secret_access_key, + "s3.session-token": self.aws_session_token, + "s3.region": self.region_name, + "s3.endpoint": self.endpoint_url, + "s3.connect-timeout": 300, + } + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py new file mode 100644 index 0000000000..7a3c1b66e8 --- /dev/null +++ b/dlt/common/configuration/specs/mixins.py @@ -0,0 +1,12 @@ +from typing import Dict, Any +from abc import abstractmethod, ABC + + +class WithPyicebergConfig(ABC): + @abstractmethod + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + """Returns `pyiceberg` FileIO configuration dictionary. + + https://py.iceberg.apache.org/configuration/#fileio + """ + pass diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index b7b5cb8307..9b33bef000 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Any import os from dlt import version, Pipeline @@ -6,6 +6,8 @@ from dlt.common.schema.typing import TWriteDisposition from dlt.common.utils import assert_min_pkg_version from dlt.common.exceptions import MissingDependencyException +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.mixins import WithPyicebergConfig from dlt.destinations.impl.filesystem.filesystem import FilesystemClient assert_min_pkg_version( @@ -26,7 +28,6 @@ ) -DLT_ICEBERG_CATALOGS_DIR = "dlt_iceberg_catalogs" DLT_ICEBERG_NAMESPACE = "dlt" @@ -59,21 +60,18 @@ def get_catalog( table_name: str, ) -> SqlCatalog: """Returns single-table, ephemeral, in-memory Iceberg catalog.""" - warehouse_path = client.dataset_path - catalogs_path = warehouse_path + "/" + DLT_ICEBERG_CATALOGS_DIR - os.makedirs(catalogs_path, exist_ok=True) # create in-memory catalog catalog = SqlCatalog( "default", uri="sqlite:///:memory:", - warehouse=client.make_remote_url(warehouse_path), + **_get_fileio_config(client.config.credentials), ) catalog.create_namespace(DLT_ICEBERG_NAMESPACE) # add table to catalog table_id = f"{DLT_ICEBERG_NAMESPACE}.{table_name}" - table_path = f"{warehouse_path}/{table_name}" + table_path = f"{client.dataset_path}/{table_name}" metadata_path = f"{table_path}/metadata" if client.fs_client.exists(metadata_path): metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] @@ -124,3 +122,9 @@ def get_iceberg_tables( name: get_catalog(client, name).load_table(f"{DLT_ICEBERG_NAMESPACE}.{name}") for name in schema_iceberg_tables } + + +def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: + if isinstance(credentials, WithPyicebergConfig): + return credentials.to_pyiceberg_fileio_config() + return {} diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 79acecac74..3eb00f60f6 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -32,6 +32,7 @@ DestinationTestConfiguration, MEMORY_BUCKET, FILE_BUCKET, + AWS_BUCKET, AZ_BUCKET, SFTP_BUCKET, ) @@ -271,11 +272,11 @@ def test_table_format_core( Tests all data types, all filesystems. 
Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - if ( - destination_config.table_format == "iceberg" - and destination_config.bucket_url != FILE_BUCKET + if destination_config.table_format == "iceberg" and destination_config.bucket_url not in ( + FILE_BUCKET, + AWS_BUCKET, ): - pytest.skip("remote filesystems not yet implemented for `iceberg`") + pytest.skip("only local and S3 filesystems are currently implemented `iceberg`") if destination_config.table_format == "delta": from dlt.common.libs.deltalake import get_delta_tables From bc51008bd2d41f9b3d9bcbcfff3cdb0aaf85e962 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 16 Nov 2024 17:53:56 +0400 Subject: [PATCH 10/71] add schema evolution support for iceberg table format --- dlt/common/libs/pyiceberg.py | 17 ++-- .../impl/filesystem/filesystem.py | 2 +- .../load/pipeline/test_filesystem_pipeline.py | 89 ++++++++++++++----- 3 files changed, 79 insertions(+), 29 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 9b33bef000..ecc682ea11 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -58,6 +58,7 @@ def write_iceberg_table( def get_catalog( client: FilesystemClient, table_name: str, + schema: pa.Schema = None, ) -> SqlCatalog: """Returns single-table, ephemeral, in-memory Iceberg catalog.""" @@ -74,17 +75,21 @@ def get_catalog( table_path = f"{client.dataset_path}/{table_name}" metadata_path = f"{table_path}/metadata" if client.fs_client.exists(metadata_path): + # found metadata; register existing table metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1]) - catalog.register_table(table_id, last_metadata_file) + table = catalog.register_table(table_id, last_metadata_file) + + # evolve schema + if schema is not None: + with table.update_schema() as update: + update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema)) else: - arrow_schema = columns_to_arrow( - columns=client.schema.get_table_columns(table_name), - caps=client.capabilities, - ) + # found no metadata; create new table + assert schema is not None catalog.create_table( table_id, - schema=ensure_iceberg_compatible_arrow_schema(arrow_schema), + schema=ensure_iceberg_compatible_arrow_schema(schema), location=client.make_remote_url(table_path), ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 111059eb41..6ae6196e07 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -232,7 +232,7 @@ def run(self) -> None: def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.pyiceberg import get_catalog - catalog = get_catalog(self._job_client, self.load_table_name) + catalog = get_catalog(self._job_client, self.load_table_name, self.arrow_dataset.schema) return catalog.load_table(self.table_identifier) @property diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 3eb00f60f6..5afe94e40e 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -2,7 +2,7 @@ import os import posixpath from pathlib import Path -from typing import Any, Callable, List, Dict, cast +from typing import Any, Callable, List, Dict, cast, Tuple from importlib.metadata import version as 
pkg_version from packaging.version import Version @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableFormat from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -653,7 +653,7 @@ def test_delta_table_partitioning_arrow_load_id( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, @@ -666,20 +666,58 @@ def test_delta_table_partitioning_arrow_load_id( pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), ), ) -def test_delta_table_schema_evolution( +def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: """Tests schema evolution (adding new columns) for `delta` table format.""" - from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data - from dlt.common.libs.pyarrow import pyarrow + if destination_config.table_format == "iceberg" and write_disposition == { + "disposition": "merge", + "strategy": "upsert", + }: + pytest.skip("`upsert` currently not implemented for `iceberg`") + + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + def get_expected_actual( + table_name: str, table_format: TTableFormat, arrow_table: pyarrow.Table + ) -> Tuple[pyarrow.Table, pyarrow.Table]: + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) @dlt.resource( write_disposition=write_disposition, primary_key="pk", - table_format="delta", + table_format=destination_config.table_format, ) - def delta_table(data): + def evolving_table(data): yield data pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -691,11 +729,11 @@ def delta_table(data): assert arrow_table.shape == (1, 1) # initial load - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() + expected, actual = get_expected_actual( + "evolving_table", destination_config.table_format, arrow_table + ) assert actual.equals(expected) # create Arrow table with many columns, two rows @@ -710,11 
+748,11 @@ def delta_table(data): arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) # second load — this should evolve the schema (i.e. add the new columns) - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected = ensure_delta_compatible_arrow_data(arrow_table) + expected, actual = get_expected_actual( + "evolving_table", destination_config.table_format, arrow_table + ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is # more involved than with the other dispositions @@ -731,13 +769,20 @@ def delta_table(data): empty_arrow_table = arrow_table.schema.empty_table() # load 3 — this should evolve the schema without changing data - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema - assert actual.schema.equals(expected_schema) - expected_num_rows = 3 if write_disposition == "append" else 2 + expected, actual = get_expected_actual( + "evolving_table", destination_config.table_format, arrow_table + ) + assert actual.schema.equals(expected.schema) + if write_disposition == "append": + expected_num_rows = 3 + elif write_disposition == "replace" and destination_config.table_format == "delta": + expected_num_rows = 2 + elif write_disposition == "replace" and destination_config.table_format == "iceberg": + expected_num_rows = 0 + elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: + expected_num_rows = 2 assert actual.num_rows == expected_num_rows # new column should have NULLs only assert ( From 2de58a23ac09d453077351236dae2082495a7a25 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 16 Nov 2024 18:03:18 +0400 Subject: [PATCH 11/71] extract _register_table function --- dlt/common/libs/pyiceberg.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index ecc682ea11..e933953450 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -76,9 +76,7 @@ def get_catalog( metadata_path = f"{table_path}/metadata" if client.fs_client.exists(metadata_path): # found metadata; register existing table - metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] - last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1]) - table = catalog.register_table(table_id, last_metadata_file) + table = _register_table(table_id, metadata_path, catalog, client) # evolve schema if schema is not None: @@ -133,3 +131,15 @@ def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: if isinstance(credentials, WithPyicebergConfig): return credentials.to_pyiceberg_fileio_config() return {} + + +def _register_table( + identifier: str, + metadata_path: str, + catalog: SqlCatalog, + client: FilesystemClient, +) -> IcebergTable: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1]) + return catalog.register_table(identifier, last_metadata_file) From 
dd4ad0f9fb05dd40e7f2e797617e53844acc36cd Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 20 Nov 2024 23:12:51 +0400 Subject: [PATCH 12/71] add partition support for iceberg table format --- dlt/common/libs/pyiceberg.py | 11 +++- .../impl/filesystem/filesystem.py | 15 +++-- .../load/pipeline/test_filesystem_pipeline.py | 57 ++++++++++++------- 3 files changed, 54 insertions(+), 29 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index e933953450..9bc87b49f8 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Dict, Any, List import os from dlt import version, Pipeline @@ -59,6 +59,7 @@ def get_catalog( client: FilesystemClient, table_name: str, schema: pa.Schema = None, + partition_columns: List[str] = None, ) -> SqlCatalog: """Returns single-table, ephemeral, in-memory Iceberg catalog.""" @@ -85,11 +86,15 @@ def get_catalog( else: # found no metadata; create new table assert schema is not None - catalog.create_table( + with catalog.create_table_transaction( table_id, schema=ensure_iceberg_compatible_arrow_schema(schema), location=client.make_remote_url(table_path), - ) + ) as txn: + # add partitioning + with txn.update_spec() as update_spec: + for col in partition_columns: + update_spec.add_identity(col) return catalog diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 6ae6196e07..1744a6aa11 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -135,6 +135,10 @@ def arrow_dataset(self) -> Any: return pyarrow.dataset.dataset(self.file_paths) + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: @@ -196,10 +200,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] else: return None - @property - def _partition_columns(self) -> List[str]: - return get_columns_names_with_prop(self._load_table, "partition") - def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import ( DeltaTable, @@ -232,7 +232,12 @@ def run(self) -> None: def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.pyiceberg import get_catalog - catalog = get_catalog(self._job_client, self.load_table_name, self.arrow_dataset.schema) + catalog = get_catalog( + client=self._job_client, + table_name=self.load_table_name, + schema=self.arrow_dataset.schema, + partition_columns=self._partition_columns, + ) return catalog.load_table(self.table_identifier) @property diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 5afe94e40e..6edf2e09cb 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -516,34 +516,47 @@ def nested_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_partitioning( +def test_table_format_partitioning( destination_config: DestinationTestConfiguration, ) -> None: - """Tests 
partitioning for `delta` table format.""" + """Tests partitioning for `delta` and `iceberg` table formats.""" - from dlt.common.libs.deltalake import get_delta_tables from tests.pipeline.utils import users_materialize_table_schema + def assert_partition_columns( + table_name: str, table_format: TTableFormat, expected_partition_columns: List[str] + ) -> None: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + actual_partition_columns = dt.metadata().partition_columns + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + actual_partition_columns = [f.name for f in it.metadata.specs_struct().fields] + assert actual_partition_columns == expected_partition_columns + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) # zero partition columns - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def zero_part(): yield {"foo": 1, "bar": 1} info = pipeline.run(zero_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 # one partition column - @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + @dlt.resource(table_format=destination_config.table_format, columns={"c1": {"partition": True}}) def one_part(): yield [ {"c1": "foo", "c2": 1}, @@ -554,13 +567,13 @@ def one_part(): info = pipeline.run(one_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "one_part")["one_part"] - assert dt.metadata().partition_columns == ["c1"] + assert_partition_columns("one_part", destination_config.table_format, ["c1"]) assert load_table_counts(pipeline, "one_part")["one_part"] == 4 # two partition columns @dlt.resource( - table_format="delta", columns={"c1": {"partition": True}, "c2": {"partition": True}} + table_format=destination_config.table_format, + columns={"c1": {"partition": True}, "c2": {"partition": True}}, ) def two_part(): yield [ @@ -572,29 +585,31 @@ def two_part(): info = pipeline.run(two_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "two_part")["two_part"] - assert dt.metadata().partition_columns == ["c1", "c2"] + assert_partition_columns("two_part", destination_config.table_format, ["c1", "c2"]) assert load_table_counts(pipeline, "two_part")["two_part"] == 4 # test partitioning with empty source users_materialize_table_schema.apply_hints( - table_format="delta", + table_format=destination_config.table_format, columns={"id": {"partition": True}}, ) info = pipeline.run(users_materialize_table_schema()) assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.metadata().partition_columns == ["id"] + assert_partition_columns("users", destination_config.table_format, ["id"]) assert load_table_counts(pipeline, "users")["users"] == 0 # changing partitioning after initial table creation is not supported zero_part.apply_hints(columns={"foo": {"partition": True}}) - with pytest.raises(PipelineStepFailed) as pip_ex: + if destination_config.table_format == "delta": + # Delta raises error when trying to change partitioning + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, 
LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + elif destination_config.table_format == "iceberg": + # while Iceberg supports partition evolution, we don't apply it pipeline.run(zero_part()) - assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) - assert "partitioning" in pip_ex.value.__context__.retry_message - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) @pytest.mark.parametrize( From 04be59bdd434ffee7694ca7ce91c89b24752c351 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 20 Nov 2024 23:13:08 +0400 Subject: [PATCH 13/71] update docstring --- tests/load/pipeline/test_filesystem_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 6edf2e09cb..178282afcf 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -685,7 +685,7 @@ def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: - """Tests schema evolution (adding new columns) for `delta` table format.""" + """Tests schema evolution (adding new columns) for `delta` and `iceberg` table formats.""" if destination_config.table_format == "iceberg" and write_disposition == { "disposition": "merge", "strategy": "upsert", From 42f59c76031a4ee88e6b09392cae87721eb9e51f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 21 Nov 2024 11:58:12 +0400 Subject: [PATCH 14/71] enable child table test for iceberg table format --- .../load/pipeline/test_filesystem_pipeline.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 178282afcf..0665f4a255 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -429,17 +429,17 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_child_tables( +def test_table_format_child_tables( destination_config: DestinationTestConfiguration, ) -> None: - """Tests child table handling for `delta` table format.""" + """Tests child table handling for `delta` and `iceberg` table formats.""" - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def nested_table(): yield [ { @@ -501,15 +501,16 @@ def nested_table(): assert len(rows_dict["nested_table__child"]) == 3 assert len(rows_dict["nested_table__child__grandchild"]) == 5 - # now drop children and grandchildren, use merge write disposition to create and pass full table chain - # also for tables that do not have jobs - info = pipeline.run( - [{"foo": 3}] * 10000, - table_name="nested_table", - primary_key="foo", - write_disposition="merge", - ) - assert_load_info(info) + if destination_config.supports_merge: + # now drop children and grandchildren, use merge write disposition to create and pass full table chain + # also for tables that do not have jobs + info = pipeline.run( + [{"foo": 3}] * 10000, + table_name="nested_table", + primary_key="foo", + 
write_disposition="merge", + ) + assert_load_info(info) @pytest.mark.parametrize( From a540135c336ad8af1138b73edc1674321d293bba Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 21 Nov 2024 23:05:57 +0400 Subject: [PATCH 15/71] enable empty source test for iceberg table format --- .../load/pipeline/test_filesystem_pipeline.py | 166 ++++++++++-------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 0665f4a255..c021686d11 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -224,6 +224,48 @@ def some_source(): assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +# here start the `table_format` tests + + +def get_expected_actual( + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + arrow_table: "pyarrow.Table", # type: ignore[name-defined] # noqa: F821 +) -> Tuple["pyarrow.Table", "pyarrow.Table"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) + + @pytest.mark.skip( reason="pyarrow version check not needed anymore, since we have 17 as a dependency" ) @@ -693,40 +735,7 @@ def test_table_format_schema_evolution( }: pytest.skip("`upsert` currently not implemented for `iceberg`") - from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types - - def get_expected_actual( - table_name: str, table_format: TTableFormat, arrow_table: pyarrow.Table - ) -> Tuple[pyarrow.Table, pyarrow.Table]: - if table_format == "delta": - from dlt.common.libs.deltalake import ( - get_delta_tables, - ensure_delta_compatible_arrow_data, - ) - - dt = get_delta_tables(pipeline, table_name)[table_name] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() - elif table_format == "iceberg": - from dlt.common.libs.pyiceberg import ( - get_iceberg_tables, - ensure_iceberg_compatible_arrow_data, - ) - - it = get_iceberg_tables(pipeline, table_name)[table_name] - expected = ensure_iceberg_compatible_arrow_data(arrow_table) - actual = it.scan().to_arrow() - - # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 - schema = cast_arrow_schema_types( - actual.schema, - { - pyarrow.types.is_large_string: pyarrow.string(), - pyarrow.types.is_large_binary: pyarrow.binary(), - }, - ) - actual = actual.cast(schema) - return (expected, actual) + from dlt.common.libs.pyarrow import pyarrow @dlt.resource( write_disposition=write_disposition, @@ -748,7 +757,7 @@ def evolving_table(data): info = 
pipeline.run(evolving_table(arrow_table)) assert_load_info(info) expected, actual = get_expected_actual( - "evolving_table", destination_config.table_format, arrow_table + pipeline, "evolving_table", destination_config.table_format, arrow_table ) assert actual.equals(expected) @@ -767,7 +776,7 @@ def evolving_table(data): info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) expected, actual = get_expected_actual( - "evolving_table", destination_config.table_format, arrow_table + pipeline, "evolving_table", destination_config.table_format, arrow_table ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is @@ -788,7 +797,7 @@ def evolving_table(data): info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) expected, actual = get_expected_actual( - "evolving_table", destination_config.table_format, arrow_table + pipeline, "evolving_table", destination_config.table_format, arrow_table ) assert actual.schema.equals(expected.schema) if write_disposition == "append": @@ -811,23 +820,38 @@ def evolving_table(data): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_empty_source( +def test_table_format_empty_source( destination_config: DestinationTestConfiguration, ) -> None: - """Tests empty source handling for `delta` table format. + """Tests empty source handling for `delta` and `iceberg` table formats. Tests both empty Arrow table and `dlt.mark.materialize_table_schema()`. """ - from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_data, get_delta_tables from tests.pipeline.utils import users_materialize_table_schema - @dlt.resource(table_format="delta") - def delta_table(data): + def get_table_version( # type: ignore[return] + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + ) -> int: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + return dt.version() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + return it.last_sequence_number - 1 # subtract 1 to match `delta` + + @dlt.resource(table_format=destination_config.table_format) + def a_table(data): yield data # create empty Arrow table with schema @@ -847,49 +871,49 @@ def delta_table(data): # run 1: empty Arrow table with schema # this should create empty Delta table with same schema as Arrow table - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (0, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 0 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (0, expected.num_columns) + assert actual.schema.equals(expected.schema) # run 2: non-empty Arrow table with same schema as run 1 # this should load records into Delta table - info 
= pipeline.run(delta_table(arrow_table)) + info = pipeline.run(a_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 1 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (2, expected.num_columns) + assert actual.schema.equals(expected.schema) # now run the empty frame again - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - # use materialized list - # NOTE: this will create an empty parquet file with a schema takes from dlt schema. - # the original parquet file had a nested (struct) type in `json` field that is now - # in the delta table schema. the empty parquet file lost this information and had - # string type (converted from dlt `json`) - info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="delta_table") - assert_load_info(info) + if destination_config.table_format == "delta": + # use materialized list + # NOTE: this will create an empty parquet file with a schema takes from dlt schema. + # the original parquet file had a nested (struct) type in `json` field that is now + # in the delta table schema. the empty parquet file lost this information and had + # string type (converted from dlt `json`) + info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="a_table") + assert_load_info(info) # test `dlt.mark.materialize_table_schema()` - users_materialize_table_schema.apply_hints(table_format="delta") + users_materialize_table_schema.apply_hints(table_format=destination_config.table_format) info = pipeline.run(users_materialize_table_schema(), loader_file_format="parquet") assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.num_rows == 0 - assert "id", "name" == dt_arrow_table.schema.names[:2] + assert get_table_version(pipeline, "users", destination_config.table_format) == 0 + _, actual = get_expected_actual( + pipeline, "users", destination_config.table_format, empty_arrow_table + ) + assert actual.num_rows == 0 + assert "id", "name" == actual.schema.names[:2] @pytest.mark.parametrize( From 3d1dc6370174b8ef6ccb75e2037fd1a449364d3a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 22 Nov 2024 21:12:10 +0400 Subject: [PATCH 16/71] make iceberg catalog namespace configurable and default to dataset name --- dlt/common/libs/pyiceberg.py | 17 +++++++++-------- dlt/destinations/impl/filesystem/filesystem.py | 4 +--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 9bc87b49f8..ff31e5bd9f 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -1,5 +1,4 @@ -from typing import Dict, Any, List -import os +from typing import Dict, Any, List, Optional from dlt import version, Pipeline from dlt.common.libs.pyarrow import cast_arrow_schema_types, columns_to_arrow @@ -28,9 +27,6 @@ ) -DLT_ICEBERG_NAMESPACE = "dlt" - - def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: 
ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { pa.types.is_time: pa.string(), @@ -58,6 +54,7 @@ def write_iceberg_table( def get_catalog( client: FilesystemClient, table_name: str, + namespace_name: Optional[str] = None, schema: pa.Schema = None, partition_columns: List[str] = None, ) -> SqlCatalog: @@ -69,10 +66,14 @@ def get_catalog( uri="sqlite:///:memory:", **_get_fileio_config(client.config.credentials), ) - catalog.create_namespace(DLT_ICEBERG_NAMESPACE) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) # add table to catalog - table_id = f"{DLT_ICEBERG_NAMESPACE}.{table_name}" + table_id = f"{namespace_name}.{table_name}" table_path = f"{client.dataset_path}/{table_name}" metadata_path = f"{table_path}/metadata" if client.fs_client.exists(metadata_path): @@ -127,7 +128,7 @@ def get_iceberg_tables( schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] return { - name: get_catalog(client, name).load_table(f"{DLT_ICEBERG_NAMESPACE}.{name}") + name: get_catalog(client, name).load_table(f"{pipeline.dataset_name}.{name}") for name in schema_iceberg_tables } diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 1744a6aa11..22c03db2d7 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -242,9 +242,7 @@ def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-define @property def table_identifier(self) -> str: - from dlt.common.libs.pyiceberg import DLT_ICEBERG_NAMESPACE - - return f"{DLT_ICEBERG_NAMESPACE}.{self.load_table_name}" + return f"{self._job_client.dataset_name}.{self.load_table_name}" class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): From 59e6d08e6be71c679f2fdf1af63c4a75e6e326f9 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 22 Nov 2024 21:14:51 +0400 Subject: [PATCH 17/71] add optional typing --- dlt/common/libs/pyiceberg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index ff31e5bd9f..7eb3b772ae 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -55,8 +55,8 @@ def get_catalog( client: FilesystemClient, table_name: str, namespace_name: Optional[str] = None, - schema: pa.Schema = None, - partition_columns: List[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, ) -> SqlCatalog: """Returns single-table, ephemeral, in-memory Iceberg catalog.""" @@ -101,7 +101,7 @@ def get_catalog( def get_iceberg_tables( - pipeline: Pipeline, *tables: str, schema_name: str = None + pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None ) -> Dict[str, IcebergTable]: from dlt.common.schema.utils import get_table_format From 71e436d3f3447204be707ff7cf9f978514bca455 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 13:41:55 +0400 Subject: [PATCH 18/71] fix typo --- tests/load/pipeline/test_filesystem_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index c021686d11..5439419952 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -318,7 +318,7 @@ def test_table_format_core( FILE_BUCKET, AWS_BUCKET, ): - pytest.skip("only local and S3 
filesystems are currently implemented `iceberg`") + pytest.skip("only local and S3 filesystems are currently implemented for `iceberg`") if destination_config.table_format == "delta": from dlt.common.libs.deltalake import get_delta_tables From 2effa8faf9794c5726543b6c3bb3bfb3ccf648be Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 13:43:22 +0400 Subject: [PATCH 19/71] improve typing --- tests/load/filesystem/test_sql_client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..66561521bc 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -1,7 +1,7 @@ """Test the duckdb supported sql client for special internal features""" -from typing import Any +from typing import Optional import pytest import dlt @@ -12,6 +12,7 @@ from dlt import Pipeline from dlt.common.utils import uniq_id +from dlt.common.schema.typing import TTableFormat from tests.load.utils import ( destinations_configs, @@ -37,7 +38,7 @@ def _run_dataset_checks( pipeline: Pipeline, destination_config: DestinationTestConfiguration, secret_directory: str, - table_format: Any = None, + table_format: Optional[TTableFormat] = None, alternate_access_pipeline: Pipeline = None, ) -> None: total_records = 200 From 8979ee1433fa9957f9de776b53734633442c1102 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 13:44:46 +0400 Subject: [PATCH 20/71] extract logic into dedicated function --- dlt/common/libs/pyiceberg.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 7eb3b772ae..972fa75ca4 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -139,13 +139,17 @@ def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: return {} +def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + return _make_path(sorted(metadata_files)[-1], client) + + def _register_table( identifier: str, metadata_path: str, catalog: SqlCatalog, client: FilesystemClient, ) -> IcebergTable: - # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) - metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] - last_metadata_file = client.make_remote_url(sorted(metadata_files)[-1]) + last_metadata_file = _get_last_metadata_file(metadata_path, client) return catalog.register_table(identifier, last_metadata_file) From e956b09309a286281c82adfc41bbfd81de1e33fa Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 13:46:11 +0400 Subject: [PATCH 21/71] add iceberg read support to filesystem sql client --- dlt/common/libs/pyiceberg.py | 8 +++++++- .../impl/filesystem/sql_client.py | 19 ++++++++++++++++++- tests/load/filesystem/test_sql_client.py | 15 ++++++++++++--- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 972fa75ca4..9b3affc6be 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -90,7 +90,7 @@ def get_catalog( with catalog.create_table_transaction( table_id, schema=ensure_iceberg_compatible_arrow_schema(schema), - 
location=client.make_remote_url(table_path), + location=_make_path(table_path, client), ) as txn: # add partitioning with txn.update_spec() as update_spec: @@ -153,3 +153,9 @@ def _register_table( ) -> IcebergTable: last_metadata_file = _get_last_metadata_file(metadata_path, client) return catalog.register_table(identifier, last_metadata_file) + + +def _make_path(path: str, client: FilesystemClient) -> str: + # don't use file protocol for local files because duckdb does not support it + # https://github.com/duckdb/duckdb/issues/13669 + return path if client.is_local_filesystem else client.make_remote_url(path) diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index fec761ff36..48736e0e71 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -255,6 +255,13 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: from_statement = "" if schema_table.get("table_format") == "delta": from_statement = f"delta_scan('{resolved_folder}')" + elif schema_table.get("table_format") == "iceberg": + from dlt.common.libs.pyiceberg import _get_last_metadata_file + + self._setup_iceberg(self._conn) + metadata_path = f"{resolved_folder}/metadata" + last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) + from_statement = f"iceberg_scan('{last_metadata_file}')" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -264,7 +271,7 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: else: raise NotImplementedError( f"Unknown filetype {first_file_type} for table {table_name}. Currently only" - " jsonl and parquet files as well as delta tables are supported." + " jsonl and parquet files as well as delta and iceberg tables are supported." 
) # create table @@ -296,6 +303,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB with super().execute_query(query, *args, **kwargs) as cursor: yield cursor + @staticmethod + def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: + # needed to make persistent secrets work in new connection + # https://github.com/duckdb/duckdb_iceberg/issues/83 + conn.sql("FROM duckdb_secrets();").show() + + # `duckdb_iceberg` extension does not support autoloading + # https://github.com/duckdb/duckdb_iceberg/issues/71 + conn.execute("INSTALL iceberg; LOAD iceberg;") + def __del__(self) -> None: if self.memory_db: self.memory_db.close() diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index 66561521bc..b47842cdce 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -17,6 +17,7 @@ from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, + FILE_BUCKET, GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, @@ -145,6 +146,8 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux, see duckdb docs external_db.sql("SET azure_transport_option_type = 'curl';") external_db.sql(f"SET secret_directory = '{secret_directory}';") + if table_format == "iceberg": + FilesystemSqlClient._setup_iceberg(external_db) return external_db def _fs_sql_client_for_external_db( @@ -284,15 +287,21 @@ def test_read_interfaces_filesystem( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET], # NOTE: delta does not work on memory buckets ), ids=lambda x: x.name, ) -def test_delta_tables( +def test_table_formats( destination_config: DestinationTestConfiguration, secret_directory: str ) -> None: + if destination_config.table_format == "iceberg" and destination_config.bucket_url not in ( + FILE_BUCKET, + AWS_BUCKET, + ): + pytest.skip("only local and S3 filesystems are currently implemented for `iceberg`") + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" pipeline = destination_config.setup_pipeline( @@ -316,7 +325,7 @@ def test_delta_tables( pipeline, destination_config, secret_directory=secret_directory, - table_format="delta", + table_format=destination_config.table_format, alternate_access_pipeline=access_pipeline, ) From 571bf0c22d56bf3ad0466f4833e4fd2fb37dbb47 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 13:46:52 +0400 Subject: [PATCH 22/71] remove unused import --- tests/load/filesystem/test_sql_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index b47842cdce..654594b6c6 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -7,7 +7,6 @@ import dlt import os import shutil -import logging from dlt import Pipeline From 0ec5fcb37014ea99a4d8d4b9c0a0bf6163b6ca1c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 15:12:13 +0400 Subject: [PATCH 23/71] add todo --- tests/load/pipeline/test_filesystem_pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 5439419952..918089c5f6 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ 
b/tests/load/pipeline/test_filesystem_pipeline.py @@ -802,10 +802,11 @@ def evolving_table(data): assert actual.schema.equals(expected.schema) if write_disposition == "append": expected_num_rows = 3 - elif write_disposition == "replace" and destination_config.table_format == "delta": - expected_num_rows = 2 - elif write_disposition == "replace" and destination_config.table_format == "iceberg": + elif write_disposition == "replace": expected_num_rows = 0 + if destination_config.table_format == "delta": + # TODO: fix https://github.com/dlt-hub/dlt/issues/2092 and remove this if-clause + expected_num_rows = 2 elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: expected_num_rows = 2 assert actual.num_rows == expected_num_rows From ab0b9a0c070f07a188ee0b810d030c0757117397 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 15:35:18 +0400 Subject: [PATCH 24/71] extract logic into separate functions --- dlt/common/libs/pyiceberg.py | 56 ++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 9b3affc6be..a38761d9bc 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -5,6 +5,7 @@ from dlt.common.schema.typing import TWriteDisposition from dlt.common.utils import assert_min_pkg_version from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages.configuration import FileSystemCredentials from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.configuration.specs.mixins import WithPyicebergConfig from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -51,27 +52,22 @@ def write_iceberg_table( table.overwrite(ensure_iceberg_compatible_arrow_data(data)) -def get_catalog( +def get_sql_catalog(credentials: FileSystemCredentials) -> SqlCatalog: + return SqlCatalog( + "default", + uri="sqlite:///:memory:", + **_get_fileio_config(credentials), + ) + + +def create_or_evolve_table( + catalog: SqlCatalog, client: FilesystemClient, table_name: str, namespace_name: Optional[str] = None, schema: Optional[pa.Schema] = None, partition_columns: Optional[List[str]] = None, ) -> SqlCatalog: - """Returns single-table, ephemeral, in-memory Iceberg catalog.""" - - # create in-memory catalog - catalog = SqlCatalog( - "default", - uri="sqlite:///:memory:", - **_get_fileio_config(client.config.credentials), - ) - - # create namespace - if namespace_name is None: - namespace_name = client.dataset_name - catalog.create_namespace(namespace_name) - # add table to catalog table_id = f"{namespace_name}.{table_name}" table_path = f"{client.dataset_path}/{table_name}" @@ -100,6 +96,36 @@ def get_catalog( return catalog +def get_catalog( + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> SqlCatalog: + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + + # create in-memory catalog + catalog = get_sql_catalog(client.config.credentials) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) + + # add table to catalog + catalog = create_or_evolve_table( + catalog=catalog, + client=client, + table_name=table_name, + namespace_name=namespace_name, + schema=schema, + partition_columns=partition_columns, + ) + + return catalog + + def get_iceberg_tables( pipeline: 
Pipeline, *tables: str, schema_name: Optional[str] = None ) -> Dict[str, IcebergTable]: From e149ba62db52695ab41ed36b1739ab39c7808759 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 17:18:41 +0400 Subject: [PATCH 25/71] add azure support for iceberg table format --- .../configuration/specs/azure_credentials.py | 22 +++++++++++++++++-- dlt/common/libs/pyiceberg.py | 6 ++++- .../load/pipeline/test_filesystem_pipeline.py | 5 ++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index c2071e2188..09fbe57794 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,13 +8,14 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithPyicebergConfig from dlt import version _AZURE_STORAGE_EXTRA = f"{version.DLT_PKG_NAME}[az]" @configspec -class AzureCredentialsWithoutDefaults(CredentialsConfiguration): +class AzureCredentialsWithoutDefaults(CredentialsConfiguration, WithPyicebergConfig): """Credentials for Azure Blob Storage, compatible with adlfs""" azure_storage_account_name: str = None @@ -40,6 +41,13 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: creds.pop("account_key") return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.account-key": self.azure_storage_account_key, + "adlfs.sas-token": self.azure_storage_sas_token, + } + def create_sas_token(self) -> None: try: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -63,7 +71,9 @@ def on_partial(self) -> None: @configspec -class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration): +class AzureServicePrincipalCredentialsWithoutDefaults( + CredentialsConfiguration, WithPyicebergConfig +): azure_storage_account_name: str = None azure_tenant_id: str = None azure_client_id: str = None @@ -81,6 +91,14 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: # https://docs.rs/object_store/latest/object_store/azure return self.to_adlfs_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.tenant-id": self.azure_tenant_id, + "adlfs.client-id": self.azure_client_id, + "adlfs.client-secret": self.azure_client_secret, + } + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index a38761d9bc..8b08724b44 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -184,4 +184,8 @@ def _register_table( def _make_path(path: str, client: FilesystemClient) -> str: # don't use file protocol for local files because duckdb does not support it # https://github.com/duckdb/duckdb/issues/13669 - return path if client.is_local_filesystem else client.make_remote_url(path) + if not client.is_local_filesystem: + path = client.config.make_url(path) + # pyiceberg does not know `az://` scheme + path = path.replace("az://", "abfss://") + return path diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 918089c5f6..c7def8886f 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -34,6 +34,7 @@ FILE_BUCKET, AWS_BUCKET, 
AZ_BUCKET, + ABFS_BUCKET, SFTP_BUCKET, ) @@ -317,8 +318,10 @@ def test_table_format_core( if destination_config.table_format == "iceberg" and destination_config.bucket_url not in ( FILE_BUCKET, AWS_BUCKET, + AZ_BUCKET, + ABFS_BUCKET, ): - pytest.skip("only local and S3 filesystems are currently implemented for `iceberg`") + pytest.skip("only local, S3, and Azure filesystems are currently implemented for `iceberg`") if destination_config.table_format == "delta": from dlt.common.libs.deltalake import get_delta_tables From 27b8659b9b3c798d695b9247fdc24eade9d7e428 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 19:30:27 +0400 Subject: [PATCH 26/71] generalize delta table format tests --- .../load/pipeline/test_filesystem_pipeline.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index c7def8886f..d20f050a22 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -383,15 +383,16 @@ def data_types(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_does_not_contain_job_files( +def test_table_format_does_not_contain_job_files( destination_config: DestinationTestConfiguration, ) -> None: - """Asserts Parquet job files do not end up in Delta table.""" + """Asserts Parquet job files do not end up in table.""" pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -428,17 +429,18 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_multiple_files( +def test_table_format_multiple_files( destination_config: DestinationTestConfiguration, ) -> None: - """Tests loading multiple files into a Delta table. + """Tests loading multiple files into a table. - Files should be loaded into the Delta table in a single commit. + Files should be loaded into the table in a single commit. """ from dlt.common.libs.deltalake import get_delta_tables @@ -924,12 +926,13 @@ def a_table(data): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_mixed_source( +def test_table_format_mixed_source( destination_config: DestinationTestConfiguration, ) -> None: """Tests file format handling in mixed source. 
@@ -973,12 +976,13 @@ def s(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_dynamic_dispatch( +def test_table_format_dynamic_dispatch( destination_config: DestinationTestConfiguration, ) -> None: @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") From d39d58df2eb8cd0811fc48591f9d925ccdf3da87 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 24 Nov 2024 20:05:31 +0400 Subject: [PATCH 27/71] enable get tables function test for iceberg table format --- .../load/pipeline/test_filesystem_pipeline.py | 85 +++++++++++-------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index d20f050a22..1a0d05a06d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -1005,80 +1005,95 @@ def github_events(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_get_delta_tables_helper( +def test_table_format_get_tables_helper( destination_config: DestinationTestConfiguration, ) -> None: - """Tests `get_delta_tables` helper function.""" - from dlt.common.libs.deltalake import DeltaTable, get_delta_tables + """Tests `get_delta_tables` / `get_iceberg_tables` helper functions.""" + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import DeltaTable, get_delta_tables - @dlt.resource(table_format="delta") - def foo_delta(): + get_tables = get_delta_tables + get_num_rows = lambda table: table.to_pyarrow_table().num_rows + elif destination_config.table_format == "iceberg": + from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables + + get_tables = get_iceberg_tables # type: ignore[assignment] + get_num_rows = lambda table: table.scan().to_arrow().num_rows + + @dlt.resource(table_format=destination_config.table_format) + def foo_table_format(): yield [{"foo": 1}, {"foo": 2}] - @dlt.resource(table_format="delta") - def bar_delta(): + @dlt.resource(table_format=destination_config.table_format) + def bar_table_format(): yield [{"bar": 1}] @dlt.resource - def baz_not_delta(): + def baz_not_table_format(): yield [{"baz": 1}] pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - info = pipeline.run(foo_delta()) + info = pipeline.run(foo_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta"} - assert isinstance(delta_tables["foo_delta"], DeltaTable) - assert delta_tables["foo_delta"].to_pyarrow_table().num_rows == 2 + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format"} + if destination_config.table_format == "delta": + assert isinstance(tables["foo_table_format"], DeltaTable) + elif destination_config.table_format == "iceberg": + assert isinstance(tables["foo_table_format"], IcebergTable) + assert get_num_rows(tables["foo_table_format"]) == 2 - info = pipeline.run([foo_delta(), bar_delta(), baz_not_delta()]) + info = pipeline.run([foo_table_format(), bar_table_format(), baz_not_table_format()]) assert_load_info(info) - delta_tables = 
get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta", "bar_delta"} - assert delta_tables["bar_delta"].to_pyarrow_table().num_rows == 1 - assert get_delta_tables(pipeline, "foo_delta").keys() == {"foo_delta"} - assert get_delta_tables(pipeline, "bar_delta").keys() == {"bar_delta"} - assert get_delta_tables(pipeline, "foo_delta", "bar_delta").keys() == {"foo_delta", "bar_delta"} + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format", "bar_table_format"} + assert get_num_rows(tables["bar_table_format"]) == 1 + assert get_tables(pipeline, "foo_table_format").keys() == {"foo_table_format"} + assert get_tables(pipeline, "bar_table_format").keys() == {"bar_table_format"} + assert get_tables(pipeline, "foo_table_format", "bar_table_format").keys() == { + "foo_table_format", + "bar_table_format", + } # test with child table - @dlt.resource(table_format="delta") - def parent_delta(): + @dlt.resource(table_format=destination_config.table_format) + def parent_table_format(): yield [{"foo": 1, "child": [1, 2, 3]}] - info = pipeline.run(parent_delta()) + info = pipeline.run(parent_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert "parent_delta__child" in delta_tables.keys() - assert delta_tables["parent_delta__child"].to_pyarrow_table().num_rows == 3 + tables = get_tables(pipeline) + assert "parent_table_format__child" in tables.keys() + assert get_num_rows(tables["parent_table_format__child"]) == 3 # test invalid input with pytest.raises(ValueError): - get_delta_tables(pipeline, "baz_not_delta") + get_tables(pipeline, "baz_not_table_format") with pytest.raises(ValueError): - get_delta_tables(pipeline, "non_existing_table") + get_tables(pipeline, "non_existing_table") # test unknown schema with pytest.raises(FileNotFoundError): - get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + get_tables(pipeline, "non_existing_table", schema_name="aux_2") # load to a new schema and under new name aux_schema = dlt.Schema("aux_2") # NOTE: you cannot have a file with name - info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + info = pipeline.run(parent_table_format().with_name("aux_table"), schema=aux_schema) # also state in seprate package assert_load_info(info, expected_load_packages=2) - delta_tables = get_delta_tables(pipeline, schema_name="aux_2") - assert "aux_delta__child" in delta_tables.keys() - get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + tables = get_tables(pipeline, schema_name="aux_2") + assert "aux_table__child" in tables.keys() + get_tables(pipeline, "aux_table", schema_name="aux_2") with pytest.raises(ValueError): - get_delta_tables(pipeline, "aux_delta") + get_tables(pipeline, "aux_table") @pytest.mark.parametrize( From 547a37a4a743812abb4b7e9cc4e849376d54de56 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 11:29:17 +0400 Subject: [PATCH 28/71] remove ignores --- dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index aa20aff760..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", 
encoding="utf-8", newline="") # type: ignore[unused-ignore] + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 2036a668af..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore[unused-ignore] + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation From 53b2d56cf1ff83e4b1933aeb8f51abb7e298d3bc Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 12:22:09 +0400 Subject: [PATCH 29/71] undo table directory management change --- dlt/destinations/impl/filesystem/filesystem.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 22c03db2d7..757f25bcd7 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -406,13 +406,9 @@ def update_stored_schema( ) -> TSchemaTables: applied_update = super().update_stored_schema(only_tables, expected_update) # create destination dirs for all tables - tables = self.schema.tables - table_names = only_tables or tables.keys() + table_names = only_tables or self.schema.tables.keys() dirs_to_create = self.get_table_dirs(table_names) for tables_name, directory in zip(table_names, dirs_to_create): - if tables[tables_name].get("table_format") in ("delta", "iceberg"): - # let table format libs manage table directory - continue self.fs_client.makedirs(directory, exist_ok=True) # we need to mark the folders of the data tables as initialized if tables_name in self.schema.dlt_table_names(): From 3ff9fb779533021ae9da2c2628d41bbd1761f4a1 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 15:07:51 +0400 Subject: [PATCH 30/71] enable test_read_interfaces tests for iceberg --- tests/load/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/load/utils.py b/tests/load/utils.py index 3de8dcd1fe..e368cfcacd 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -627,6 +627,7 @@ def destinations_configs( extra_info=bucket, table_format="iceberg", supports_merge=False, + file_format="parquet", ) ] From 5b0cd17fae8bc84091e532a06c2ca6f1ca70c408 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 15:31:45 +0400 Subject: [PATCH 31/71] fix active table format filter --- tests/load/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index e368cfcacd..27a7e7f43d 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -636,9 +636,11 @@ def destinations_configs( conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] - # filter out non active destinations + # filter out non active table formats destination_configs = [ - conf for conf in destination_configs if conf.table_format in ACTIVE_TABLE_FORMATS + conf + for conf in destination_configs + if conf.table_format is None or 
conf.table_format in ACTIVE_TABLE_FORMATS ] # filter out destinations not in subset From 8798d7942796561ab8b3947c4029523646c1f3f6 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 18:01:26 +0400 Subject: [PATCH 32/71] use mixin for object store rs credentials --- dlt/common/configuration/specs/aws_credentials.py | 6 ++++-- dlt/common/configuration/specs/azure_credentials.py | 4 ++-- dlt/common/configuration/specs/gcp_credentials.py | 3 ++- dlt/common/configuration/specs/mixins.py | 12 ++++++++++++ dlt/common/libs/deltalake.py | 6 +++--- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 3a145b9a76..a75cd85225 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,7 +8,7 @@ CredentialsWithDefault, configspec, ) -from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -17,7 +17,9 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration, WithPyicebergConfig): +class AwsCredentialsWithoutDefaults( + CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig +): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 7d21341411..aabd0b471a 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,7 +8,7 @@ CredentialsWithDefault, configspec, ) -from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt import version from dlt.common.utils import without_none @@ -16,7 +16,7 @@ @configspec -class AzureCredentialsBase(CredentialsConfiguration): +class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials): azure_storage_account_name: str = None azure_account_host: Optional[str] = None """Alternative host when accessing blob storage endpoint ie. 
my_account.dfs.core.windows.net""" diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 60ab1d4b56..acd8a2b2ca 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -12,6 +12,7 @@ NativeValueError, OAuth2ScopesRequired, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -23,7 +24,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py index 7a3c1b66e8..2f843aee5b 100644 --- a/dlt/common/configuration/specs/mixins.py +++ b/dlt/common/configuration/specs/mixins.py @@ -2,6 +2,18 @@ from abc import abstractmethod, ABC +class WithObjectStoreRsCredentials(ABC): + @abstractmethod + def to_object_store_rs_credentials(self) -> Dict[str, Any]: + """Returns credentials dictionary for object_store Rust crate. + + Can be used for libraries that build on top of the object_store crate, such as `deltalake`. + + https://docs.rs/object_store/latest/object_store/ + """ + pass + + class WithPyicebergConfig(ABC): @abstractmethod def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index 4047bc3a1a..0f938e7102 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -10,6 +10,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: @@ -191,10 +192,9 @@ def get_delta_tables( def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: """Returns dict that can be passed as `storage_options` in `deltalake` library.""" - creds = {} # type: ignore + creds = {} extra_options = {} - # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery - if hasattr(config.credentials, "to_object_store_rs_credentials"): + if isinstance(config.credentials, WithObjectStoreRsCredentials): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options From c1cc0680b4df98398cc2d58df5828ccc3d9b382c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 25 Nov 2024 19:12:47 +0400 Subject: [PATCH 33/71] generalize catalog typing --- dlt/common/libs/pyiceberg.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 8b08724b44..d5e0664d48 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -10,15 +10,10 @@ from dlt.common.configuration.specs.mixins import WithPyicebergConfig from dlt.destinations.impl.filesystem.filesystem import FilesystemClient -assert_min_pkg_version( - pkg_name="sqlalchemy", - version="2.0.18", - 
msg="`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination.", -) try: from pyiceberg.table import Table as IcebergTable - from pyiceberg.catalog.sql import SqlCatalog + from pyiceberg.catalog import MetastoreCatalog import pyarrow as pa except ModuleNotFoundError: raise MissingDependencyException( @@ -52,7 +47,17 @@ def write_iceberg_table( table.overwrite(ensure_iceberg_compatible_arrow_data(data)) -def get_sql_catalog(credentials: FileSystemCredentials) -> SqlCatalog: +def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821 + assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg=( + "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination." + ), + ) + + from pyiceberg.catalog.sql import SqlCatalog + return SqlCatalog( "default", uri="sqlite:///:memory:", @@ -61,13 +66,13 @@ def get_sql_catalog(credentials: FileSystemCredentials) -> SqlCatalog: def create_or_evolve_table( - catalog: SqlCatalog, + catalog: MetastoreCatalog, client: FilesystemClient, table_name: str, namespace_name: Optional[str] = None, schema: Optional[pa.Schema] = None, partition_columns: Optional[List[str]] = None, -) -> SqlCatalog: +) -> MetastoreCatalog: # add table to catalog table_id = f"{namespace_name}.{table_name}" table_path = f"{client.dataset_path}/{table_name}" @@ -102,11 +107,11 @@ def get_catalog( namespace_name: Optional[str] = None, schema: Optional[pa.Schema] = None, partition_columns: Optional[List[str]] = None, -) -> SqlCatalog: +) -> MetastoreCatalog: """Returns single-table, ephemeral, in-memory Iceberg catalog.""" # create in-memory catalog - catalog = get_sql_catalog(client.config.credentials) + catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials) # create namespace if namespace_name is None: @@ -174,7 +179,7 @@ def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str def _register_table( identifier: str, metadata_path: str, - catalog: SqlCatalog, + catalog: MetastoreCatalog, client: FilesystemClient, ) -> IcebergTable: last_metadata_file = _get_last_metadata_file(metadata_path, client) From 35f590b96ae37655ef91a1c91691b5510db35668 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 26 Nov 2024 12:08:36 +0400 Subject: [PATCH 34/71] extract pyiceberg scheme mapping into separate function --- dlt/common/libs/pyiceberg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index d5e0664d48..f0cf33b236 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -191,6 +191,9 @@ def _make_path(path: str, client: FilesystemClient) -> str: # https://github.com/duckdb/duckdb/issues/13669 if not client.is_local_filesystem: path = client.config.make_url(path) + return _map_scheme(path) + + +def _map_scheme(path: str) -> str: # pyiceberg does not know `az://` scheme - path = path.replace("az://", "abfss://") - return path + return path.replace("az://", "abfss://") From e0d6a1b248e02967e0de863e9d158db5008e0b9b Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 26 Nov 2024 12:12:12 +0400 Subject: [PATCH 35/71] generalize credentials mixin test setup --- ...dentials.py => test_credentials_mixins.py} | 147 ++++++++++++------ 1 file changed, 99 insertions(+), 48 deletions(-) rename tests/load/filesystem/{test_object_store_rs_credentials.py => test_credentials_mixins.py} (55%) diff --git 
a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_credentials_mixins.py similarity index 55% rename from tests/load/filesystem/test_object_store_rs_credentials.py rename to tests/load/filesystem/test_credentials_mixins.py index f23187a269..b9f65d72c4 100644 --- a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -1,12 +1,8 @@ -"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" - -from typing import Any, Dict +from typing import Any, Dict, Union, Type, get_args, cast import os import json # noqa: I251 import pytest -from deltalake import DeltaTable -from deltalake.exceptions import TableNotFoundError import dlt from dlt.common.configuration import resolve_configuration @@ -24,6 +20,7 @@ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( AZ_BUCKET, @@ -34,6 +31,9 @@ ) +TCredentialsMixin = Union[WithObjectStoreRsCredentials, WithPyicebergConfig] +ALL_CREDENTIALS_MIXINS = get_args(TCredentialsMixin) + pytestmark = pytest.mark.essential if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")): @@ -53,11 +53,27 @@ def fs_creds() -> Dict[str, Any]: return creds -def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: - """Returns True if client can connect to object store, False otherwise. +def can_connect(bucket_url: str, credentials: TCredentialsMixin, mixin: Type[TCredentialsMixin]) -> bool: # type: ignore[return] + """Returns True if client can connect to object store, False otherwise.""" + if mixin == WithObjectStoreRsCredentials: + credentials = cast(WithObjectStoreRsCredentials, credentials) + return can_connect_object_store_rs_credentials( + bucket_url, credentials.to_object_store_rs_credentials() + ) + elif mixin == WithPyicebergConfig: + credentials = cast(WithPyicebergConfig, credentials) + return can_connect_pyiceberg_fileio_config( + bucket_url, credentials.to_pyiceberg_fileio_config() + ) + + +def can_connect_object_store_rs_credentials( + bucket_url: str, object_store_rs_credentials: Dict[str, str] +) -> bool: + # uses `deltatable` library as Python interface to `object_store` Rust crate + from deltalake import DeltaTable + from deltalake.exceptions import TableNotFoundError - Uses `deltatable` library as Python interface to `object_store` Rust crate. 
- """ try: DeltaTable( bucket_url, @@ -70,16 +86,37 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> return False +def can_connect_pyiceberg_fileio_config( + bucket_url: str, pyiceberg_fileio_config: Dict[str, str] +) -> bool: + from pyiceberg.table import StaticTable + from dlt.common.libs.pyiceberg import _map_scheme + + try: + StaticTable.from_metadata( + f"{_map_scheme(bucket_url)}/non_existing_metadata_file.json", + properties=pyiceberg_fileio_config, + ) + except FileNotFoundError: + # this error implies the connection was successful + # there is no Iceberg metadata file at the specified path + return True + return False + + @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] ) -def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_azure_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(AZ_BUCKET, creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -87,18 +124,21 @@ def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any] azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(AZ_BUCKET, creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(AZ_BUCKET, creds, mixin) @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")] ) -def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_aws_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AwsCredentialsWithoutDefaults if driver == "r2": @@ -112,9 +152,11 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) + if mixin == WithObjectStoreRsCredentials: + assert ( + "aws_session_token" not in creds.to_object_store_rs_credentials() + ) # no auto-generated token + assert can_connect(AWS_BUCKET, creds, mixin) # AwsCredentials: no user-provided session token creds = AwsCredentials( @@ -124,28 +166,29 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) - - # exception should be raised if both `endpoint_url` and `region_name` are - # not provided - with 
pytest.raises(ObjectStoreRsCredentialsException): - AwsCredentials( - aws_access_key_id=fs_creds["aws_access_key_id"], - aws_secret_access_key=fs_creds["aws_secret_access_key"], - ).to_object_store_rs_credentials() - - if "endpoint_url" in object_store_rs_creds: - # TODO: make sure this case is tested on GitHub CI, e.g. by adding - # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): - assert object_store_rs_creds["aws_allow_http"] == "true" - - # remainder of tests use session tokens - # we don't run them on S3 compatible storage because session tokens - # may not be available - return + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + + # exception should be raised if both `endpoint_url` and `region_name` are + # not provided + with pytest.raises(ObjectStoreRsCredentialsException): + AwsCredentials( + aws_access_key_id=fs_creds["aws_access_key_id"], + aws_secret_access_key=fs_creds["aws_secret_access_key"], + ).to_object_store_rs_credentials() + + if "endpoint_url" in object_store_rs_creds: + # TODO: make sure this case is tested on GitHub CI, e.g. by adding + # a local MinIO bucket to the set of tested buckets + if object_store_rs_creds["endpoint_url"].startswith("http://"): + assert object_store_rs_creds["aws_allow_http"] == "true" + + # remainder of tests use session tokens + # we don't run them on S3 compatible storage because session tokens + # may not be available + return # AwsCredentials: user-provided session token # use previous credentials to create session token for new credentials @@ -158,9 +201,10 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # AwsCredentialsWithoutDefaults: user-provided session token creds = AwsCredentialsWithoutDefaults( @@ -170,15 +214,22 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")] ) -def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_gcp_credentials_mixins( + driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: + if mixin == WithPyicebergConfig: + pytest.skip("`WithPyicebergConfig` mixin currently not implemented for GCP.") + creds: GcpCredentials 
# GcpServiceAccountCredentialsWithoutDefaults @@ -189,7 +240,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -206,7 +257,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) - can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(GCS_BUCKET, creds, mixin) # GcpOAuthCredentialsWithoutDefaults is currently not supported with pytest.raises(NotImplementedError): From 85a10a2421ab4c88b7ec05bf6d3879df79ac3a82 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 26 Nov 2024 13:43:35 +0400 Subject: [PATCH 36/71] remove unused import --- dlt/common/libs/pyiceberg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index f0cf33b236..303c20cf7e 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -1,7 +1,7 @@ from typing import Dict, Any, List, Optional from dlt import version, Pipeline -from dlt.common.libs.pyarrow import cast_arrow_schema_types, columns_to_arrow +from dlt.common.libs.pyarrow import cast_arrow_schema_types from dlt.common.schema.typing import TWriteDisposition from dlt.common.utils import assert_min_pkg_version from dlt.common.exceptions import MissingDependencyException From 54cd0bcebffad15d522e734da321c602f4bd7461 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 26 Nov 2024 15:56:35 +0400 Subject: [PATCH 37/71] add centralized fallback to append when merge is not supported --- dlt/common/destination/reference.py | 25 ++++++++++++++++++++++++- dlt/common/destination/utils.py | 11 ----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index d1024eb28c..ee05757530 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -56,6 +56,7 @@ DestinationSchemaTampered, DestinationTransientException, ) +from dlt.common.destination.utils import resolve_merge_strategy from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName @@ -675,7 +676,29 @@ def update_stored_schema( def prepare_load_table(self, table_name: str) -> PreparedTableSchema: """Prepares a table schema to be loaded by filling missing hints and doing other modifications requires by given destination.""" try: - return fill_hints_from_parent_and_clone_table(self.schema.tables, self.schema.tables[table_name]) # type: ignore[return-value] + table_schema = fill_hints_from_parent_and_clone_table( + self.schema.tables, self.schema.tables[table_name] + ) + if ( + table_schema["write_disposition"] == "merge" + and resolve_merge_strategy( + self.schema.tables, self.schema.tables[table_name], self.capabilities + ) + is None + ): + table_format_info = "" + if self.capabilities.supported_table_formats: + table_format_info = ( + " or try different table format which may offer `merge`:" + f" {self.capabilities.supported_table_formats}" + ) + logger.warning( + "Destination does not support any merge strategies and `merge` write" + f" 
disposition for table `{table_name}` cannot be met and will fall back to" + f" `append`. Change write disposition{table_format_info}." + ) + table_schema["write_disposition"] = "append" + return table_schema # type: ignore[return-value] except KeyError: raise UnknownTableException(self.schema.name, table_name) diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index c98344b687..bb128cc4fd 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -214,17 +214,6 @@ def resolve_merge_strategy( supported_strategies, table_schema=table ) if not supported_strategies: - table_format_info = "" - if destination_capabilities.supported_table_formats: - table_format_info = ( - " or try different table format which may offer `merge`:" - f" {destination_capabilities.supported_table_formats}" - ) - logger.warning( - "Destination does not support any merge strategies and `merge` write disposition " - f" for table `{table_name}` cannot be met and will fall back to `append`. Change" - f" write disposition{table_format_info}." - ) return None merge_strategy = get_merge_strategy(tables, table_name) # use first merge strategy as default From 4e979f01665bd3a421ae7df3204f1d76b74a257e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 27 Nov 2024 10:03:33 +0400 Subject: [PATCH 38/71] Revert "add centralized fallback to append when merge is not supported" This reverts commit 54cd0bcebffad15d522e734da321c602f4bd7461. --- dlt/common/destination/reference.py | 25 +------------------------ dlt/common/destination/utils.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index ee05757530..d1024eb28c 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -56,7 +56,6 @@ DestinationSchemaTampered, DestinationTransientException, ) -from dlt.common.destination.utils import resolve_merge_strategy from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName @@ -676,29 +675,7 @@ def update_stored_schema( def prepare_load_table(self, table_name: str) -> PreparedTableSchema: """Prepares a table schema to be loaded by filling missing hints and doing other modifications requires by given destination.""" try: - table_schema = fill_hints_from_parent_and_clone_table( - self.schema.tables, self.schema.tables[table_name] - ) - if ( - table_schema["write_disposition"] == "merge" - and resolve_merge_strategy( - self.schema.tables, self.schema.tables[table_name], self.capabilities - ) - is None - ): - table_format_info = "" - if self.capabilities.supported_table_formats: - table_format_info = ( - " or try different table format which may offer `merge`:" - f" {self.capabilities.supported_table_formats}" - ) - logger.warning( - "Destination does not support any merge strategies and `merge` write" - f" disposition for table `{table_name}` cannot be met and will fall back to" - f" `append`. Change write disposition{table_format_info}." 
- ) - table_schema["write_disposition"] = "append" - return table_schema # type: ignore[return-value] + return fill_hints_from_parent_and_clone_table(self.schema.tables, self.schema.tables[table_name]) # type: ignore[return-value] except KeyError: raise UnknownTableException(self.schema.name, table_name) diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index bb128cc4fd..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -214,6 +214,17 @@ def resolve_merge_strategy( supported_strategies, table_schema=table ) if not supported_strategies: + table_format_info = "" + if destination_capabilities.supported_table_formats: + table_format_info = ( + " or try different table format which may offer `merge`:" + f" {destination_capabilities.supported_table_formats}" + ) + logger.warning( + "Destination does not support any merge strategies and `merge` write disposition " + f" for table `{table_name}` cannot be met and will fall back to `append`. Change" + f" write disposition{table_format_info}." + ) return None merge_strategy = get_merge_strategy(tables, table_name) # use first merge strategy as default From 54f135345abe73bc0344cb8f8acd3c81f2ddd2ce Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 27 Nov 2024 11:19:10 +0400 Subject: [PATCH 39/71] fall back to append if merge is not supported on filesystem --- dlt/destinations/impl/filesystem/filesystem.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 757f25bcd7..4db72be3b0 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -160,9 +160,6 @@ def run(self) -> None: else: with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader if self._load_table["write_disposition"] == "merge" and delta_table is not None: - self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key] - self._schema.tables, self._load_table, self._job_client.capabilities - ) merge_delta_table( table=delta_table, data=arrow_rbr, @@ -428,6 +425,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema: if table["write_disposition"] == "merge": table["write_disposition"] = "append" table.pop("table_format", None) + merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities) + if table["write_disposition"] == "merge": + if merge_strategy is None: + # no supported merge strategies, fall back to append + table["write_disposition"] = "append" + else: + table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key] return table def get_table_dir(self, table_name: str, remote: bool = False) -> str: From 28d0fd23557579d33054b8c3fa603898df8f87bf Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 27 Nov 2024 12:06:58 +0400 Subject: [PATCH 40/71] fix test for s3-compatible storage --- tests/load/filesystem/test_credentials_mixins.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/load/filesystem/test_credentials_mixins.py b/tests/load/filesystem/test_credentials_mixins.py index b9f65d72c4..1299614304 100644 --- a/tests/load/filesystem/test_credentials_mixins.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -179,16 +179,18 @@ def test_aws_credentials_mixins( aws_secret_access_key=fs_creds["aws_secret_access_key"], ).to_object_store_rs_credentials() - 
if "endpoint_url" in object_store_rs_creds: + if "endpoint_url" in object_store_rs_creds and object_store_rs_creds[ + "endpoint_url" + ].startswith("http://"): # TODO: make sure this case is tested on GitHub CI, e.g. by adding # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): - assert object_store_rs_creds["aws_allow_http"] == "true" + assert object_store_rs_creds["aws_allow_http"] == "true" - # remainder of tests use session tokens - # we don't run them on S3 compatible storage because session tokens - # may not be available - return + if creds.endpoint_url is not None: + # remainder of tests use session tokens + # we don't run them on S3 compatible storage because session tokens + # may not be available + return # AwsCredentials: user-provided session token # use previous credentials to create session token for new credentials From 90b17296852748e14fbec0ee621b21cf13daad6d Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 27 Nov 2024 13:07:52 +0400 Subject: [PATCH 41/71] remove obsolete code path --- dlt/destinations/impl/filesystem/filesystem.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 4db72be3b0..db46f09caf 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -245,10 +245,7 @@ def table_identifier(self) -> str: class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) - if self._load_table.get("table_format") == "delta": - # delta table jobs only require table chain followup jobs - pass - elif final_state == "completed": + if final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], From d0f7c88bb28a8afe226a5615ba791c8aa3410eef Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Wed, 27 Nov 2024 18:54:49 +0400 Subject: [PATCH 42/71] exclude gcs read interface tests for iceberg --- tests/load/test_read_interfaces.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index f5a8d51baf..4de21edc97 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -53,6 +53,9 @@ def populated_pipeline(request) -> Any: """fixture that returns a pipeline object populated with the example data""" destination_config = cast(DestinationTestConfiguration, request.param) + if destination_config.table_format == "iceberg" and destination_config.bucket_url == GCS_BUCKET: + pytest.skip("We currently don't support writing `iceberg` tables on GCS.") + if ( destination_config.file_format not in ["parquet", "jsonl"] and destination_config.destination_type == "filesystem" From 050bea73a502609b0cb72866edf1363537667e0f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 28 Nov 2024 19:33:34 +0400 Subject: [PATCH 43/71] add gcs support for iceberg table format --- dlt/common/configuration/specs/exceptions.py | 4 +++ .../configuration/specs/gcp_credentials.py | 25 ++++++++++++-- .../filesystem/test_credentials_mixins.py | 34 +++++++++++++------ .../load/pipeline/test_filesystem_pipeline.py | 9 ----- tests/load/test_read_interfaces.py | 3 -- tests/load/utils.py | 17 
++++++++-- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 928e46a8a0..fe87ef24d7 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any): class ObjectStoreRsCredentialsException(ConfigurationException): pass + + +class UnsupportedAuthenticationMethodException(ConfigurationException): + pass diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index acd8a2b2ca..177ecdae35 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -11,8 +11,9 @@ InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired, + UnsupportedAuthenticationMethodException, ) -from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -24,7 +25,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) @@ -127,6 +128,12 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Service Account authentication not supported with `iceberg` table format. Use OAuth" + " authentication instead." + ) + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -182,6 +189,14 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: " https://docs.rs/object_store/latest/object_store/gcp." ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + self.auth() + return { + "gcs.project-id": self.project_id, + "gcs.oauth2.token": self.token, + "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000, + } + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) @@ -314,6 +329,12 @@ def to_native_credentials(self) -> Any: else: return super().to_native_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Application Default Credentials authentication not supported with `iceberg` table" + " format. Use OAuth authentication instead." 
+ ) + @configspec class GcpServiceAccountCredentials( diff --git a/tests/load/filesystem/test_credentials_mixins.py b/tests/load/filesystem/test_credentials_mixins.py index 1299614304..323b31f089 100644 --- a/tests/load/filesystem/test_credentials_mixins.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -19,7 +19,10 @@ from dlt.common.utils import custom_environ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials -from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.exceptions import ( + ObjectStoreRsCredentialsException, + UnsupportedAuthenticationMethodException, +) from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( @@ -229,9 +232,6 @@ def test_aws_credentials_mixins( def test_gcp_credentials_mixins( driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] ) -> None: - if mixin == WithPyicebergConfig: - pytest.skip("`WithPyicebergConfig` mixin currently not implemented for GCP.") - creds: GcpCredentials # GcpServiceAccountCredentialsWithoutDefaults @@ -242,7 +242,11 @@ def test_gcp_credentials_mixins( private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert can_connect(GCS_BUCKET, creds, mixin) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -250,7 +254,7 @@ def test_gcp_credentials_mixins( GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0 # write service account key to JSON file - service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"]) + service_json = json.loads(creds.to_native_representation()) path = "_secrets/service.json" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w", encoding="utf-8") as f: @@ -259,8 +263,18 @@ def test_gcp_credentials_mixins( with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) + + # GcpOAuthCredentialsWithoutDefaults + creds = resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth") + ) + if mixin == WithPyicebergConfig: assert can_connect(GCS_BUCKET, creds, mixin) - - # GcpOAuthCredentialsWithoutDefaults is currently not supported - with pytest.raises(NotImplementedError): - GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() + elif mixin == WithObjectStoreRsCredentials: + with pytest.raises(NotImplementedError): + assert can_connect(GCS_BUCKET, creds, mixin) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 1a0d05a06d..05c8541e73 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -32,9 +32,7 @@ DestinationTestConfiguration, MEMORY_BUCKET, FILE_BUCKET, - AWS_BUCKET, AZ_BUCKET, - ABFS_BUCKET, SFTP_BUCKET, ) @@ -315,13 +313,6 @@ def test_table_format_core( Tests all data types, all filesystems. 
Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - if destination_config.table_format == "iceberg" and destination_config.bucket_url not in ( - FILE_BUCKET, - AWS_BUCKET, - AZ_BUCKET, - ABFS_BUCKET, - ): - pytest.skip("only local, S3, and Azure filesystems are currently implemented for `iceberg`") if destination_config.table_format == "delta": from dlt.common.libs.deltalake import get_delta_tables diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index 4de21edc97..f5a8d51baf 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -53,9 +53,6 @@ def populated_pipeline(request) -> Any: """fixture that returns a pipeline object populated with the example data""" destination_config = cast(DestinationTestConfiguration, request.param) - if destination_config.table_format == "iceberg" and destination_config.bucket_url == GCS_BUCKET: - pytest.skip("We currently don't support writing `iceberg` tables on GCS.") - if ( destination_config.file_format not in ["parquet", "jsonl"] and destination_config.destination_type == "filesystem" diff --git a/tests/load/utils.py b/tests/load/utils.py index 27a7e7f43d..5bf5c51518 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,10 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -172,7 +175,9 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: dest_type = kwargs.pop("destination", self.destination_type) dest_name = kwargs.pop("destination_name", self.destination_name) self.setup() - return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + return Destination.from_reference( + dest_type, self.credentials, destination_name=dest_name, **kwargs + ) def raw_capabilities(self) -> DestinationCapabilitiesContext: dest = Destination.from_reference(self.destination_type) @@ -628,6 +633,14 @@ def destinations_configs( table_format="iceberg", supports_merge=False, file_format="parquet", + credentials=( + resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), + sections=("destination", "fsgcpoauth"), + ) + if bucket == GCS_BUCKET + else None + ), ) ] From ff48ca96d58eb6085af0cb65b390558c3501c57c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 28 Nov 2024 19:34:14 +0400 Subject: [PATCH 44/71] switch to UnsupportedAuthenticationMethodException --- dlt/common/configuration/specs/gcp_credentials.py | 6 +++--- tests/load/filesystem/test_credentials_mixins.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 177ecdae35..21ae2587ed 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -184,9 +184,9 @@ def to_native_representation(self) -> str: return json.dumps(self._info_dict()) def to_object_store_rs_credentials(self) -> Dict[str, str]: - raise NotImplementedError( - "`object_store` Rust crate does not support OAuth for GCP credentials. 
Reference:" - " https://docs.rs/object_store/latest/object_store/gcp." + raise UnsupportedAuthenticationMethodException( + "OAuth authentication not supported with `delta` table format. Use Service Account or" + " Application Default Credentials authentication instead." ) def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: diff --git a/tests/load/filesystem/test_credentials_mixins.py b/tests/load/filesystem/test_credentials_mixins.py index 323b31f089..6ce80aa8ee 100644 --- a/tests/load/filesystem/test_credentials_mixins.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -276,5 +276,5 @@ def test_gcp_credentials_mixins( if mixin == WithPyicebergConfig: assert can_connect(GCS_BUCKET, creds, mixin) elif mixin == WithObjectStoreRsCredentials: - with pytest.raises(NotImplementedError): + with pytest.raises(UnsupportedAuthenticationMethodException): assert can_connect(GCS_BUCKET, creds, mixin) From 01e8d26d5d53bda538a4dbaca5bbabe500322d95 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 28 Nov 2024 19:38:11 +0400 Subject: [PATCH 45/71] add iceberg table format docs --- .../dlt-ecosystem/destinations/filesystem.md | 136 +++++++++++------- .../dlt-ecosystem/table-formats/iceberg.md | 2 +- 2 files changed, 88 insertions(+), 50 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 0a41bb4c7b..bbc795d637 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -404,29 +404,6 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### Merge with Delta table format (experimental) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format). - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is experimental. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... -``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - ## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. 
@@ -647,8 +624,9 @@ You can choose the following file formats: You can choose the following table formats: * [Delta table](../table-formats/delta.md) is supported +* [Iceberg](../table-formats/iceberg.md) is supported (**experimental**) -### Delta table format +### Delta table format dependencies You need the `deltalake` package to use this format: @@ -662,7 +640,23 @@ You also need `pyarrow>=17.0.0`: pip install 'pyarrow>=17.0.0' ``` -Set the `table_format` argument to `delta` when defining your resource: +### Iceberg table format dependencies + +You need the `pyiceberg` package to use this format: + +```sh +pip install "dlt[pyiceberg]" +``` + +You also need `sqlalchemy>=2.0.18`: + +```sh +pip install 'sqlalchemy>=2.0.18' +``` + +### Set table format + +Set the `table_format` argument to `delta` or `iceberg` when defining your resource: ```py @dlt.resource(table_format="delta") @@ -670,16 +664,19 @@ def my_delta_resource(): ... ``` +or when calling `run` on your pipeline: + +```py +pipeline.run(my_resource, table_format="delta") +``` + :::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. +`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. ::: -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: -#### Delta table partitioning -A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column: +#### Table format partitioning +Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: ```py @dlt.resource( @@ -690,30 +687,20 @@ def my_delta_resource(): ... ``` -:::caution -It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. +:::note +Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partioning](https://iceberg.apache.org/docs/latest/partitioning/). ::: +:::caution +Partition evolution (changing partition columns after a table has been created) is not supported. +::: -#### Storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. 
- ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### `get_delta_tables` helper -You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables: +#### Table access helper functions +You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to acccess native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects, for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects. ```py from dlt.common.libs.deltalake import get_delta_tables +# from dlt.common.libs.pyiceberg import get_iceberg_tables ... @@ -725,9 +712,60 @@ delta_tables["my_delta_table"].optimize.compact() delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) # delta_tables["my_delta_table"].vacuum() # etc. +``` + +#### Table format Google Cloud Storage authentication + +Note that not all authentication methods are supported when using table formats on Google Cloud Storage: + +| Authentication method | `delta` | `iceberg` | +| -- | -- | -- | +| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | +| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | +| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | + +#### Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +#### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. + +#### Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' ``` +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. 
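The same storage options can also be supplied outside the TOML files; the sketch below assumes dlt's standard `SECTION__KEY` environment-variable naming and mirrors the TOML example above:

```py
import os

# equivalent to the deltalake_storage_options TOML entry above;
# must be set before the pipeline runs
os.environ["DESTINATION__FILESYSTEM__DELTALAKE_STORAGE_OPTIONS"] = (
    '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}'
)
```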
+ +#### Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: + ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index 233ae0ce21..edca521e52 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -10,5 +10,5 @@ keywords: [iceberg, table formats] ## Supported destinations -Supported by: **Athena** +Supported by: **Athena**, **filesystem** From ef29aa7c2fdba79441573850c7d15b83526c011a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 11:36:13 +0400 Subject: [PATCH 46/71] use shorter pipeline name to prevent too long sql identifiers --- tests/load/pipeline/test_merge_disposition.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2925bfac6f..2b638fee9a 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -284,9 +284,7 @@ def test_merge_nested_records_inserted_deleted( destination_config: DestinationTestConfiguration, merge_strategy: TLoaderMergeStrategy, ) -> None: - p = destination_config.setup_pipeline( - "test_merge_nested_records_inserted_deleted", dev_mode=True - ) + p = destination_config.setup_pipeline("abstract", dev_mode=True) skip_if_not_supported(merge_strategy, p.destination) From f463d065cbdd9ac06d33ee1fe19f2d716e5a950b Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 13:53:16 +0400 Subject: [PATCH 47/71] add iceberg catalog note to docs --- docs/website/docs/dlt-ecosystem/destinations/filesystem.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index bbc795d637..ce05e44ebc 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -654,6 +654,10 @@ You also need `sqlalchemy>=2.0.18`: pip install 'sqlalchemy>=2.0.18' ``` +:::note +`dlt` uses an ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/). This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. 
+::: + ### Set table format Set the `table_format` argument to `delta` or `iceberg` when defining your resource: From d50aaa16db97d0edbc8fd78b0ef2e26547d0368d Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 13:59:26 +0400 Subject: [PATCH 48/71] black format --- dlt/common/libs/pyarrow.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 37268c0d2f..029cd75399 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -628,7 +628,14 @@ def row_tuples_to_arrow( " extracting an SQL VIEW that selects with cast." ) json_str_array = pa.array( - [None if s is None else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) for s in columnar_known_types[field.name]] + [ + ( + None + if s is None + else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) + ) + for s in columnar_known_types[field.name] + ] ) columnar_known_types[field.name] = json_str_array From 6cce03b77111825b0714597e6d494df97145f0f2 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 15:01:10 +0400 Subject: [PATCH 49/71] use shorter pipeline name to prevent too long sql identifiers --- tests/load/pipeline/test_merge_disposition.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2b638fee9a..8b3f55d540 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -419,9 +419,7 @@ def test_bring_your_own_dlt_id( destination_config: DestinationTestConfiguration, merge_strategy: TLoaderMergeStrategy, ) -> None: - p = destination_config.setup_pipeline( - "test_merge_nested_records_inserted_deleted", dev_mode=True - ) + p = destination_config.setup_pipeline("abstract", dev_mode=True) skip_if_not_supported(merge_strategy, p.destination) From fc61663365e7b5601201ceaa799ee322816d7a2a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 16:39:27 +0400 Subject: [PATCH 50/71] correct max id length for sqlalchemy mysql dialect --- dlt/destinations/impl/sqlalchemy/factory.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index edd827ed00..e61ac1fb6a 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -81,6 +81,9 @@ def adjust_capabilities( caps.max_column_identifier_length = dialect.max_identifier_length caps.supports_native_boolean = dialect.supports_native_boolean if dialect.name == "mysql": + # correct max identifier length + # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers) + caps.max_identifier_length = 64 caps.format_datetime_literal = _format_mysql_datetime_literal return caps From b011907b07ea49b53fe05e2fc9bb6ae2b78c1298 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 16:40:40 +0400 Subject: [PATCH 51/71] Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit 6cce03b77111825b0714597e6d494df97145f0f2. 
--- tests/load/pipeline/test_merge_disposition.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 8b3f55d540..2b638fee9a 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -419,7 +419,9 @@ def test_bring_your_own_dlt_id( destination_config: DestinationTestConfiguration, merge_strategy: TLoaderMergeStrategy, ) -> None: - p = destination_config.setup_pipeline("abstract", dev_mode=True) + p = destination_config.setup_pipeline( + "test_merge_nested_records_inserted_deleted", dev_mode=True + ) skip_if_not_supported(merge_strategy, p.destination) From e748dcfa8395ff67bdca8cda36c8ca5c9a90553e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 16:41:22 +0400 Subject: [PATCH 52/71] Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit ef29aa7c2fdba79441573850c7d15b83526c011a. --- tests/load/pipeline/test_merge_disposition.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2b638fee9a..2925bfac6f 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -284,7 +284,9 @@ def test_merge_nested_records_inserted_deleted( destination_config: DestinationTestConfiguration, merge_strategy: TLoaderMergeStrategy, ) -> None: - p = destination_config.setup_pipeline("abstract", dev_mode=True) + p = destination_config.setup_pipeline( + "test_merge_nested_records_inserted_deleted", dev_mode=True + ) skip_if_not_supported(merge_strategy, p.destination) From 1b47893bc194bfdb3fa7eefc412fcd00e383a9db Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 29 Nov 2024 18:02:26 +0400 Subject: [PATCH 53/71] replace show with execute to prevent useless print output --- dlt/destinations/impl/filesystem/sql_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index 8847633310..4e744eea1a 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -310,7 +310,7 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: # needed to make persistent secrets work in new connection # https://github.com/duckdb/duckdb_iceberg/issues/83 - conn.sql("FROM duckdb_secrets();").show() + conn.execute("FROM duckdb_secrets();") # `duckdb_iceberg` extension does not support autoloading # https://github.com/duckdb/duckdb_iceberg/issues/71 From 133f1ce2a3855235825d8cc11dcc9d0b8b7d78b1 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 30 Nov 2024 15:33:25 +0400 Subject: [PATCH 54/71] add abfss scheme to test --- tests/load/filesystem/test_credentials_mixins.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/load/filesystem/test_credentials_mixins.py b/tests/load/filesystem/test_credentials_mixins.py index 6ce80aa8ee..4a1c383d11 100644 --- a/tests/load/filesystem/test_credentials_mixins.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -27,6 +27,7 @@ from tests.load.utils import ( AZ_BUCKET, + ABFS_BUCKET, AWS_BUCKET, GCS_BUCKET, R2_BUCKET_CONFIG, @@ -108,18 +109,19 @@ def can_connect_pyiceberg_fileio_config( 
@pytest.mark.parametrize( - "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] + "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az", "abfss")] ) @pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) def test_azure_credentials_mixins( driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] ) -> None: + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds, mixin) + assert can_connect(buckets[driver], creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -127,12 +129,12 @@ def test_azure_credentials_mixins( azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds, mixin) + assert can_connect(buckets[driver], creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds, mixin) + assert can_connect(buckets[driver], creds, mixin) @pytest.mark.parametrize( From eceb19ff78dfafd67ff85820e75e8fc61277f06b Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 30 Nov 2024 15:34:37 +0400 Subject: [PATCH 55/71] remove az support for iceberg table format --- dlt/common/libs/pyiceberg.py | 9 +-------- tests/load/filesystem/test_credentials_mixins.py | 6 ++++-- tests/load/utils.py | 3 +++ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py index 303c20cf7e..19ce9abbf2 100644 --- a/dlt/common/libs/pyiceberg.py +++ b/dlt/common/libs/pyiceberg.py @@ -189,11 +189,4 @@ def _register_table( def _make_path(path: str, client: FilesystemClient) -> str: # don't use file protocol for local files because duckdb does not support it # https://github.com/duckdb/duckdb/issues/13669 - if not client.is_local_filesystem: - path = client.config.make_url(path) - return _map_scheme(path) - - -def _map_scheme(path: str) -> str: - # pyiceberg does not know `az://` scheme - return path.replace("az://", "abfss://") + return path if client.is_local_filesystem else client.config.make_url(path) diff --git a/tests/load/filesystem/test_credentials_mixins.py b/tests/load/filesystem/test_credentials_mixins.py index 4a1c383d11..c1fb02c152 100644 --- a/tests/load/filesystem/test_credentials_mixins.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -94,11 +94,10 @@ def can_connect_pyiceberg_fileio_config( bucket_url: str, pyiceberg_fileio_config: Dict[str, str] ) -> bool: from pyiceberg.table import StaticTable - from dlt.common.libs.pyiceberg import _map_scheme try: StaticTable.from_metadata( - f"{_map_scheme(bucket_url)}/non_existing_metadata_file.json", + f"{bucket_url}/non_existing_metadata_file.json", properties=pyiceberg_fileio_config, ) except FileNotFoundError: @@ -115,6 +114,9 @@ def can_connect_pyiceberg_fileio_config( def test_azure_credentials_mixins( driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] ) -> None: + if mixin == WithPyicebergConfig and driver == "az": + pytest.skip("`pyiceberg` does not support `az` scheme") + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials diff --git a/tests/load/utils.py b/tests/load/utils.py index 5bf5c51518..7835927f5c 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -625,6 +625,9 @@ def 
destinations_configs( ), ) ] + if bucket == AZ_BUCKET: + # `pyiceberg` does not support `az` scheme + continue destination_configs += [ DestinationTestConfiguration( destination_type="filesystem", From e75114d90b233cfdc484f7e36ffa7b39270b1a6e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 30 Nov 2024 15:35:23 +0400 Subject: [PATCH 56/71] remove iceberg bucket test exclusion --- tests/load/filesystem/test_sql_client.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index 654594b6c6..1bf70e3f1e 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -16,11 +16,9 @@ from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, - FILE_BUCKET, GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, - AWS_BUCKET, ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT @@ -295,12 +293,6 @@ def test_read_interfaces_filesystem( def test_table_formats( destination_config: DestinationTestConfiguration, secret_directory: str ) -> None: - if destination_config.table_format == "iceberg" and destination_config.bucket_url not in ( - FILE_BUCKET, - AWS_BUCKET, - ): - pytest.skip("only local and S3 filesystems are currently implemented for `iceberg`") - os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" pipeline = destination_config.setup_pipeline( From 049c00828ee7a43c7d9b4c07114c2066f6ad6a72 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 30 Nov 2024 15:55:38 +0400 Subject: [PATCH 57/71] add note to docs on azure scheme support for iceberg table format --- docs/website/docs/dlt-ecosystem/destinations/filesystem.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 04dbea49d8..d9ed52230c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -166,6 +166,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials. +#### Supported schemes + `dlt` supports both forms of the blob storage urls: ```toml [destination.filesystem] @@ -181,6 +183,8 @@ bucket_url = "abfss://@.dfs.core.windows.n You can use `az`, `abfss`, `azure` and `abfs` url schemes. +#### Custom host + If you need to use a custom host to account your storage account you can set it up like below: ```toml [destination.filesystem.credentials] @@ -728,6 +732,9 @@ Note that not all authentication methods are supported when using table formats | [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | | [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | +#### Iceberg Azure scheme +The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` used under the hood, currently does not support `az`. + #### Table format `merge` support (**experimental**) The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. 
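To illustrate the fallback described above, the following sketch declares a hypothetical `iceberg` resource with a `merge` write disposition; as documented, loads of this resource currently behave like `append`:

```py
import dlt


@dlt.resource(
    write_disposition="merge",
    primary_key="id",  # illustrative key; not acted on while merge falls back to append
    table_format="iceberg",
)
def my_iceberg_resource():
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]

# with the iceberg table format this currently loads as append,
# so repeated runs add rows instead of updating them by `id`
```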
From a0fc017c701538b04d6f6dfd9ad28d5d1f6e8352 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 1 Dec 2024 12:23:12 +0400 Subject: [PATCH 58/71] exclude iceberg from duckdb s3-compatibility test --- dlt/destinations/impl/filesystem/sql_client.py | 5 +++-- docs/website/docs/dlt-ecosystem/destinations/filesystem.md | 7 ++++++- tests/load/filesystem/test_sql_client.py | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index 4e744eea1a..6409167e29 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -169,8 +169,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non # native google storage implementation is not supported.. elif self.fs_client.config.protocol in ["gs", "gcs"]: logger.warn( - "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling" - " back to fsspec." + "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if" + " possible (not supported when using `iceberg` table format). Falling back to" + " fsspec." ) self._conn.register_filesystem(self.fs_client.fs_client) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index d9ed52230c..c2582d6059 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. dlt does #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google +Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -732,6 +733,10 @@ Note that not all authentication methods are supported when using table formats | [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | | [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | +:::note +The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. +::: + #### Iceberg Azure scheme The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` used under the hood, currently does not support `az`. 
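A minimal sketch of the `abfss` requirement above, with placeholder container, account, and pipeline names (credentials are assumed to be configured separately, e.g. in `secrets.toml`):

```py
import dlt
from dlt.destinations import filesystem

# the bucket url uses the abfss scheme required by pyiceberg;
# container and storage account names are placeholders
pipeline = dlt.pipeline(
    pipeline_name="abfss_iceberg_example",
    destination=filesystem("abfss://my_container@my_storage_account.dfs.core.windows.net/data"),
    dataset_name="my_dataset",
)

pipeline.run([{"id": 1}], table_name="items", table_format="iceberg")
```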
diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index 1bf70e3f1e..a73b0f7e31 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -303,8 +303,9 @@ def test_table_formats( # in case of gcs we use the s3 compat layer for reading # for writing we still need to use the gc authentication, as delta_rs seems to use # methods on the s3 interface that are not implemented by gcs + # s3 compat layer does not work with `iceberg` table format access_pipeline = pipeline - if destination_config.bucket_url == GCS_BUCKET: + if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format != "iceberg": gcp_bucket = filesystem( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) From de0086e6e8f3890ccde6c2fc80d9d48b7cef5e1e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 1 Dec 2024 13:12:05 +0400 Subject: [PATCH 59/71] disable pyiceberg info logs for tests --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 6088fa976c..a5a349f8d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,9 @@ def _create_pipeline_instance_id(self) -> str: # disable googleapiclient logging logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING") + # disable pyiceberg logging + logging.getLogger("pyiceberg").setLevel("WARNING") + # reset and init airflow db import warnings From ca7f655316f09235421d32908b1d58bda441412e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 1 Dec 2024 15:14:45 +0400 Subject: [PATCH 60/71] extend table format docs and move into own page --- .../destinations/delta-iceberg.md | 167 ++++++++++++++++++ .../dlt-ecosystem/destinations/filesystem.md | 157 +--------------- docs/website/sidebars.js | 1 + 3 files changed, 171 insertions(+), 154 deletions(-) create mode 100644 docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md new file mode 100644 index 0000000000..d062317f1c --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -0,0 +1,167 @@ +--- +title: Delta / Iceberg +description: Delta / Iceberg `dlt` destination +keywords: [delta, iceberg, destination, data warehouse] +--- + +# Delta and Iceberg table formats +`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination. + +## How it works +`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + +## Iceberg catalog +`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/)s. These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. 
+
+:::caution
+While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations:
+- concurrent writes are not handled and may lead to corrupt table state
+- the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores
+:::
+
+## Delta dependencies
+
+You need the `deltalake` package to use this format:
+
+```sh
+pip install "dlt[deltalake]"
+```
+
+You also need `pyarrow>=17.0.0`:
+
+```sh
+pip install 'pyarrow>=17.0.0'
+```
+
+## Iceberg dependencies
+
+You need the `pyiceberg` package to use this format:
+
+```sh
+pip install "dlt[pyiceberg]"
+```
+
+You also need `sqlalchemy>=2.0.18`:
+
+```sh
+pip install 'sqlalchemy>=2.0.18'
+```
+
+## Set table format
+
+Set the `table_format` argument to `delta` or `iceberg` when defining your resource:
+
+```py
+@dlt.resource(table_format="delta")
+def my_delta_resource():
+    ...
+```
+
+or when calling `run` on your pipeline:
+
+```py
+pipeline.run(my_resource, table_format="delta")
+```
+
+:::note
+`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded.
+:::
+
+
+## Table format partitioning
+Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column:
+
+```py
+@dlt.resource(
+  table_format="delta",
+  columns={"foo": {"partition": True}}
+)
+def my_delta_resource():
+    ...
+```
+
+:::note
+Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/).
+:::
+
+:::caution
+Partition evolution (changing partition columns after a table has been created) is not supported.
+:::
+
+## Table access helper functions
+You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects; for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects.
+
+```py
+from dlt.common.libs.deltalake import get_delta_tables
+# from dlt.common.libs.pyiceberg import get_iceberg_tables
+
+...
+
+# get dictionary of DeltaTable objects
+delta_tables = get_delta_tables(pipeline)
+
+# execute operations on DeltaTable objects
+delta_tables["my_delta_table"].optimize.compact()
+delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"])
+# delta_tables["my_delta_table"].vacuum()
+# etc.
+```
+
+## Table format Google Cloud Storage authentication
+
+Note that not all authentication methods are supported when using table formats on Google Cloud Storage:
+
+| Authentication method | `delta` | `iceberg` |
+| -- | -- | -- |
+| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ |
+| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ |
+| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ |
+
+:::note
+The [S3-compatible](./filesystem.md#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`.
+:::
+
+## Iceberg Azure scheme
+The `az` [scheme](./filesystem.md#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` uses under the hood, currently does not support `az`.
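
As a counterpart to the `get_delta_tables` example in the helper-functions section above, here is a minimal sketch of reading an Iceberg table back with `get_iceberg_tables`. It assumes the helper mirrors `get_delta_tables` and returns a dictionary of `pyiceberg` `Table` objects keyed by table name; the pipeline and table names are placeholders:

```py
import dlt
from dlt.common.libs.pyiceberg import get_iceberg_tables

# attach to the pipeline that loaded the tables (name is a placeholder)
pipeline = dlt.attach("my_pipeline")

iceberg_tables = get_iceberg_tables(pipeline)

# use the pyiceberg Table API, e.g. scan a table into an Arrow table
arrow_table = iceberg_tables["my_iceberg_table"].scan().to_arrow()
print(arrow_table.num_rows)
```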
+ +## Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. + +## Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. + +## Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index c2582d6059..efb39cc58e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -627,160 +627,9 @@ You can choose the following file formats: ## Supported table formats -You can choose the following table formats: -* [Delta table](../table-formats/delta.md) is supported -* [Iceberg](../table-formats/iceberg.md) is supported (**experimental**) - -### Delta table format dependencies - -You need the `deltalake` package to use this format: - -```sh -pip install "dlt[deltalake]" -``` - -You also need `pyarrow>=17.0.0`: - -```sh -pip install 'pyarrow>=17.0.0' -``` - -### Iceberg table format dependencies - -You need the `pyiceberg` package to use this format: - -```sh -pip install "dlt[pyiceberg]" -``` - -You also need `sqlalchemy>=2.0.18`: - -```sh -pip install 'sqlalchemy>=2.0.18' -``` - -:::note -`dlt` uses an ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/). This allows for a serverless setup. 
It is currently not possible to connect your own Iceberg catalog. -::: - -### Set table format - -Set the `table_format` argument to `delta` or `iceberg` when defining your resource: - -```py -@dlt.resource(table_format="delta") -def my_delta_resource(): - ... -``` - -or when calling `run` on your pipeline: - -```py -pipeline.run(my_resource, table_format="delta") -``` - -:::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. -::: - - -#### Table format partitioning -Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: - -```py -@dlt.resource( - table_format="delta", - columns={"foo": {"partition": True}} -) -def my_delta_resource(): - ... -``` - -:::note -Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partioning](https://iceberg.apache.org/docs/latest/partitioning/). -::: - -:::caution -Partition evolution (changing partition columns after a table has been created) is not supported. -::: - -#### Table access helper functions -You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to acccess native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects, for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects. - -```py -from dlt.common.libs.deltalake import get_delta_tables -# from dlt.common.libs.pyiceberg import get_iceberg_tables - -... - -# get dictionary of DeltaTable objects -delta_tables = get_delta_tables(pipeline) - -# execute operations on DeltaTable objects -delta_tables["my_delta_table"].optimize.compact() -delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) -# delta_tables["my_delta_table"].vacuum() -# etc. -``` - -#### Table format Google Cloud Storage authentication - -Note that not all authentication methods are supported when using table formats on Google Cloud Storage: - -| Authentication method | `delta` | `iceberg` | -| -- | -- | -- | -| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | -| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | -| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | - -:::note -The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. -::: - -#### Iceberg Azure scheme -The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` used under the hood, currently does not support `az`. - -#### Table format `merge` support (**experimental**) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... 
-``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - -#### Delta table format storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. - ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### Delta table format memory usage -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: +You can choose the following [table formats](./delta-iceberg.md): +* Delta table +* Iceberg ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. 
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..8e8c11fc09 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -167,6 +167,7 @@ const sidebars = { 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/clickhouse', 'dlt-ecosystem/destinations/filesystem', + 'dlt-ecosystem/destinations/delta-iceberg', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', From 2ba8fcb64da25c4284486b15f72ebc7a59dcfe3c Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 1 Dec 2024 18:50:01 +0400 Subject: [PATCH 61/71] upgrade adlfs to enable account_host attribute --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7d5e236be2..ccbec5bd8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -13,13 +13,13 @@ files = [ [[package]] name = "adlfs" -version = "2024.4.1" +version = "2024.7.0" description = "Access Azure Datalake Gen1 with fsspec and dask" optional = true python-versions = ">=3.8" files = [ - {file = "adlfs-2024.4.1-py3-none-any.whl", hash = "sha256:acea94612ddacaa34ea8c6babcc95b8da6982f930cdade7a86fbd17382403e16"}, - {file = "adlfs-2024.4.1.tar.gz", hash = "sha256:75530a45447f358ae53c5c39c298b8d966dae684be84db899f63b94cd96fc000"}, + {file = "adlfs-2024.7.0-py3-none-any.whl", hash = "sha256:2005c8e124fda3948f2a6abb2dbebb2c936d2d821acaca6afd61932edfa9bc07"}, + {file = "adlfs-2024.7.0.tar.gz", hash = "sha256:106995b91f0eb5e775bcd5957d180d9a14faef3271a063b1f65c66fd5ab05ddf"}, ] [package.dependencies] @@ -10716,4 +10716,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "3f57364040f4f97a1422a6bd6ce72a70c655f379c77f65d091ac0417d44c0a31" +content-hash = "76b04f900df1025a0a530825b5ee36dab2897ae490c34d77ccf90ce7697249de" diff --git a/pyproject.toml b/pyproject.toml index 17c7af99d8..d62e5f4036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -adlfs = {version = ">=2022.4.0", optional = true} +adlfs = {version = ">=2024.7.0", optional = true} pyodbc = {version = ">=4.0.39", optional = true} qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} From 0517a95c54a4fbeb00ed34a157e4353ea5455555 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 2 Dec 2024 22:45:55 +0400 Subject: [PATCH 62/71] Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996-iceberg-filesystem --- dlt/cli/deploy_command_helpers.py | 13 +- dlt/cli/source_detection.py | 7 +- .../configuration/specs/base_configuration.py | 2 +- dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/reference.py | 1 - dlt/common/destination/utils.py | 2 +- dlt/common/libs/pandas.py | 5 +- dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/normalizers/json/helpers.py | 141 + dlt/common/normalizers/json/relational.py | 172 +- dlt/common/normalizers/naming/naming.py | 2 + dlt/common/normalizers/typing.py | 2 + dlt/common/reflection/utils.py | 28 +- dlt/common/schema/configuration.py | 2 + dlt/common/schema/migrations.py | 7 +- dlt/common/schema/normalizers.py | 7 +- dlt/common/schema/schema.py | 28 +- 
dlt/common/schema/typing.py | 2 +- dlt/common/typing.py | 2 +- dlt/common/utils.py | 5 + dlt/destinations/dataset.py | 6 +- .../impl/clickhouse/sql_client.py | 6 +- .../impl/filesystem/filesystem.py | 3 +- dlt/extract/extractors.py | 14 +- dlt/extract/incremental/lag.py | 2 +- dlt/normalize/worker.py | 5 +- dlt/reflection/script_visitor.py | 9 +- dlt/sources/sql_database/arrow_helpers.py | 5 +- .../dlt-ecosystem/destinations/filesystem.md | 4 +- .../verified-sources/arrow-pandas.md | 2 + .../docs/general-usage/naming-convention.md | 39 + mypy.ini | 2 +- poetry.lock | 2 +- pyproject.toml | 4 +- .../cases/schemas/eth/ethereum_schema_v11.yml | 394 +++ .../cases/schemas/github/issues.schema.json | 2404 ++++++++--------- .../normalizers/test_json_relational.py | 10 +- .../normalizers/test_naming_snake_case.py | 8 + .../common/schema/test_import_normalizers.py | 36 +- .../schema/test_normalize_identifiers.py | 62 +- tests/common/schema/test_schema.py | 20 +- tests/common/schema/test_versioning.py | 12 +- tests/common/storages/test_schema_storage.py | 12 +- tests/common/storages/utils.py | 4 +- tests/common/test_utils.py | 9 + tests/common/test_validation.py | 2 +- tests/common/utils.py | 6 +- .../cases/eth_source/ethereum.schema.yaml | 4 +- tests/extract/test_decorators.py | 4 +- tests/extract/test_incremental.py | 76 +- tests/libs/pyarrow/test_pyarrow_normalizer.py | 4 +- tests/libs/test_csv_writer.py | 4 +- .../test_clickhouse_configuration.py | 26 +- tests/load/conftest.py | 2 +- tests/load/duckdb/test_duckdb_client.py | 2 +- tests/load/filesystem/test_aws_credentials.py | 1 - .../load/filesystem/test_filesystem_common.py | 1 - tests/load/pipeline/conftest.py | 2 +- tests/load/pipeline/test_merge_disposition.py | 2 +- tests/load/pipeline/test_scd2.py | 3 +- tests/load/qdrant/utils.py | 1 - tests/load/redshift/test_redshift_client.py | 2 +- .../sql_database/test_sql_database_source.py | 2 +- tests/load/test_job_client.py | 2 +- tests/load/test_read_interfaces.py | 11 +- tests/load/test_sql_client.py | 2 +- tests/load/weaviate/utils.py | 1 - .../cases/github_pipeline/github_rev.py | 26 + tests/pipeline/test_dlt_versions.py | 56 + .../test_max_nesting.py | 0 tests/pipeline/test_pipeline.py | 105 + .../helpers/rest_client/test_client.py | 2 +- 73 files changed, 2219 insertions(+), 1638 deletions(-) create mode 100644 dlt/common/normalizers/json/helpers.py create mode 100644 tests/common/cases/schemas/eth/ethereum_schema_v11.yml create mode 100644 tests/pipeline/cases/github_pipeline/github_rev.py rename tests/{normalize => pipeline}/test_max_nesting.py (100%) diff --git a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index b508b32226..e3719fbe38 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -5,7 +5,6 @@ from yaml import Dumper from itertools import chain from typing import List, Optional, Sequence, Tuple, Any, Dict -from astunparse import unparse # optional dependencies import pipdeptree @@ -23,7 +22,7 @@ from dlt.common.git import get_origin, get_repo, Repo from dlt.common.configuration.specs.runtime_configuration import get_default_pipeline_name from dlt.common.typing import StrAny -from dlt.common.reflection.utils import evaluate_node_literal +from dlt.common.reflection.utils import evaluate_node_literal, ast_unparse from dlt.common.pipeline import LoadInfo, TPipelineState, get_dlt_repos_dir from dlt.common.storages import FileStorage from dlt.common.utils import set_working_dir @@ -313,7 +312,7 @@ def 
parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio if f_r_value is None: fmt.warning( "The value of `dev_mode` in call to `dlt.pipeline` cannot be" - f" determined from {unparse(f_r_node).strip()}. We assume that you know" + f" determined from {ast_unparse(f_r_node).strip()}. We assume that you know" " what you are doing :)" ) if f_r_value is True: @@ -331,8 +330,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipelines_dir' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipelines-dir option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. Pass it directly with --pipelines-dir option.", ) p_n_node = call_args.arguments.get("pipeline_name") @@ -342,8 +341,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipeline_name' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipeline-name option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. Pass it directly with --pipeline-name option.", ) pipelines.append((pipeline_name, pipelines_dir)) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index c3e24eca91..7067f8b896 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -1,11 +1,10 @@ import ast import inspect -from astunparse import unparse from typing import Dict, Tuple, Set, List from dlt.common.configuration import is_secret_hint from dlt.common.configuration.specs import BaseConfiguration -from dlt.common.reflection.utils import creates_func_def_name_node +from dlt.common.reflection.utils import creates_func_def_name_node, ast_unparse from dlt.common.typing import is_optional_type from dlt.sources import SourceReference @@ -30,7 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" # type: ignore[attr-defined] + f"The pipeline script {init_script_name} must pass the {t_arg_name} as" f" string to '{arg_name}' function in line {dn_node.lineno}", ) else: @@ -65,7 +64,7 @@ def find_source_calls_to_replace( for calls in visitor.known_sources_resources_calls.values(): for call in calls: transformed_nodes.append( - (call.func, ast.Name(id=pipeline_name + "_" + unparse(call.func))) + (call.func, ast.Name(id=pipeline_name + "_" + ast_unparse(call.func))) ) return transformed_nodes diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 41d1d7a0ca..8d913d0542 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type + f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 6ef431a4d0..eb7487051f 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore[operator] self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index d1024eb28c..e27f99cde7 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -81,7 +81,6 @@ DataFrame = Any ArrowTable = Any IbisBackend = Any - else: DataFrame = Any ArrowTable = Any diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index c98344b687..96503c036f 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore[operator] ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index a165ea8747..35cfe623bb 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -8,8 +8,9 @@ raise MissingDependencyException("dlt Pandas Helpers", ["pandas"]) -def pandas_to_arrow(df: pandas.DataFrame) -> Any: +def pandas_to_arrow(df: pandas.DataFrame, preserve_index: bool = False) -> Any: """Converts pandas to arrow or raises an exception if pyarrow is not installed""" from dlt.common.libs.pyarrow import pyarrow as pa - return pa.Table.from_pandas(df) + # NOTE: None preserves named indexes but ignores unnamed + return pa.Table.from_pandas(df, preserve_index=preserve_index) diff --git a/dlt/common/logger.py b/dlt/common/logger.py index 634e305805..0533713fda 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) + return logging.getLevelName(LOGGER.level) # type: ignore[no-any-return] def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index 2f9f574dd0..d6acf19d0d 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == 
other.file_path else "", diff --git a/dlt/common/normalizers/json/helpers.py b/dlt/common/normalizers/json/helpers.py new file mode 100644 index 0000000000..96c9ab4954 --- /dev/null +++ b/dlt/common/normalizers/json/helpers.py @@ -0,0 +1,141 @@ +""" +Cached helper methods for all operations that are called often +""" +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple, cast + +from dlt.common.json import json +from dlt.common.destination.utils import resolve_merge_strategy +from dlt.common.normalizers.naming import NamingConvention +from dlt.common.normalizers.typing import TRowIdType +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES +from dlt.common.schema import Schema +from dlt.common.schema.typing import TColumnSchema, C_DLT_ID, DLT_NAME_PREFIX +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + get_first_column_name_with_prop, + is_nested_table, +) +from dlt.common.utils import digest128 + + +@lru_cache(maxsize=None) +def shorten_fragments(naming: NamingConvention, *idents: str) -> str: + return naming.shorten_fragments(*idents) + + +@lru_cache(maxsize=None) +def normalize_table_identifier(schema: Schema, naming: NamingConvention, table_name: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_tables_path(table_name) + else: + return naming.normalize_table_identifier(table_name) + + +@lru_cache(maxsize=None) +def normalize_identifier(schema: Schema, naming: NamingConvention, identifier: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_path(identifier) + else: + return naming.normalize_identifier(identifier) + + +@lru_cache(maxsize=None) +def get_table_nesting_level( + schema: Schema, table_name: str, default_nesting: int = 1000 +) -> Optional[int]: + """gets table nesting level, will inherit from parent if not set""" + + table = schema.tables.get(table_name) + if ( + table + and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) is not None + ): + return max_nesting + return default_nesting + + +@lru_cache(maxsize=None) +def get_primary_key(schema: Schema, table_name: str) -> List[str]: + if table_name not in schema.tables: + return [] + table = schema.get_table(table_name) + return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) + + +@lru_cache(maxsize=None) +def is_nested_type( + schema: Schema, + table_name: str, + field_name: str, + _r_lvl: int, +) -> bool: + """For those paths the nested objects should be left in place. 
+ Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster + """ + + # nesting level is counted backwards + # is we have traversed to or beyond the calculated nesting level, we detect a nested type + if _r_lvl <= 0: + return True + + column: TColumnSchema = None + table = schema.tables.get(table_name) + if table: + column = table["columns"].get(field_name) + if column is None or "data_type" not in column: + data_type = schema.get_preferred_type(field_name) + else: + data_type = column["data_type"] + + return data_type == "json" + + +@lru_cache(maxsize=None) +def get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: + """Gets type of row id to be added to nested table and if linking information should be added""" + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): + return "random", False + else: + # table will be created, use standard linking + pass + return "row_hash", True + + +@lru_cache(maxsize=None) +def get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy == "upsert": + return "key_hash" + elif merge_strategy == "scd2": + x_row_version_col = get_first_column_name_with_prop( + schema.get_table(table_name), + "x-row-version", + include_incomplete=True, + ) + if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): + return "row_hash" + return "random" + + +def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: + """Returns hash of row. + + Hash includes column names and values and is ordered by column name. + Excludes dlt system columns. + Can be used as deterministic row identifier. 
+ """ + row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} + if subset is not None: + row_filtered = {k: v for k, v in row.items() if k in subset} + row_str = json.dumps(row_filtered, sort_keys=True) + return digest128(row_str, DLT_ID_LENGTH_BYTES) + + +def get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: + # create deterministic unique id of the nested row taking into account that all lists are ordered + # and all nested tables must be lists + return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index c5338192a0..e365017125 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,34 +1,27 @@ -from functools import lru_cache from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any -from dlt.common.destination.utils import resolve_merge_strategy -from dlt.common.json import json -from dlt.common.normalizers.exceptions import InvalidJsonNormalizer -from dlt.common.normalizers.typing import TJSONNormalizer, TRowIdType -from dlt.common.normalizers.utils import generate_dlt_id, DLT_ID_LENGTH_BYTES +from dlt.common.normalizers.exceptions import InvalidJsonNormalizer +from dlt.common.normalizers.typing import TJSONNormalizer +from dlt.common.normalizers.utils import generate_dlt_id from dlt.common.typing import DictStrAny, TDataItem, StrAny from dlt.common.schema import Schema from dlt.common.schema.typing import ( C_DLT_ID, C_DLT_LOAD_ID, - TColumnSchema, TColumnName, TSimpleRegex, - DLT_NAME_PREFIX, ) from dlt.common.schema.utils import ( column_name_validator, - get_columns_names_with_prop, - get_first_column_name_with_prop, - has_column_with_prop, is_nested_table, ) -from dlt.common.utils import digest128, update_dict_nested +from dlt.common.utils import update_dict_nested from dlt.common.normalizers.json import ( TNormalizedRowIterator, wrap_in_dict, DataItemNormalizer as DataItemNormalizerBase, ) +from dlt.common.normalizers.json import helpers from dlt.common.validation import validate_dict @@ -103,18 +96,18 @@ def _flatten( def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) -> None: for k, v in dict_row.items(): if k.strip(): - norm_k = self._normalize_identifier(self.schema, k) + norm_k = helpers.normalize_identifier(self.schema, self.naming, k) else: # for empty keys in the data use _ norm_k = self.EMPTY_KEY_IDENTIFIER # if norm_k != k: # print(f"{k} -> {norm_k}") nested_name = ( - norm_k if path == () else self._shorten_fragments(self.schema, *path, norm_k) + norm_k if path == () else helpers.shorten_fragments(self.naming, *path, norm_k) ) # for lists and dicts we must check if type is possibly nested if isinstance(v, (dict, list)): - if not self._is_nested_type(self.schema, table, nested_name, __r_lvl): + if not helpers.is_nested_type(self.schema, table, nested_name, __r_lvl): # TODO: if schema contains table {table}__{nested_name} then convert v into single element list if isinstance(v, dict): # flatten the dict more @@ -122,7 +115,8 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] 
= ()) - else: # pass the list to out_rec_list out_rec_list[ - path + (self._normalize_table_identifier(self.schema, k),) + path + + (helpers.normalize_table_identifier(self.schema, self.naming, k),) ] = v continue else: @@ -134,26 +128,6 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - norm_row_dicts(dict_row, _r_lvl) return out_rec_row, out_rec_list - @staticmethod - def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: - """Returns hash of row. - - Hash includes column names and values and is ordered by column name. - Excludes dlt system columns. - Can be used as deterministic row identifier. - """ - row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} - if subset is not None: - row_filtered = {k: v for k, v in row.items() if k in subset} - row_str = json.dumps(row_filtered, sort_keys=True) - return digest128(row_str, DLT_ID_LENGTH_BYTES) - - @staticmethod - def _get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: - # create deterministic unique id of the nested row taking into account that all lists are ordered - # and all nested tables must be lists - return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id row[self.c_dlt_parent_id] = parent_row_id @@ -175,20 +149,20 @@ def _add_row_id( is_root: bool = False, ) -> str: if is_root: # root table - row_id_type = self._get_root_row_id_type(self.schema, table) + row_id_type = helpers.get_root_row_id_type(self.schema, table) if row_id_type in ("key_hash", "row_hash"): subset = None if row_id_type == "key_hash": - subset = self._get_primary_key(self.schema, table) + subset = helpers.get_primary_key(self.schema, table) # base hash on `dict_row` instead of `flattened_row` # so changes in nested tables lead to new row id - row_id = self.get_row_hash(dict_row, subset=subset) + row_id = helpers.get_row_hash(dict_row, subset=subset) else: row_id = generate_dlt_id() else: # nested table - row_id_type, is_nested = self._get_nested_row_id_type(self.schema, table) + row_id_type, is_nested = helpers.get_nested_row_id_type(self.schema, table) if row_id_type == "row_hash": - row_id = DataItemNormalizer._get_nested_row_hash(parent_row_id, table, pos) + row_id = helpers.get_nested_row_hash(parent_row_id, table, pos) # link to parent table if is_nested: self._link_row(flattened_row, parent_row_id, pos) @@ -227,7 +201,7 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - table = self._shorten_fragments(self.schema, *parent_path, *ident_path) + table = helpers.shorten_fragments(self.naming, *parent_path, *ident_path) for idx, v in enumerate(seq): if isinstance(v, dict): @@ -251,7 +225,7 @@ def _normalize_list( wrap_v = wrap_in_dict(self.c_value, v) DataItemNormalizer._extend_row(extend, wrap_v) self._add_row_id(table, wrap_v, wrap_v, parent_row_id, idx) - yield (table, self._shorten_fragments(self.schema, *parent_path)), wrap_v + yield (table, helpers.shorten_fragments(self.naming, *parent_path)), wrap_v def _normalize_row( self, @@ -264,8 +238,8 @@ def _normalize_row( _r_lvl: int = 0, is_root: bool = False, ) -> TNormalizedRowIterator: - schema = self.schema - table = self._shorten_fragments(schema, *parent_path, *ident_path) + naming = self.naming + table = helpers.shorten_fragments(naming, *parent_path, *ident_path) # flatten current row and 
extract all lists to recur into flattened_row, lists = self._flatten(table, dict_row, _r_lvl) # always extend row @@ -280,7 +254,7 @@ def _normalize_row( # yield parent table first should_descend = yield ( - (table, self._shorten_fragments(schema, *parent_path)), + (table, helpers.shorten_fragments(naming, *parent_path)), flattened_row, ) if should_descend is False: @@ -361,8 +335,10 @@ def normalize_data_item( # identify load id if loaded data must be processed after loading incrementally row[self.c_dlt_load_id] = load_id # get table name and nesting level - root_table_name = self._normalize_table_identifier(self.schema, table_name) - max_nesting = self._get_table_nesting_level(self.schema, root_table_name, self.max_nesting) + root_table_name = helpers.normalize_table_identifier(self.schema, self.naming, table_name) + max_nesting = helpers.get_table_nesting_level( + self.schema, root_table_name, self.max_nesting + ) yield from self._normalize_row( row, @@ -426,103 +402,3 @@ def _normalize_prop( "./normalizers/json/config", validator_f=column_name_validator(schema.naming), ) - - # - # Cached helper methods for all operations that are called often - # - @staticmethod - @lru_cache(maxsize=None) - def _shorten_fragments(schema: Schema, *idents: str) -> str: - return schema.naming.shorten_fragments(*idents) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_table_identifier(schema: Schema, table_name: str) -> str: - return schema.naming.normalize_table_identifier(table_name) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_identifier(schema: Schema, identifier: str) -> str: - return schema.naming.normalize_path(identifier) - - @staticmethod - @lru_cache(maxsize=None) - def _get_table_nesting_level( - schema: Schema, table_name: str, default_nesting: int = 1000 - ) -> Optional[int]: - """gets table nesting level, will inherit from parent if not set""" - - table = schema.tables.get(table_name) - if ( - table - and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) - is not None - ): - return max_nesting - return default_nesting - - @staticmethod - @lru_cache(maxsize=None) - def _get_primary_key(schema: Schema, table_name: str) -> List[str]: - if table_name not in schema.tables: - return [] - table = schema.get_table(table_name) - return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) - - @staticmethod - @lru_cache(maxsize=None) - def _is_nested_type( - schema: Schema, - table_name: str, - field_name: str, - _r_lvl: int, - ) -> bool: - """For those paths the nested objects should be left in place. 
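Moving these helpers out of the class and keying them on the `Schema` / `NamingConvention` instances is what makes the `functools.lru_cache` above effective across rows. A minimal sketch of the pattern, assuming the real objects stay hashable (the default identity hash is enough); the `Naming` class here is a made-up stand-in:

```py
from functools import lru_cache

class Naming:
    # hypothetical stand-in for a dlt naming convention
    def normalize_table_identifier(self, name: str) -> str:
        return name.strip().lower().replace(" ", "_")

@lru_cache(maxsize=None)
def normalize_table_identifier(naming: Naming, table_name: str) -> str:
    # the (naming, table_name) pair is the cache key
    return naming.normalize_table_identifier(table_name)

naming = Naming()
assert normalize_table_identifier(naming, "My Table") == "my_table"
normalize_table_identifier(naming, "My Table")
# the repeated call was served from the cache
assert normalize_table_identifier.cache_info().hits == 1
```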
- Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster - """ - - # nesting level is counted backwards - # is we have traversed to or beyond the calculated nesting level, we detect a nested type - if _r_lvl <= 0: - return True - - column: TColumnSchema = None - table = schema.tables.get(table_name) - if table: - column = table["columns"].get(field_name) - if column is None or "data_type" not in column: - data_type = schema.get_preferred_type(field_name) - else: - data_type = column["data_type"] - - return data_type == "json" - - @staticmethod - @lru_cache(maxsize=None) - def _get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: - """Gets type of row id to be added to nested table and if linking information should be added""" - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): - return "random", False - else: - # table will be created, use standard linking - pass - return "row_hash", True - - @staticmethod - @lru_cache(maxsize=None) - def _get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy == "upsert": - return "key_hash" - elif merge_strategy == "scd2": - x_row_version_col = get_first_column_name_with_prop( - schema.get_table(table_name), - "x-row-version", - include_incomplete=True, - ) - if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): - return "row_hash" - return "random" diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index 5ae5847963..9953d25913 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -45,6 +45,8 @@ def make_path(self, *identifiers: str) -> str: def break_path(self, path: str) -> Sequence[str]: """Breaks path into sequence of identifiers""" + # TODO: this is no longer needed if we modify all naming convention to do not contract + # underscores then also normalize_path will not be needed return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] def normalize_path(self, path: str) -> str: diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 9840f3a4d2..16ad097fde 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -18,5 +18,7 @@ class TJSONNormalizer(TypedDict, total=False): class TNormalizersConfig(TypedDict, total=False): names: str allow_identifier_change_on_table_with_data: Optional[bool] + use_break_path_on_normalize: Optional[bool] + """Post 1.4.0 to allow table and column names that contain table separators""" detections: Optional[List[str]] json: TJSONNormalizer diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index bfdd547d70..c612c5a4f1 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -1,7 +1,13 @@ import ast import inspect -import astunparse -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, Callable + +try: + import astunparse + + ast_unparse: Callable[[ast.AST], str] = astunparse.unparse +except ImportError: + ast_unparse = ast.unparse # type: ignore[attr-defined, unused-ignore] from dlt.common.typing import AnyFun @@ -25,7 +31,7 @@ def get_literal_defaults(node: Union[ast.FunctionDef, 
ast.AsyncFunctionDef]) -> literal_defaults: Dict[str, str] = {} for arg, default in zip(reversed(args), reversed(defaults)): if default: - literal_defaults[str(arg.arg)] = astunparse.unparse(default).strip() + literal_defaults[str(arg.arg)] = ast_unparse(default).strip() return literal_defaults @@ -84,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # do we have a line changed - if last_line != node.lineno - 1: # type: ignore[attr-defined] + if last_line != node.lineno - 1: # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # replace node value - script_lines.append(astunparse.unparse(t_value).strip()) - last_line = node.end_lineno - 1 # type: ignore[attr-defined] - last_offset = node.end_col_offset # type: ignore[attr-defined] + script_lines.append(ast_unparse(t_value).strip()) + last_line = node.end_lineno - 1 + last_offset = node.end_col_offset # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/configuration.py b/dlt/common/schema/configuration.py index e64dd57494..72f79026da 100644 --- a/dlt/common/schema/configuration.py +++ b/dlt/common/schema/configuration.py @@ -14,3 +14,5 @@ class SchemaConfiguration(BaseConfiguration): naming: Optional[TNamingConventionReferenceArg] = None # Union[str, NamingConvention] json_normalizer: Optional[DictStrAny] = None allow_identifier_change_on_table_with_data: Optional[bool] = None + use_break_path_on_normalize: Optional[bool] = None + """Post 1.4.0 to allow table and column names that contain table separators""" diff --git a/dlt/common/schema/migrations.py b/dlt/common/schema/migrations.py index d9e758f204..06eb35c0f6 100644 --- a/dlt/common/schema/migrations.py +++ b/dlt/common/schema/migrations.py @@ -29,13 +29,13 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> schema_dict["excludes"] = [] from_engine = 2 if from_engine == 2 and to_engine > 2: - from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers + from dlt.common.schema.normalizers import import_normalizers, configured_normalizers # current version of the schema current = cast(TStoredSchema, schema_dict) # add default normalizers and root hash propagation # use explicit None to get default settings. 
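The `astunparse` / `ast.unparse` fallback added to `dlt/common/reflection/utils.py` above keeps the extra dependency only where it is still needed: `ast.unparse` ships with Python 3.9+, while `astunparse` covers 3.8 (matching the `python = "<3.9"` marker later in `pyproject.toml`). A small usage sketch of the fallback:

```py
import ast
from typing import Callable

try:
    import astunparse

    ast_unparse: Callable[[ast.AST], str] = astunparse.unparse
except ImportError:
    ast_unparse = ast.unparse  # available since Python 3.9

fn = ast.parse("def f(x=1 + 2): ...").body[0]
default = fn.args.defaults[0]
# astunparse wraps the output in newlines and parentheses, ast.unparse does not,
# which is why callers such as get_literal_defaults strip the result
print(ast_unparse(default).strip())  # "(1 + 2)" with astunparse, "1 + 2" with ast.unparse
```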
ignore any naming conventions - normalizers = explicit_normalizers(naming=None, json_normalizer=None) + normalizers = configured_normalizers(naming=None, json_normalizer=None) current["normalizers"], _, _ = import_normalizers(normalizers, normalizers) current["normalizers"]["json"]["config"] = { "propagation": {"root": {"_dlt_id": "_dlt_root_id"}} @@ -169,6 +169,9 @@ def migrate_filters(group: str, filters: List[str]) -> None: json_config.pop("generate_dlt_id", None) from_engine = 10 + if from_engine == 10 and to_engine > 10: + schema_dict["normalizers"]["use_break_path_on_normalize"] = False + from_engine = 11 schema_dict["engine_version"] = from_engine if from_engine != to_engine: diff --git a/dlt/common/schema/normalizers.py b/dlt/common/schema/normalizers.py index 9b2a37e708..8f42e90596 100644 --- a/dlt/common/schema/normalizers.py +++ b/dlt/common/schema/normalizers.py @@ -40,13 +40,14 @@ def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: @with_config(spec=SchemaConfiguration, sections=_section_for_schema) # type: ignore[call-overload] -def explicit_normalizers( +def configured_normalizers( naming: TNamingConventionReferenceArg = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value, allow_identifier_change_on_table_with_data: bool = None, + use_break_path_on_normalize: Optional[bool] = None, schema_name: Optional[str] = None, ) -> TNormalizersConfig: - """Gets explicitly configured normalizers without any defaults or capabilities injection. If `naming` + """Gets explicitly onfigured normalizers without any defaults or capabilities injection. If `naming` is a module or a type it will get converted into string form via import. If `schema_name` is present, a section ("sources", schema_name, "schema") is used to inject the config @@ -57,6 +58,8 @@ def explicit_normalizers( norm_conf["allow_identifier_change_on_table_with_data"] = ( allow_identifier_change_on_table_with_data ) + if use_break_path_on_normalize is not None: + norm_conf["use_break_path_on_normalize"] = use_break_path_on_normalize return norm_conf diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 5e014e1cde..d6031a08fa 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -57,7 +57,7 @@ SchemaCorruptedException, TableIdentifiersFrozen, ) -from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers +from dlt.common.schema.normalizers import import_normalizers, configured_normalizers from dlt.common.schema.exceptions import DataValidationError from dlt.common.validation import validate_dict @@ -439,7 +439,8 @@ def update_schema(self, schema: "Schema") -> None: """Updates this schema from an incoming schema. Normalizes identifiers after updating normalizers.""" # pass normalizer config self._settings = deepcopy(schema.settings) - self._configure_normalizers(schema._normalizers_config) + # make shallow copy of normalizer settings + self._configure_normalizers(copy(schema._normalizers_config)) self._compile_settings() # update all tables for table in schema.tables.values(): @@ -524,7 +525,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. 
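The new engine v10 → v11 step above only stamps `use_break_path_on_normalize: False` onto schemas coming from older `dlt` versions, so their identifiers keep normalizing exactly as before. A reduced sketch of that stepwise migration pattern (simplified to the single new step; the real `migrate_schema` chains many versions):

```py
from typing import Any, Dict

def migrate_schema(schema_dict: Dict[str, Any], from_engine: int, to_engine: int) -> Dict[str, Any]:
    if from_engine == 10 and to_engine > 10:
        # legacy schemas keep the old, path-contracting normalization
        schema_dict["normalizers"]["use_break_path_on_normalize"] = False
        from_engine = 11
    schema_dict["engine_version"] = from_engine
    return schema_dict

legacy = {
    "engine_version": 10,
    "normalizers": {"names": "dlt.common.normalizers.names.snake_case"},
}
migrated = migrate_schema(legacy, 10, 11)
assert migrated["engine_version"] == 11
assert migrated["normalizers"]["use_break_path_on_normalize"] is False
```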
Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } @@ -753,7 +754,7 @@ def update_normalizers(self) -> None: Default hints, preferred data types and normalize configs (ie. column propagation) are normalized as well. Regexes are included as long as textual parts can be extracted from an expression. """ - self._configure_normalizers(explicit_normalizers(schema_name=self._schema_name)) + self._configure_normalizers(configured_normalizers(schema_name=self._schema_name)) self._compile_settings() def will_update_normalizers(self) -> bool: @@ -761,7 +762,7 @@ def will_update_normalizers(self) -> bool: # import desired modules _, to_naming, _ = import_normalizers( - explicit_normalizers(schema_name=self._schema_name), self._normalizers_config + configured_normalizers(schema_name=self._schema_name), self._normalizers_config ) return type(to_naming) is not type(self.naming) # noqa @@ -1106,13 +1107,13 @@ def _verify_identifiers(table: TTableSchema, norm_table: TTableSchema) -> None: else: return self._schema_tables - def _renormalize_schema_identifiers( + def _replace_and_apply_naming( self, normalizers_config: TNormalizersConfig, to_naming: NamingConvention, from_naming: NamingConvention, ) -> None: - """Normalizes all identifiers in the schema in place""" + """Normalizes all identifiers in the schema in place according to `to_naming`""" self._schema_tables = self._verify_update_normalizers( normalizers_config, to_naming, from_naming ) @@ -1140,10 +1141,19 @@ def _renormalize_schema_identifiers( def _configure_normalizers(self, explicit_normalizers: TNormalizersConfig) -> None: """Gets naming and item normalizer from schema yaml, config providers and destination capabilities and applies them to schema.""" + # preserve current schema settings if not explicitly set in `explicit_normalizers` + if explicit_normalizers and self._normalizers_config: + for prop_ in [ + "use_break_path_on_normalize", + "allow_identifier_change_on_table_with_data", + ]: + if prop_ in self._normalizers_config and prop_ not in explicit_normalizers: + explicit_normalizers[prop_] = self._normalizers_config[prop_] # type: ignore[literal-required] + normalizers_config, to_naming, item_normalizer_class = import_normalizers( explicit_normalizers, self._normalizers_config ) - self._renormalize_schema_identifiers(normalizers_config, to_naming, self.naming) + self._replace_and_apply_naming(normalizers_config, to_naming, self.naming) # data item normalization function self.data_item_normalizer = item_normalizer_class(self) self.data_item_normalizer.extend_schema() @@ -1174,7 +1184,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_hints() # configure normalizers, including custom config if present if not normalizers: - normalizers = explicit_normalizers(schema_name=self._schema_name) + normalizers = configured_normalizers(schema_name=self._schema_name) self._configure_normalizers(normalizers) # add version tables self._add_standard_tables() diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index c8f5de03ed..6f5d6213c9 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -28,7 +28,7 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 10 
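The `casefold_f` selection in `get_new_table_columns` above (the reason for the added `type: ignore[assignment]`) picks either an identity function or `str.casefold`, depending on destination case sensitivity. A small runnable sketch of the idiom; the helper name is made up:

```py
from typing import Callable, Dict

def index_existing_columns(existing_columns: Dict[str, dict], case_sensitive: bool) -> Dict[str, dict]:
    # case-insensitive destinations: compare identifiers casefolded;
    # case-sensitive destinations: compare as-is (str acts as the identity)
    casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str
    return {casefold_f(name): col for name, col in existing_columns.items()}

existing = {"ID": {"data_type": "bigint"}}
assert "id" in index_existing_columns(existing, case_sensitive=False)
assert "ID" in index_existing_columns(existing, case_sensitive=True)
```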
+SCHEMA_ENGINE_VERSION = 11 # dlt tables VERSION_TABLE_NAME = "_dlt_version" diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 8986d753f3..a3364d1b07 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) # type: ignore[assignment] + orig_param_type = type(sample_value) return orig_param_type # type: ignore diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 3ff23c9bae..58e1dbd824 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -647,3 +647,8 @@ def is_typeerror_due_to_wrong_call(exc: Exception, func: AnyFun) -> bool: func_name = func.__name__ message = str(exc) return message.__contains__(f"{func_name}()") + + +removeprefix = getattr( + str, "removeprefix", lambda s_, p_: s_[len(p_) :] if s_.startswith(p_) else s_ +) diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py index 411c876c19..27a7f5a7af 100644 --- a/dlt/destinations/dataset.py +++ b/dlt/destinations/dataset.py @@ -3,12 +3,8 @@ from contextlib import contextmanager from dlt import version - from dlt.common.json import json - -from dlt.common.normalizers.naming.naming import NamingConvention from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination import AnyDestination from dlt.common.destination.reference import ( SupportsReadableRelation, @@ -109,7 +105,7 @@ def query(self) -> Any: return self._provided_query table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_path(self._table_name) + self.schema.naming.normalize_tables_path(self._table_name) ) maybe_limit_clause_1 = "" diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index 00f35da082..a6c4ee0458 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -28,6 +28,7 @@ from dlt.common import logger from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.typing import DictStrAny +from dlt.common.utils import removeprefix from dlt.destinations.exceptions import ( DatabaseUndefinedRelation, @@ -88,9 +89,8 @@ def has_dataset(self) -> bool: sentinel_table = self.config.dataset_sentinel_table_name all_ds_tables = self._list_tables() if self.dataset_name: - return sentinel_table in [ - t.split(self.config.dataset_table_separator)[1] for t in all_ds_tables - ] + prefix = self.dataset_name + self.config.dataset_table_separator + return sentinel_table in [removeprefix(t, prefix) for t in all_ds_tables] else: # if no dataset specified we look for sentinel table return sentinel_table in all_ds_tables diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index db46f09caf..ccf764811b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -37,7 +37,7 @@ TPipelineStateDoc, load_package as current_load_package, ) -from dlt.destinations.sql_client import DBApiCursor, WithSqlClient, SqlClientBase +from dlt.destinations.sql_client import WithSqlClient, SqlClientBase from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJobRequest, @@ -63,7 +63,6 @@ from dlt.destinations.impl.filesystem.configuration import 
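The `removeprefix` backport added to `dlt/common/utils.py` above lets the ClickHouse sentinel-table check strip the `dataset + separator` prefix instead of splitting on the separator, which returned the wrong fragment whenever a dataset or table name itself contained the separator. A quick illustration (dataset, separator and table names below are made up):

```py
# same shim as in dlt.common.utils: native str.removeprefix on Python 3.9+,
# a slicing lambda otherwise
removeprefix = getattr(
    str, "removeprefix", lambda s_, p_: s_[len(p_):] if s_.startswith(p_) else s_
)

dataset, sep = "my_dataset", "___"
tables = ["my_dataset___dlt_sentinel", "my_dataset___orders___items"]
prefix = dataset + sep

print([removeprefix(t, prefix) for t in tables])
# ['dlt_sentinel', 'orders___items']
# the old lookup used t.split(sep)[1], which yields only 'orders'
# for the second table
```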
FilesystemDestinationClientConfiguration from dlt.destinations import path_utils from dlt.destinations.fs_client import FSClientBase -from dlt.destinations.dataset import ReadableDBAPIDataset from dlt.destinations.utils import verify_schema_merge_disposition INIT_FILE_NAME = "init" diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 41d3035a9f..03f8a31462 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -18,6 +18,8 @@ TTableSchemaColumns, TPartialTableSchema, ) +from dlt.common.normalizers.json import helpers as normalize_helpers + from dlt.extract.hints import HintsMeta, TResourceHints from dlt.extract.resource import DltResource from dlt.extract.items import DataItemWithMeta, TableNameMeta @@ -141,7 +143,9 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No self._write_to_dynamic_table(resource, items, meta) def write_empty_items_file(self, table_name: str) -> None: - table_name = self.naming.normalize_table_identifier(table_name) + table_name = normalize_helpers.normalize_table_identifier( + self.schema, self.naming, table_name + ) self.item_storage.write_empty_items_file(self.load_id, self.schema.name, table_name, None) def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]: @@ -151,10 +155,12 @@ def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[s table_name = meta.table_name else: table_name = resource.table_name # type: ignore[assignment] - return self.naming.normalize_table_identifier(table_name) + return normalize_helpers.normalize_table_identifier(self.schema, self.naming, table_name) def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: - return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + return normalize_helpers.normalize_table_identifier( + self.schema, self.naming, resource._table_name_hint_fun(item) + ) def _write_item( self, @@ -322,7 +328,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No ) for tbl in ( ( - # 1. Convert pandas frame(s) to arrow Table + # 1. 
Convert pandas frame(s) to arrow Table, remove indexes because we store pandas_to_arrow(item) if (pandas and isinstance(item, pandas.DataFrame)) else item diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index dfafa2cd11..ee102a9961 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment] diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 53a856f7d0..5eccdf5433 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -20,6 +20,7 @@ ParsedLoadJobFileName, ) from dlt.common.schema import TSchemaUpdate, Schema +from dlt.common.normalizers.json import helpers as normalize_helpers from dlt.normalize.configuration import NormalizeConfiguration from dlt.normalize.exceptions import NormalizeJobFailed @@ -218,8 +219,8 @@ def _gather_metrics_and_close( parsed_file_name = ParsedLoadJobFileName.parse(extracted_items_file) # normalize table name in case the normalization changed # NOTE: this is the best we can do, until a full lineage information is in the schema - root_table_name = schema.naming.normalize_table_identifier( - parsed_file_name.table_name + root_table_name = normalize_helpers.normalize_table_identifier( + schema, schema.naming, parsed_file_name.table_name ) root_tables.add(root_table_name) root_table = stored_schema["tables"].get(root_table_name, {"name": root_table_name}) diff --git a/dlt/reflection/script_visitor.py b/dlt/reflection/script_visitor.py index f4a5569ed0..c49fed20ab 100644 --- a/dlt/reflection/script_visitor.py +++ b/dlt/reflection/script_visitor.py @@ -1,10 +1,9 @@ import inspect import ast -import astunparse from ast import NodeVisitor from typing import Any, Dict, List -from dlt.common.reflection.utils import find_outer_func_def +from dlt.common.reflection.utils import find_outer_func_def, ast_unparse import dlt.reflection.names as n @@ -68,9 +67,9 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: for deco in node.decorator_list: # decorators can be function calls, attributes or names if isinstance(deco, (ast.Name, ast.Attribute)): - alias_name = astunparse.unparse(deco).strip() + alias_name = ast_unparse(deco).strip() elif isinstance(deco, ast.Call): - alias_name = astunparse.unparse(deco.func).strip() + alias_name = ast_unparse(deco.func).strip() else: raise ValueError( self.source_segment(deco), type(deco), "Unknown decorator form" @@ -87,7 +86,7 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: def visit_Call(self, node: ast.Call) -> Any: if self._curr_pass == 2: # check if this is a call to any of known functions - alias_name = astunparse.unparse(node.func).strip() + alias_name = ast_unparse(node.func).strip() fn = self.func_aliases.get(alias_name) if not fn: # try a fallback to "run" function that may be called on pipeline or source diff --git a/dlt/sources/sql_database/arrow_helpers.py b/dlt/sources/sql_database/arrow_helpers.py index 1f72205a2a..1de9dffc87 100644 --- a/dlt/sources/sql_database/arrow_helpers.py +++ b/dlt/sources/sql_database/arrow_helpers.py @@ 
-4,9 +4,6 @@ from dlt.common.configuration import with_config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.libs.pyarrow import ( - row_tuples_to_arrow as _row_tuples_to_arrow, -) @@ -20,6 +17,8 @@ def row_tuples_to_arrow( is always the case if run within the pipeline. This will generate arrow schema compatible with the destination. Otherwise generic capabilities are used """ + from dlt.common.libs.pyarrow import row_tuples_to_arrow as _row_tuples_to_arrow + return _row_tuples_to_arrow( rows, caps or DestinationCapabilitiesContext.generic_capabilities(), columns, tz ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index efb39cc58e..de3d12e8e1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -184,9 +184,7 @@ bucket_url = "abfss://@.dfs.core.windows.n You can use `az`, `abfss`, `azure` and `abfs` url schemes. -#### Custom host - -If you need to use a custom host to account your storage account you can set it up like below: +If you need to use a custom host for your storage account, you can set it up like below: ```toml [destination.filesystem.credentials] # The storage account name is always required diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 11d4382a22..fa5cf7b128 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -39,6 +39,8 @@ pipeline = dlt.pipeline("orders_pipeline", destination="snowflake") pipeline.run(df, table_name="orders") ``` +Note that Pandas indexes are not saved by default (as of `dlt` version 1.4.1). If you do need the index at the destination, +use `Table.from_pandas` with `preserve_index` set to `True` to explicitly convert the dataframe into an Arrow table. A `pyarrow` table can be loaded in the same way: diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index f1766d1797..c10ac3e3d0 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -69,6 +69,45 @@ Note that many destinations are exclusively case-insensitive, of which some pres ### Identifier shortening Identifier shortening happens during normalization. `dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This (with a high probability) avoids shortened identifier collisions. +### Compound (flattened) identifiers +`dlt` combines several identifiers in order to name nested tables and flattened columns. For example: +```json +{ + "column": + { + "value": 1 + } +} +``` +generates the flattened column name `column__value`, where `__` is the path separator (in **snake case**). Each component of the combined identifier is normalized +separately, and the result is shortened as a whole. + +:::note +A combined identifier is also a valid single identifier. In +`dlt` versions above 1.4.0, normalization is fully idempotent and a normalized +`column__value` will still be `column__value`.
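Tying back to the arrow-pandas note above: when the dataframe index does need to reach the destination, the conversion can be made explicit. A sketch with arbitrary pipeline and table names; `preserve_index=True` is standard `pyarrow` behavior that keeps the index as a regular column:

```py
import dlt
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"order_id": [1, 2], "amount": [10.5, 7.0]}).set_index("order_id")

pipeline = dlt.pipeline("orders_pipeline", destination="duckdb")
# preserve_index=True turns the pandas index into an ordinary Arrow column,
# so it is loaded like any other field
pipeline.run(pa.Table.from_pandas(df, preserve_index=True), table_name="orders")
```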
+::: + +:::caution +Previously double underscores were contracted into single underscore. That +prevented using data loaded by `dlt` as a data source without identifier modifications. `dlt` maintains backward compatibility for version >1.4.0 as follows: + +* All schemas stored locally or at destination will be migrated to backward compatible mode by setting a flag `use_break_path_on_normalize` ie.: +```yaml +normalizers: + names: dlt.common.normalizers.names.snake_case + use_break_path_on_normalize: true + json: + module: dlt.common.normalizers.json.relational +``` +* Backward compatible behavior may be explicitly enabled by setting +`SCHEMA__USE_BREAK_PATH_ON_NORMALIZE` to `TRUE` or via `config.toml`: +```toml +[schema] +use_break_path_on_normalize=true +``` +::: + ### 🚧 [WIP] Name convention changes are lossy `dlt` does not store the source identifiers in the schema so when the naming convention changes (or we increase the maximum identifier length), it is not able to generate a fully correct set of new identifiers. Instead, it will re-normalize already normalized identifiers. We are currently working to store the full identifier lineage - source identifiers will be stored and mapped to the destination in the schema. diff --git a/mypy.ini b/mypy.ini index eee4db6126..769e84b13a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -134,4 +134,4 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-time_machine.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index ccbec5bd8f..6e9eabe109 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10716,4 +10716,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "76b04f900df1025a0a530825b5ee36dab2897ae490c34d77ccf90ce7697249de" +content-hash = "b6e3fdf3464e70df7086c1aeb12fc55e240c619491ccfff14fab769a9da94c43" diff --git a/pyproject.toml b/pyproject.toml index d62e5f4036..f9fc80fdbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "1.4.1a0" +version = "1.4.1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -40,7 +40,7 @@ click = ">=7.1" requirements-parser = ">=0.5.0" setuptools = ">=65.6.0" humanize = ">=4.4.0" -astunparse = ">=1.6.3" +astunparse = { "version" = ">=1.6.3", "python" = "<3.9"} gitpython = ">=3.1.29" pytz = ">=2022.6" giturlparse = ">=0.10.0" diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v11.yml b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml new file mode 100644 index 0000000000..fd6717c614 --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml @@ -0,0 +1,394 @@ +version: 18 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 +name: ethereum +tables: + _dlt_loads: + columns: + load_id: + nullable: false + data_type: text + schema_name: + nullable: true + data_type: text + status: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_version_hash: + nullable: true + data_type: text + write_disposition: skip + description: Created by DLT. 
Tracks completed loads + schema_contract: {} + resource: _dlt_loads + _dlt_version: + columns: + version: + nullable: false + data_type: bigint + engine_version: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_name: + nullable: false + data_type: text + version_hash: + nullable: false + data_type: text + schema: + nullable: false + data_type: text + write_disposition: skip + description: Created by DLT. Tracks schema updates + schema_contract: {} + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + description: load id coming from the extractor + data_type: text + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + number: + nullable: false + primary_key: true + data_type: bigint + parent_hash: + nullable: true + data_type: text + hash: + nullable: false + cluster: true + unique: true + data_type: text + base_fee_per_gas: + nullable: false + data_type: wei + difficulty: + nullable: false + data_type: wei + extra_data: + nullable: true + data_type: text + gas_limit: + nullable: false + data_type: bigint + gas_used: + nullable: false + data_type: bigint + logs_bloom: + nullable: true + data_type: binary + miner: + nullable: true + data_type: text + mix_hash: + nullable: true + data_type: text + nonce: + nullable: true + data_type: text + receipts_root: + nullable: true + data_type: text + sha3_uncles: + nullable: true + data_type: text + size: + nullable: true + data_type: bigint + state_root: + nullable: false + data_type: text + timestamp: + nullable: false + unique: true + sort: true + data_type: timestamp + total_difficulty: + nullable: true + data_type: wei + transactions_root: + nullable: false + data_type: text + schema_contract: {} + resource: blocks + x-normalizer: + seen-data: true + blocks__transactions: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + block_number: + nullable: false + primary_key: true + data_type: bigint + merge_key: true + transaction_index: + nullable: false + primary_key: true + data_type: bigint + hash: + nullable: false + unique: true + data_type: text + block_hash: + nullable: false + cluster: true + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + chain_id: + nullable: true + data_type: text + from: + nullable: true + data_type: text + gas: + nullable: true + data_type: bigint + gas_price: + nullable: true + data_type: bigint + input: + nullable: true + data_type: text + max_fee_per_gas: + nullable: true + data_type: wei + max_priority_fee_per_gas: + nullable: true + data_type: wei + nonce: + nullable: true + data_type: bigint + r: + nullable: true + data_type: text + s: + nullable: true + data_type: text + status: + nullable: true + data_type: bigint + to: + nullable: true + data_type: text + type: + nullable: true + data_type: text + v: + nullable: true + data_type: bigint + value: + nullable: false + data_type: wei + eth_value: + nullable: true + data_type: decimal + x-normalizer: + seen-data: true + write_disposition: append + resource: blocks__transactions + blocks__transactions__logs: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + address: + nullable: false + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + block_hash: + nullable: 
false + cluster: true + data_type: text + block_number: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + transaction_index: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + log_index: + nullable: false + primary_key: true + data_type: bigint + data: + nullable: true + data_type: text + removed: + nullable: true + data_type: bool + transaction_hash: + nullable: false + data_type: text + x-normalizer: + seen-data: true + write_disposition: append + resource: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + address: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true +settings: + default_hints: + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + row_key: + - _dlt_id + parent_key: + - _dlt_parent_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_contract: {} +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + propagation: + root: + _dlt_id: _dlt_root_id + tables: + blocks: + timestamp: block_timestamp + hash: block_hash +previous_hashes: +- oHfYGTI2GHOxuzwVz6+yvMilXUvHYhxrxkanC2T6MAI= +- C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE= +- yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= + diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json index 4c4f5425ae..5a1b0c6f84 100644 --- a/tests/common/cases/schemas/github/issues.schema.json +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -1,1322 +1,1100 @@ { - "version": 2, - "version_hash": "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=", - "engine_version": 5, - "name": "event", - "tables": { - "_dlt_version": { - "name": "_dlt_version", - "columns": { 
- "version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version", - "data_type": "bigint", - "nullable": false - }, - "engine_version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "engine_version", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": false - }, - "version_hash": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version_hash", - "data_type": "text", - "nullable": false - }, - "schema": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema", - "data_type": "text", - "nullable": false - } + "version": 3, + "version_hash": "o6olKmaCAQVWDWR4eT4aZ1V/RiH+003516xq7Zrva+Q=", + "engine_version": 11, + "name": "event", + "tables": { + "_dlt_version": { + "columns": { + "version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "engine_version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": false + }, + "schema_name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "version_hash": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "schema": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + } + }, + "write_disposition": "skip", + "description": "Created by DLT. Tracks schema updates", + "schema_contract": {}, + "resource": "_dlt_version" }, - "write_disposition": "skip", - "description": "Created by DLT. 
Tracks schema updates" - }, - "_dlt_loads": { - "name": "_dlt_loads", - "columns": { - "load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "load_id", - "data_type": "text", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": true - }, - "status": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "status", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - } + "_dlt_loads": { + "columns": { + "load_id": { + "data_type": "text", + "nullable": false + }, + "schema_name": { + "data_type": "text", + "nullable": true + }, + "status": { + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "data_type": "timestamp", + "nullable": false + }, + "schema_version_hash": { + "data_type": "text", + "nullable": true + } + }, + "write_disposition": "skip", + "resource": "_dlt_loads", + "description": "Created by DLT. Tracks completed loads", + "schema_contract": {} }, - "write_disposition": "skip", - "description": "Created by DLT. Tracks completed loads" - }, - "issues": { - "name": "issues", - "columns": { - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "repository_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repository_url", - "data_type": "text", - "nullable": true - }, - "labels_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "labels_url", - "data_type": "text", - "nullable": true - }, - "comments_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "number": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "number", - "data_type": "bigint", - "nullable": true - }, - "title": { - "partition": false, - 
"cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "title", - "data_type": "text", - "nullable": true - }, - "user__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__login", - "data_type": "text", - "nullable": true - }, - "user__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__id", - "data_type": "bigint", - "nullable": true - }, - "user__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__node_id", - "data_type": "text", - "nullable": true - }, - "user__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__avatar_url", - "data_type": "text", - "nullable": true - }, - "user__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gravatar_id", - "data_type": "text", - "nullable": true - }, - "user__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__url", - "data_type": "text", - "nullable": true - }, - "user__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__html_url", - "data_type": "text", - "nullable": true - }, - "user__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__followers_url", - "data_type": "text", - "nullable": true - }, - "user__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__following_url", - "data_type": "text", - "nullable": true - }, - "user__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gists_url", - "data_type": "text", - "nullable": true - }, - "user__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__starred_url", - "data_type": "text", - "nullable": true - }, - "user__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "user__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__organizations_url", - "data_type": "text", - "nullable": true - }, - "user__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__repos_url", - "data_type": "text", - "nullable": true - }, - "user__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__events_url", - "data_type": "text", - "nullable": 
true - }, - "user__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__received_events_url", - "data_type": "text", - "nullable": true - }, - "user__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__type", - "data_type": "text", - "nullable": true - }, - "user__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__site_admin", - "data_type": "bool", - "nullable": true - }, - "state": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state", - "data_type": "text", - "nullable": true - }, - "locked": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "locked", - "data_type": "bool", - "nullable": true - }, - "assignee__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__login", - "data_type": "text", - "nullable": true - }, - "assignee__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__id", - "data_type": "bigint", - "nullable": true - }, - "assignee__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__node_id", - "data_type": "text", - "nullable": true - }, - "assignee__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__avatar_url", - "data_type": "text", - "nullable": true - }, - "assignee__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gravatar_id", - "data_type": "text", - "nullable": true - }, - "assignee__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__url", - "data_type": "text", - "nullable": true - }, - "assignee__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__html_url", - "data_type": "text", - "nullable": true - }, - "assignee__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__followers_url", - "data_type": "text", - "nullable": true - }, - "assignee__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__following_url", - "data_type": "text", - "nullable": true - }, - "assignee__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gists_url", - "data_type": "text", - "nullable": true - }, - "assignee__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, 
- "foreign_key": false, - "name": "assignee__starred_url", - "data_type": "text", - "nullable": true - }, - "assignee__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "assignee__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__organizations_url", - "data_type": "text", - "nullable": true - }, - "assignee__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__repos_url", - "data_type": "text", - "nullable": true - }, - "assignee__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__events_url", - "data_type": "text", - "nullable": true - }, - "assignee__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__received_events_url", - "data_type": "text", - "nullable": true - }, - "assignee__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__type", - "data_type": "text", - "nullable": true - }, - "assignee__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__site_admin", - "data_type": "bool", - "nullable": true - }, - "comments": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments", - "data_type": "bigint", - "nullable": true - }, - "created_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "created_at", - "data_type": "timestamp", - "nullable": true - }, - "updated_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "updated_at", - "data_type": "timestamp", - "nullable": true - }, - "closed_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "closed_at", - "data_type": "timestamp", - "nullable": true - }, - "author_association": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "author_association", - "data_type": "text", - "nullable": true - }, - "body": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "body", - "data_type": "text", - "nullable": true - }, - "reactions__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__url", - "data_type": "text", - "nullable": true - }, - "reactions__total_count": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__total_count", - "data_type": "bigint", - "nullable": true - }, - 
"reactions___1": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions___1", - "data_type": "bigint", - "nullable": true - }, - "reactions__laugh": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__laugh", - "data_type": "bigint", - "nullable": true - }, - "reactions__hooray": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__hooray", - "data_type": "bigint", - "nullable": true - }, - "reactions__confused": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__confused", - "data_type": "bigint", - "nullable": true - }, - "reactions__heart": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__heart", - "data_type": "bigint", - "nullable": true - }, - "reactions__rocket": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__rocket", - "data_type": "bigint", - "nullable": true - }, - "reactions__eyes": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__eyes", - "data_type": "bigint", - "nullable": true - }, - "timeline_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "timeline_url", - "data_type": "text", - "nullable": true - }, - "state_reason": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state_reason", - "data_type": "text", - "nullable": true - }, - "_dlt_load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_load_id", - "data_type": "text", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false - }, - "draft": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "draft", - "data_type": "bool", - "nullable": true - }, - "pull_request__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__url", - "data_type": "text", - "nullable": true - }, - "pull_request__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__html_url", - "data_type": "text", - "nullable": true - }, - "pull_request__diff_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__diff_url", - "data_type": "text", - "nullable": true - }, - "pull_request__patch_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"pull_request__patch_url", - "data_type": "text", - "nullable": true - }, - "pull_request__merged_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__merged_at", - "data_type": "timestamp", - "nullable": true - } + "issues": { + "columns": { + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repository_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "labels_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "comments_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "number": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "title": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "user__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + 
"user__gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "state": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "locked": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "assignee__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "assignee__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gists_url": { + "partition": false, + "cluster": false, + "unique": 
false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "comments": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "created_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "updated_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "closed_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "author_association": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "body": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__total_count": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions___1": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__laugh": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__hooray": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__confused": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + 
"data_type": "bigint", + "nullable": true + }, + "reactions__heart": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__rocket": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__eyes": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "timeline_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "state_reason": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_load_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + }, + "draft": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "pull_request__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__diff_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__patch_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__merged_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + } + }, + "write_disposition": "append", + "schema_contract": {}, + "x-normalizer": { + "seen-data": true + }, + "resource": "issues" }, - "write_disposition": "append" - }, - "issues__labels": { - "name": "issues__labels", - "columns": { - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "name", - "data_type": "text", - "nullable": true - }, - "color": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "color", - "data_type": "text", - 
"nullable": true - }, - "default": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "default", - "data_type": "bool", - "nullable": true - }, - "description": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "description", - "data_type": "text", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__labels": { + "columns": { + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "color": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "default": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "description": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } }, - "parent": "issues" - }, - "issues__assignees": { - "name": "issues__assignees", - "columns": { - "login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "login", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"node_id", - "data_type": "text", - "nullable": true - }, - "avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "avatar_url", - "data_type": "text", - "nullable": true - }, - "gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gravatar_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "followers_url", - "data_type": "text", - "nullable": true - }, - "following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "following_url", - "data_type": "text", - "nullable": true - }, - "gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gists_url", - "data_type": "text", - "nullable": true - }, - "starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "starred_url", - "data_type": "text", - "nullable": true - }, - "subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "subscriptions_url", - "data_type": "text", - "nullable": true - }, - "organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "organizations_url", - "data_type": "text", - "nullable": true - }, - "repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repos_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "received_events_url", - "data_type": "text", - "nullable": true - }, - "type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "type", - "data_type": "text", - "nullable": true - }, - "site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "site_admin", - "data_type": "bool", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { 
- "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__assignees": { + "columns": { + "login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + 
"primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } - }, - "parent": "issues" - } - }, - "settings": { - "detections": [ - "timestamp", - "iso_timestamp", - "iso_date" - ], - "default_hints": { - "not_null": [ - "_dlt_id", - "_dlt_root_id", - "_dlt_parent_id", - "_dlt_list_idx", - "_dlt_load_id" - ], - "foreign_key": [ - "_dlt_parent_id" + } + }, + "settings": { + "detections": [ + "timestamp", + "iso_timestamp", + "iso_date" ], - "unique": [ - "_dlt_id" - ] - } - }, - "normalizers": { - "names": "dlt.common.normalizers.names.snake_case", - "json": { - "module": "dlt.common.normalizers.json.relational" - } + "default_hints": { + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "_dlt_load_id" + ], + "unique": [ + "_dlt_id" + ], + "row_key": [ + "_dlt_id" + ], + "parent_key": [ + "_dlt_parent_id" + ] + }, + "schema_contract": {} + }, + "normalizers": { + "names": "dlt.common.normalizers.names.snake_case", + "json": { + "module": "dlt.common.normalizers.json.relational" + } + }, + "previous_hashes": [ + "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=" + ] } -} diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 1553cea04f..35bc80add2 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -6,14 +6,12 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.schema import Schema from dlt.common.schema.utils import new_table - +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES from dlt.common.normalizers.json.relational import ( RelationalNormalizerConfigPropagation, DataItemNormalizer as RelationalNormalizer, - DLT_ID_LENGTH_BYTES, ) - -# _flatten, _get_child_row_hash, _normalize_row, normalize_data_item, +from dlt.common.normalizers.json import helpers as normalize_helpers from tests.utils import create_schema_with_name @@ -420,7 +418,7 @@ def test_list_in_list() -> None: schema.update_table(path_table) assert "zen__webpath" in schema.tables # clear cache with json paths - schema.data_item_normalizer._is_nested_type.cache_clear() # type: ignore[attr-defined] + normalize_helpers.is_nested_type.cache_clear() rows = list(schema.normalize_data_item(chats, "1762162.1212", "zen")) # both lists are json types now @@ -890,7 +888,7 @@ def test_caching_perf(norm: RelationalNormalizer) -> None: table["x-normalizer"] = {} start = time() for _ in range(100000): - norm._is_nested_type(norm.schema, "test", "field", 0) + normalize_helpers.is_nested_type(norm.schema, "test", "field", 0) # norm._get_table_nesting_level(norm.schema, "test") print(f"{time() - start}") diff --git a/tests/common/normalizers/test_naming_snake_case.py b/tests/common/normalizers/test_naming_snake_case.py index ee4f43e7f0..e03de65696 100644 --- a/tests/common/normalizers/test_naming_snake_case.py +++ b/tests/common/normalizers/test_naming_snake_case.py @@ -50,6 +50,14 @@ def test_normalize_path(naming_unlimited: NamingConvention) -> None: assert naming_unlimited.normalize_path("Small Love Potion") 
== "small_love_potion" assert naming_unlimited.normalize_path("Small Love Potion") == "small_love_potion" + # paths with non normalized underscores + # NOTE: empty idents created during break path are removed so underscores are contracted + assert ( + naming_unlimited.normalize_path("Small___Love____Potion_____x") + == "small___love__potion___x" + ) + assert naming_unlimited.normalize_path("small___love__potion___x") == "small___love__potion___x" + def test_normalize_non_alpha_single_underscore() -> None: assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "-=!*") == "_" diff --git a/tests/common/schema/test_import_normalizers.py b/tests/common/schema/test_import_normalizers.py index a1e3d775f0..d444259946 100644 --- a/tests/common/schema/test_import_normalizers.py +++ b/tests/common/schema/test_import_normalizers.py @@ -16,7 +16,7 @@ ) from dlt.common.schema.normalizers import ( DEFAULT_NAMING_NAMESPACE, - explicit_normalizers, + configured_normalizers, import_normalizers, naming_from_reference, serialize_reference, @@ -26,25 +26,25 @@ def test_explicit_normalizers() -> None: - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] is None assert config["json"] is None # pass explicit - config = explicit_normalizers("direct", {"module": "custom"}) + config = configured_normalizers("direct", {"module": "custom"}) assert config["names"] == "direct" assert config["json"] == {"module": "custom"} # pass modules and types, make sure normalizer config is serialized - config = explicit_normalizers(direct) + config = configured_normalizers(direct) assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" - config = explicit_normalizers(direct.NamingConvention) + config = configured_normalizers(direct.NamingConvention) assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" # use environ os.environ["SCHEMA__NAMING"] = "direct" os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "custom"}' - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] == "direct" assert config["json"] == {"module": "custom"} @@ -54,7 +54,7 @@ def test_explicit_normalizers_caps_ignored() -> None: destination_caps = DestinationCapabilitiesContext.generic_capabilities() destination_caps.naming_convention = "direct" with Container().injectable_context(destination_caps): - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] is None @@ -121,7 +121,7 @@ def test_naming_from_reference() -> None: def test_import_normalizers() -> None: - config, naming, json_normalizer = import_normalizers(explicit_normalizers()) + config, naming, json_normalizer = import_normalizers(configured_normalizers()) assert isinstance(naming, snake_case.NamingConvention) # no maximum length: we do not know the destination capabilities assert naming.max_length is None @@ -133,7 +133,7 @@ def test_import_normalizers() -> None: os.environ["SCHEMA__JSON_NORMALIZER"] = ( '{"module": "tests.common.normalizers.custom_normalizers"}' ) - config, naming, json_normalizer = import_normalizers(explicit_normalizers()) + config, naming, json_normalizer = import_normalizers(configured_normalizers()) assert config["names"] == "direct" assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} assert isinstance(naming, direct.NamingConvention) @@ -142,7 +142,7 @@ def test_import_normalizers() -> None: def test_import_normalizers_with_defaults() -> None: - explicit = 
explicit_normalizers() + explicit = configured_normalizers() default_: TNormalizersConfig = { "names": "dlt.destinations.impl.weaviate.naming", "json": {"module": "tests.common.normalizers.custom_normalizers"}, @@ -170,7 +170,7 @@ def test_config_sections(sections: str) -> None: os.environ[f"{sections}SCHEMA__JSON_NORMALIZER"] = ( '{"module": "tests.common.normalizers.custom_normalizers"}' ) - config, _, _ = import_normalizers(explicit_normalizers(schema_name="test_schema")) + config, _, _ = import_normalizers(configured_normalizers(schema_name="test_schema")) assert config["names"] == "direct" assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} @@ -181,11 +181,11 @@ def test_import_normalizers_with_caps() -> None: destination_caps.naming_convention = "direct" destination_caps.max_identifier_length = 127 with Container().injectable_context(destination_caps): - _, naming, _ = import_normalizers(explicit_normalizers()) + _, naming, _ = import_normalizers(configured_normalizers()) assert isinstance(naming, direct.NamingConvention) assert naming.max_length == 127 - _, naming, _ = import_normalizers(explicit_normalizers(snake_case)) + _, naming, _ = import_normalizers(configured_normalizers(snake_case)) assert isinstance(naming, snake_case.NamingConvention) assert naming.max_length == 127 @@ -196,23 +196,23 @@ def test_import_normalizers_with_caps() -> None: } destination_caps.max_table_nesting = 0 with Container().injectable_context(destination_caps): - config, _, relational = import_normalizers(explicit_normalizers()) + config, _, relational = import_normalizers(configured_normalizers()) assert config["json"]["config"]["max_nesting"] == 0 assert relational is RelationalNormalizer # wrong normalizer - config, _, relational = import_normalizers(explicit_normalizers(), default_) + config, _, relational = import_normalizers(configured_normalizers(), default_) assert "config" not in config["json"] def test_import_invalid_naming_module() -> None: with pytest.raises(UnknownNamingModule) as py_ex: - import_normalizers(explicit_normalizers("unknown")) + import_normalizers(configured_normalizers("unknown")) assert py_ex.value.naming_module == "unknown" with pytest.raises(UnknownNamingModule) as py_ex: - import_normalizers(explicit_normalizers("dlt.common.tests")) + import_normalizers(configured_normalizers("dlt.common.tests")) assert py_ex.value.naming_module == "dlt.common.tests" with pytest.raises(InvalidNamingType) as py_ex2: - import_normalizers(explicit_normalizers("dlt.pipeline.helpers")) + import_normalizers(configured_normalizers("dlt.pipeline.helpers")) assert py_ex2.value.naming_module == "dlt.pipeline" assert py_ex2.value.naming_class == "helpers" diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py index f84d857e26..a1cb181525 100644 --- a/tests/common/schema/test_normalize_identifiers.py +++ b/tests/common/schema/test_normalize_identifiers.py @@ -271,12 +271,7 @@ def test_normalize_table_identifiers_table_reference() -> None: def test_update_normalizers() -> None: - schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") - schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] - # drop seen data - del schema.tables["issues"]["x-normalizer"] - del schema.tables["issues__labels"]["x-normalizer"] - del schema.tables["issues__assignees"]["x-normalizer"] + schema = make_issues_schema_for_normalizers_update() # save default hints in original form default_hints = 
schema._settings["default_hints"] @@ -307,8 +302,8 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.configuration.container import Container - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") - orig_schema = Schema.from_dict(eth_V9) + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + orig_schema = Schema.from_dict(eth_V11) # save schema schema_storage_no_import.save_schema(orig_schema) @@ -317,7 +312,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non ) as caps: assert caps.naming_convention is sql_upper # creating a schema from dict keeps original normalizers - schema = Schema.from_dict(eth_V9) + schema = Schema.from_dict(eth_V11) assert_schema_identifiers_case(schema, str.lower) assert schema._normalizers_config["names"].endswith("snake_case") @@ -350,7 +345,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non ) norm_schema = Schema.from_dict( - deepcopy(eth_V9), remove_processing_hints=True, bump_version=False + deepcopy(eth_V11), remove_processing_hints=True, bump_version=False ) norm_schema.update_normalizers() assert_schema_identifiers_case(norm_schema, str.upper) @@ -452,3 +447,50 @@ def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: assert schema.naming.break_path("A__B__!C") == ["A", "B", "!C"] row = list(schema.normalize_data_item({"bool": True}, "load_id", "a_table")) assert row[0] == (("a_table", None), {"bool": True}) + + +def test_update_schema_normalizer_props() -> None: + schema = make_issues_schema_for_normalizers_update() + schema_2 = make_issues_schema_for_normalizers_update() + # remove issues table + del schema_2._schema_tables["issues"] + schema_2.update_schema(schema) + + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper" + # apply normalizers + schema_2.update_normalizers() + + # preserve schema_2 str + schema_2_str = schema_2.to_pretty_json() + + # make sure that normalizer props in original schema are preserved + schema._normalizers_config["allow_identifier_change_on_table_with_data"] = True + schema._normalizers_config["use_break_path_on_normalize"] = True + + # set some fake naming convention. 
during schema update it should not be used + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper_X" + schema.update_schema(schema_2) + assert isinstance(schema.naming, sql_upper.NamingConvention) + assert_schema_identifiers_case(schema, str.upper) + # make sure norm setting still in schema + assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is True + assert schema._normalizers_config["use_break_path_on_normalize"] is True + # schema 2 not modified during the update + assert schema_2_str == schema_2.to_pretty_json() + + # make sure that explicit settings are passed + schema_2._normalizers_config["allow_identifier_change_on_table_with_data"] = False + schema_2._normalizers_config["use_break_path_on_normalize"] = False + schema.update_schema(schema_2) + assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is False + assert schema._normalizers_config["use_break_path_on_normalize"] is False + + +def make_issues_schema_for_normalizers_update() -> Schema: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + # drop seen data + del schema.tables["issues"]["x-normalizer"] + del schema.tables["issues__labels"]["x-normalizer"] + del schema.tables["issues__assignees"]["x-normalizer"] + return schema diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 7124ca5c80..5cdd42e448 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -570,8 +570,8 @@ def test_update_preferred_types(schema: Schema) -> None: def test_default_table_resource() -> None: """Parent tables without `resource` set default to table name""" - eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") - tables = Schema.from_dict(eth_v5).tables + eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11") + tables = Schema.from_dict(eth_v11).tables assert tables["blocks"]["resource"] == "blocks" assert all([t.get("resource") is None for t in tables.values() if t.get("parent")]) @@ -737,7 +737,7 @@ def assert_new_schema_props_custom_normalizers(schema: Schema) -> None: def assert_is_new_schema(schema: Schema) -> None: assert schema.stored_version is None assert schema.stored_version_hash is None - assert schema.ENGINE_VERSION == 10 + assert schema.ENGINE_VERSION == 11 assert schema._stored_previous_hashes == [] assert schema.is_modified assert schema.is_new @@ -845,9 +845,9 @@ def test_group_tables_by_resource(schema: Schema) -> None: def test_remove_processing_hints() -> None: - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") # here tables contain processing hints - schema = Schema.from_dict(eth_V9) + schema = Schema.from_dict(eth_V11) assert "x-normalizer" in schema.tables["blocks"] # clone with hints removal, note that clone does not bump version @@ -867,16 +867,10 @@ def test_remove_processing_hints() -> None: assert "x-normalizer" not in to_json # load without hints - no_hints = schema.from_dict(eth_V9, remove_processing_hints=True, bump_version=False) + no_hints = schema.from_dict(eth_V11, remove_processing_hints=True, bump_version=False) assert no_hints.stored_version_hash == cloned.stored_version_hash # now load without hints but with version bump cloned._bump_version() - no_hints = schema.from_dict(eth_V9, remove_processing_hints=True) + no_hints = schema.from_dict(eth_V11, remove_processing_hints=True) assert 
no_hints.stored_version_hash == cloned.stored_version_hash - - -# def test_get_new_table_columns() -> None: -# pytest.fail(reason="must implement!") -# pass -# get_new_table_columns() diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 39f1ad3211..1577b51115 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -86,10 +86,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v10: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v10") - version = eth_v10["version"] - version_hash = eth_v10["version_hash"] - schema = Schema.from_dict(eth_v10) # type: ignore[arg-type] + eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11") + version = eth_v11["version"] + version_hash = eth_v11["version_hash"] + schema = Schema.from_dict(eth_v11) # type: ignore[arg-type] # version should not be bumped assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash @@ -98,8 +98,8 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") - schema = Schema.from_dict(eth_v8) # type: ignore[arg-type] + eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11") + schema = Schema.from_dict(eth_v11) # type: ignore[arg-type] to_save_dict = schema.to_dict(remove_defaults=remove_defaults) assert schema.stored_version == to_save_dict["version"] diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 0dcf2930de..2818ea9622 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -3,7 +3,7 @@ import yaml from dlt.common import json -from dlt.common.schema.normalizers import explicit_normalizers +from dlt.common.schema.normalizers import configured_normalizers from dlt.common.schema.schema import Schema from dlt.common.storages.exceptions import ( InStorageSchemaModified, @@ -304,7 +304,7 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No def test_save_store_schema(storage: SchemaStorage) -> None: - d_n = explicit_normalizers() + d_n = configured_normalizers() d_n["names"] = "tests.common.normalizers.custom_normalizers" schema = Schema("column_event", normalizers=d_n) assert schema.is_new @@ -357,16 +357,16 @@ def test_save_initial_import_schema(ie_storage: LiveSchemaStorage) -> None: ie_storage.load_schema("ethereum") # save initial import schema where processing hints are removed - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") - schema = Schema.from_dict(eth_V9) + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + schema = Schema.from_dict(eth_V11) ie_storage.save_import_schema_if_not_exists(schema) # should be available now eth = ie_storage.load_schema("ethereum") assert "x-normalizer" not in eth.tables["blocks"] # won't overwrite initial schema - del eth_V9["tables"]["blocks__uncles"] - schema = Schema.from_dict(eth_V9) + del eth_V11["tables"]["blocks__uncles"] + schema = Schema.from_dict(eth_V11) ie_storage.save_import_schema_if_not_exists(schema) # should be available now eth = ie_storage.load_schema("ethereum") diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index a1334ba1da..5366d8b06f 100644 --- 
a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -218,9 +218,9 @@ def assert_package_info( def prepare_eth_import_folder(storage: SchemaStorage) -> Schema: - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V9, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) storage._export_schema(eth, storage.config.import_schema_path) return eth diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index e3098a1a77..9eeded1229 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -26,6 +26,7 @@ get_exception_trace, get_exception_trace_chain, update_dict_nested, + removeprefix, ) @@ -440,3 +441,11 @@ def _function_test(a, *, b=None): except Exception as exc: assert str(exc) == "wrong type" assert is_typeerror_due_to_wrong_call(exc, function_typeerror_exc) is False + + +def test_removeprefix() -> None: + assert removeprefix("a_data", "a_") == "data" + assert removeprefix("a_data", "a_data") == "" + assert removeprefix("a_data", "a_data_1") == "a_data" + assert removeprefix("", "a_data_1") == "" + assert removeprefix("a_data", "") == "a_data" diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index f3ebb02b46..6899d8d5fe 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -111,7 +111,7 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: with open( - "tests/common/cases/schemas/eth/ethereum_schema_v10.yml", mode="r", encoding="utf-8" + "tests/common/cases/schemas/eth/ethereum_schema_v11.yml", mode="r", encoding="utf-8" ) as f: schema_dict: TStoredSchema = yaml.safe_load(f) diff --git a/tests/common/utils.py b/tests/common/utils.py index 9b5e6bccce..a0760ffe86 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -19,11 +19,11 @@ def IMPORTED_VERSION_HASH_ETH_V10() -> str: # for import schema tests, change when upgrading the schema version - eth_V10 = load_yml_case("schemas/eth/ethereum_schema_v10") - assert eth_V10["version_hash"] == "veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos=" + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + assert eth_V11["version_hash"] == "XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI=" # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V10, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) return eth.stored_version_hash diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index d224088f8b..e20260bfe7 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,6 +1,6 @@ version: 18 -version_hash: veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos= -engine_version: 10 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 name: ethereum tables: _dlt_loads: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 5dc4304a63..a14b4a9602 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -112,9 +112,9 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == 
s.name # the schema in the associated file has this hash - eth_v9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11") # source removes processing hints so we do - reference_schema = Schema.from_dict(eth_v9, remove_processing_hints=True) + reference_schema = Schema.from_dict(eth_v11, remove_processing_hints=True) assert schema.stored_version_hash == reference_schema.stored_version_hash diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 30df12ae17..725872b621 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -219,8 +219,74 @@ def some_data(created_at=dlt.sources.incremental("created_at")): assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] +def test_pandas_index_as_dedup_key() -> None: + from dlt.common.libs.pandas import pandas_to_arrow, pandas as pd + + some_data, p = _make_dedup_pipeline("pandas") + + # no index + no_index_r = some_data.with_name(new_name="no_index") + p.run(no_index_r) + p.run(no_index_r) + data_ = p._dataset().no_index.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + # unnamed index: explicitly converted + unnamed_index_r = some_data.with_name(new_name="unnamed_index").add_map( + lambda df: pandas_to_arrow(df, preserve_index=True) + ) + # use it (as in arrow table) to deduplicate + unnamed_index_r.incremental.primary_key = "__index_level_0__" + p.run(unnamed_index_r) + p.run(unnamed_index_r) + data_ = p._dataset().unnamed_index.arrow() + assert data_.schema.names == ["created_at", "id", "index_level_0"] + # indexes 2 and 3 are removed from second batch because they were in the previous batch + # and the created_at overlapped so they got deduplicated + assert data_["index_level_0"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: + df_.index = pd.RangeIndex(start=0, stop=len(df_), step=1, name="order_id") + return df_ + + # named index explicitly converted + named_index_r = some_data.with_name(new_name="named_index").add_map( + lambda df: pandas_to_arrow(_make_named_index(df), preserve_index=True) + ) + # use it (as in arrow table) to deduplicate + named_index_r.incremental.primary_key = "order_id" + p.run(named_index_r) + p.run(named_index_r) + data_ = p._dataset().named_index.arrow() + assert data_.schema.names == ["created_at", "id", "order_id"] + assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + # named index explicitly converted + named_index_impl_r = some_data.with_name(new_name="named_index_impl").add_map( + lambda df: _make_named_index(df) + ) + p.run(named_index_impl_r) + p.run(named_index_impl_r) + data_ = p._dataset().named_index_impl.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> None: + some_data, p = _make_dedup_pipeline(item_type) + p.run(some_data()) + p.run(some_data()) + + with p.sql_client() as c: + with c.execute_query("SELECT created_at, id FROM some_data ORDER BY created_at, id") as cur: + rows = cur.fetchall() + print(rows) + assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + + +def _make_dedup_pipeline(item_type: TestDataItemFormat): data1 = [ 
{"created_at": 1, "id": "a"}, {"created_at": 2, "id": "b"}, @@ -235,7 +301,6 @@ def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> {"created_at": 3, "id": "f"}, {"created_at": 4, "id": "g"}, ] - source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) @@ -250,14 +315,7 @@ def some_data(created_at=dlt.sources.incremental("created_at")): pipeline_name=uniq_id(), destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) - p.run(some_data()) - p.run(some_data()) - - with p.sql_client() as c: - with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: - rows = cur.fetchall() - - assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + return some_data, p def test_nested_cursor_path() -> None: diff --git a/tests/libs/pyarrow/test_pyarrow_normalizer.py b/tests/libs/pyarrow/test_pyarrow_normalizer.py index 32ee5fdafc..c81d8cd974 100644 --- a/tests/libs/pyarrow/test_pyarrow_normalizer.py +++ b/tests/libs/pyarrow/test_pyarrow_normalizer.py @@ -5,12 +5,12 @@ from dlt.common.libs.pyarrow import normalize_py_arrow_item, NameNormalizationCollision from dlt.common.schema.utils import new_column, TColumnSchema -from dlt.common.schema.normalizers import explicit_normalizers, import_normalizers +from dlt.common.schema.normalizers import configured_normalizers, import_normalizers from dlt.common.destination import DestinationCapabilitiesContext def _normalize(table: pa.Table, columns: List[TColumnSchema]) -> pa.Table: - _, naming, _ = import_normalizers(explicit_normalizers()) + _, naming, _ = import_normalizers(configured_normalizers()) caps = DestinationCapabilitiesContext() columns_schema = {c["name"]: c for c in columns} return normalize_py_arrow_item(table, columns_schema, naming, caps) diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index a120cd048e..df5de55529 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore[assignment] with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore[assignment] with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py b/tests/load/clickhouse/test_clickhouse_configuration.py index ad33062f11..eabc3094bd 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -56,7 +56,8 @@ def test_clickhouse_configuration() -> None: def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: """Test experimental settings are set correctly for the session.""" - conn = 
client.sql_client.open_connection() + # with client.sql_client.open_connection() as conn: + conn = client.sql_client.native_connection cursor1 = conn.cursor() cursor2 = conn.cursor() @@ -69,3 +70,26 @@ def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: assert ("allow_experimental_lightweight_delete", "1") in res assert ("enable_http_compression", "1") in res assert ("date_time_input_format", "best_effort") in res + + +def test_client_has_dataset(client: ClickHouseClient) -> None: + # with client.sql_client as sql_client: + assert client.sql_client.has_dataset() + separator = client.config.dataset_table_separator + + def _assert_has_dataset() -> None: + assert not client.sql_client.has_dataset() + client.sql_client.create_dataset() + assert client.sql_client.has_dataset() + client.sql_client.drop_dataset() + assert not client.sql_client.has_dataset() + + try: + # change separator + client.config.dataset_table_separator = "_" + _assert_has_dataset() + + client.config.dataset_table_separator = "" + _assert_has_dataset() + finally: + client.config.dataset_table_separator = separator diff --git a/tests/load/conftest.py b/tests/load/conftest.py index 76a7248e5b..c52fea607d 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -9,7 +9,7 @@ drop_pipeline, empty_schema, ) -from tests.utils import preserve_environ, patch_home_dir +from tests.utils import preserve_environ, patch_home_dir, autouse_test_storage @pytest.fixture(scope="function", params=DEFAULT_BUCKETS) diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index a9479a0bb9..49475ce43f 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -19,7 +19,7 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_table -from tests.utils import patch_home_dir, autouse_test_storage, TEST_STORAGE_ROOT +from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index b782e76b7e..1113b9b35d 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -9,7 +9,6 @@ from tests.common.configuration.utils import environment from tests.load.utils import ALL_FILESYSTEM_DRIVERS -from tests.utils import autouse_test_storage # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index d0a29d03d0..afcd9105a8 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -28,7 +28,6 @@ from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS -from tests.utils import autouse_test_storage from tests.load.filesystem.utils import self_signed_cert diff --git a/tests/load/pipeline/conftest.py b/tests/load/pipeline/conftest.py index a2ba65494b..80c418ed22 100644 --- a/tests/load/pipeline/conftest.py +++ b/tests/load/pipeline/conftest.py @@ -1,2 +1,2 @@ -from tests.utils import autouse_test_storage, duckdb_pipeline_location +from tests.utils import duckdb_pipeline_location from tests.pipeline.utils import 
drop_dataset_from_env diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2925bfac6f..8b6fc751d9 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -80,7 +80,7 @@ def test_merge_on_keys_in_schema( skip_if_not_supported(merge_strategy, p.destination) - with open("tests/common/cases/schemas/eth/ethereum_schema_v9.yml", "r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v11.yml", "r", encoding="utf-8") as f: schema = dlt.Schema.from_dict(yaml.safe_load(f)) # make block uncles unseen to trigger filtering loader in loader for nested tables diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 2a5b9ed296..962c501619 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -11,7 +11,7 @@ from dlt.common.pipeline import LoadInfo from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES -from dlt.common.normalizers.json.relational import DataItemNormalizer +from dlt.common.normalizers.json.helpers import get_row_hash from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision from dlt.extract.resource import DltResource @@ -30,7 +30,6 @@ from tests.utils import TPythonTableFormat -get_row_hash = DataItemNormalizer.get_row_hash FROM, TO = DEFAULT_VALIDITY_COLUMN_NAMES diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index e96e06be87..8a3b37dd48 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -61,6 +61,5 @@ def has_collections(client): if has_collections(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index b60c6a8956..ef0acb33a4 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -21,7 +21,7 @@ from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy +from tests.utils import TEST_STORAGE_ROOT, skipifpypy from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage # mark all tests as essential, do not remove diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index e1b085893f..9079638586 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1277,7 +1277,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( # type: ignore[redundant-cast] + expected = cast( List[TColumnSchema], deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), ) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 9f64722a1e..6f699436b3 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -36,7 +36,7 @@ from dlt.common.time import ensure_pendulum_datetime from tests.cases import table_update_and_row, assert_all_data_types_row -from 
tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.common.utils import load_json_case from tests.load.utils import ( TABLE_UPDATE, diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index f5a8d51baf..1a9c8a383b 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -10,6 +10,7 @@ from typing import List from functools import reduce +from dlt.common.storages.file_storage import FileStorage from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, @@ -18,7 +19,7 @@ MEMORY_BUCKET, ) from dlt.destinations import filesystem -from tests.utils import TEST_STORAGE_ROOT +from tests.utils import TEST_STORAGE_ROOT, clean_test_storage from dlt.common.destination.reference import TDestinationReferenceArg from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException from tests.load.utils import drop_pipeline_data @@ -48,8 +49,14 @@ def _expected_chunk_count(p: Pipeline) -> List[int]: return [_chunk_size(p), _total_records(p) - _chunk_size(p)] +# this also disables autouse_test_storage on function level which destroys some tests here @pytest.fixture(scope="session") -def populated_pipeline(request) -> Any: +def autouse_test_storage() -> FileStorage: + return clean_test_storage() + + +@pytest.fixture(scope="session") +def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" destination_config = cast(DestinationTestConfiguration, request.param) diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 05c10a900f..ee48222da9 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -22,7 +22,7 @@ from dlt.destinations.typing import TNativeConn from dlt.common.time import ensure_pendulum_datetime, to_py_datetime -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.load.utils import ( yield_client_with_storage, prepare_table, diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index b391c2fa38..b98b55fcfa 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -95,6 +95,5 @@ def schema_has_classes(client): if schema_has_classes(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/pipeline/cases/github_pipeline/github_rev.py b/tests/pipeline/cases/github_pipeline/github_rev.py new file mode 100644 index 0000000000..4ebe3048f4 --- /dev/null +++ b/tests/pipeline/cases/github_pipeline/github_rev.py @@ -0,0 +1,26 @@ +import dlt + + +@dlt.source +def github(): + @dlt.resource( + table_name="issues__2", + primary_key="id", + ) + def load_issues(): + # return data with path separators + yield [ + { + "id": 100, + "issue__id": 10, + } + ] + + return load_issues + + +if __name__ == "__main__": + p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") + github_source = github() + info = p.run(github_source) + print(info) diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index a3d8b489c9..fbd4d412b3 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -484,3 +484,59 @@ def test_scd2_pipeline_update(test_storage: FileStorage) -> None: assert len(issues_retired) == 1 assert issues_retired[0][0] == 6272 # 
print(pipeline.default_schema.to_pretty_yaml()) + + +def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> None: + """Pre 1.4.1 normalized identifiers with path separators into single underscore, + this behavior must be preserved if the schema is updated. + """ + shutil.copytree("tests/pipeline/cases/github_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + # execute in test storage + with set_working_dir(TEST_STORAGE_ROOT): + # store dlt data in test storage (like patch_home_dir) + with custom_environ({DLT_DATA_DIR: dlt.current.run().data_dir}): + # save database outside of pipeline dir + with custom_environ( + {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} + ): + venv_dir = tempfile.mkdtemp() + # create virtual env with (0.3.0) before the current schema upgrade + with Venv.create(venv_dir, ["dlt[duckdb]==0.3.0"]) as venv: + venv._install_deps(venv.context, ["duckdb" + "==" + pkg_version("duckdb")]) + try: + print( + venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py") + ) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + + venv = Venv.restore_current() + # load same data again + try: + print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py")) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + pipeline = dlt.attach(GITHUB_PIPELINE_NAME) + print(pipeline.default_schema.to_pretty_yaml()) + # migration set the backward compat flag + assert ( + pipeline.default_schema._normalizers_config["use_break_path_on_normalize"] + is False + ) + # make sure that schema didn't change + assert pipeline.default_schema.data_table_names() == ["issues_2"] + table_ = pipeline.default_schema.tables["issues_2"] + assert set(table_["columns"].keys()) == { + "id", + "issue_id", + "_dlt_id", + "_dlt_load_id", + } + # datasets must be the same + data_ = pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + print(data_) diff --git a/tests/normalize/test_max_nesting.py b/tests/pipeline/test_max_nesting.py similarity index 100% rename from tests/normalize/test_max_nesting.py rename to tests/pipeline/test_max_nesting.py diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 3832bad81a..e58db64e5e 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1711,6 +1711,111 @@ def nested_resource(): assert pipeline.last_trace.last_normalize_info.row_counts["flattened_dict__values"] == 4 +def test_column_name_with_break_path() -> None: + """Tests how normalization behaves for names with break path ie __ + all the names must be idempotent + """ + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was preserved + table = pipeline.default_schema.get_table("custom__path") + assert pipeline.default_schema.data_table_names() == ["custom__path"] + # column name was preserved + assert table["columns"]["example_custom_field__c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field__c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom__path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + assert 
data_ == [("custom", "c")] + + +def test_column_name_with_break_path_legacy() -> None: + """Tests how normalization behaves for names with break path ie __ + in legacy mode table and column names were normalized as single identifier + """ + os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was contracted + table = pipeline.default_schema.get_table("custom_path") + assert pipeline.default_schema.data_table_names() == ["custom_path"] + # column name was contracted + assert table["columns"]["example_custom_field_c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field_c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom_path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + assert data_ == [("custom", "c")] + + +def test_column_hint_with_break_path() -> None: + """Up form the v 1.4.1 name normalizer is idempotent on break path""" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + + assert pipeline.default_schema.data_table_names() == ["flattened__dict"] + table = pipeline.default_schema.get_table("flattened__dict") + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + + # make sure data is there + data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + +def test_column_hint_with_break_path_legacy() -> None: + """Up form the v 1.4.1 name normalizer is idempotent on break path""" + + os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + # table name contracted + assert pipeline.default_schema.data_table_names() == ["flattened_dict"] + table = pipeline.default_schema.get_table("flattened_dict") + # hint applied + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + # make sure data is there + data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + def test_empty_rows_are_included() -> None: """Empty rows where all values are `None` or empty dicts create rows in the dataset with `NULL` in all columns diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index e67ff9c70a..36fe009b93 100644 
--- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): # type: ignore[override] + def update_state(self, response, data): self._next_reference = response.json().get("next_page") def update_request(self, request): From 872432e68d0067f7b74c4b641534e1a5b1adfad3 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 3 Dec 2024 11:49:26 +0400 Subject: [PATCH 63/71] fix lint errors --- dlt/cli/source_detection.py | 3 +-- .../configuration/specs/base_configuration.py | 2 +- dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/reflection/utils.py | 14 +++++++------- dlt/common/schema/schema.py | 2 +- dlt/common/typing.py | 2 +- dlt/extract/incremental/lag.py | 2 +- tests/libs/test_csv_writer.py | 4 ++-- .../sql_database/test_sql_database_source.py | 5 +---- tests/sources/helpers/rest_client/test_client.py | 2 +- 13 files changed, 20 insertions(+), 24 deletions(-) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 7067f8b896..0769605d01 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -29,8 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" - f" string to '{arg_name}' function in line {dn_node.lineno}", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined] ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index eb7487051f..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore[operator] + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 96503c036f..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore[operator] + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/logger.py b/dlt/common/logger.py index 0533713fda..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore[no-any-return] + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c612c5a4f1..27c7bd8758 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -90,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + script_lines.extend(source_script_lines[last_line + 
1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(ast_unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index d6031a08fa..276bbe9c09 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -525,7 +525,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/typing.py b/dlt/common/typing.py index a3364d1b07..8986d753f3 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type # type: ignore diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment] diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index df5de55529..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore[assignment] + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def 
test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore[assignment] + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 9079638586..29fbbaa7de 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1277,10 +1277,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( - List[TColumnSchema], - deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), - ) + expected = deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS) if backend == "pyarrow": expected = cast( List[TColumnSchema], diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 36fe009b93..e67ff9c70a 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] self._next_reference = response.json().get("next_page") def update_request(self, request): From 9c4429041fc4629f77b2b3d7e2fecc70bffa8338 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 3 Dec 2024 12:13:05 +0400 Subject: [PATCH 64/71] re-add pyiceberg dependency --- poetry.lock | 96 ++++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 6 ++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index e0a83984c4..4aa3296a88 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7525,6 +7525,84 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.7.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" +files = [ + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:9e0cc837d41e100df81f1f5e580a89668aade694d8c616941d6e11c3a27e49cb"}, + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:71c053c2d484505d1beabd7d5167fe2e835ca865f52ad91ef4852f0d91fa4a25"}, + {file = "pyiceberg-0.7.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0549ab1843bc07037a7d212c2db527ff1755f5d8f80420907952b5b080eb3663"}, + {file = "pyiceberg-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec4a8000f0bb6ce6ec47f3368ca99f3191e9105662eeef7be2fbb493363cba96"}, + {file = "pyiceberg-0.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ef6636d3cf370b796529f9a8dbd84e892a2151f0310a8015b9a1e702647ad90"}, + {file = "pyiceberg-0.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:9b49320f3e9624075879a4ddb4fa5ddff7d4a03f6561ad6fd73d514c63095367"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = 
"sha256:27e9b4033691411ef7c49d93df7b3b7f3ed85fe8019cbf0dab5a5ba888b27f34"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:7262ba4f95e05a1421567e24c0db57288dc59974c94676aba34afef121544694"}, + {file = "pyiceberg-0.7.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3eb1fc1d47085b16973999c2111d252fab2a394625c0f25da6515b8c3233c853"}, + {file = "pyiceberg-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1856c5d64197c9335817b8cf7081e490b601385623e5178cb094ee645d4fb24c"}, + {file = "pyiceberg-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b6b64006c361220ce103b5bb2f50381a3f851452668adf5a6c61d39f5611e832"}, + {file = "pyiceberg-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:57a0b1fb390d26a5b7155de011300300058343e5c2561f4839d69c1775df1d7e"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:84f2119705e31929aa75beb9a8ce97210e56f498e863e31dc499a2120c2842bd"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:f99ab8d71a2968af0b512fff1d3dcbd145705a95a26b05121c0df712683c9e0c"}, + {file = "pyiceberg-0.7.1-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:5dc17aa1f53f5b8be12eae35dbcb9885b2534138bdecd31a0088680651fbb98e"}, + {file = "pyiceberg-0.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:917fdfd372922f9534fe9b6652881a79f82f94d7d3645ddb1925688e3d9aaf4d"}, + {file = "pyiceberg-0.7.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:910fab27c039d62a1fe4a199aaea63d08ada30ead6fd27d56bf038c487837691"}, + {file = "pyiceberg-0.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:98db6d18dca335682c32b25406d7ab5afad8f1baea4fbdafda166cbc6557409c"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:c76ea44cc1b02c15b65e1b0cc81b5b3f813ba40a4e262416d7a1e84345f44cf1"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:57485e9331c7e8b1771ea1b2ecdc417dc7a13c7a9a538d74f3f00de98676958b"}, + {file = "pyiceberg-0.7.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:bbc79698292482360be86f8d728237b78ef8eb416e21aea9d53e4a1b4f429ce7"}, + {file = "pyiceberg-0.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f84d912fc12866f22882f5de157cbbfab3dcbad8e0a4378557e5b84a0c3f360"}, + {file = "pyiceberg-0.7.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f86c535735e57f1a0c76fd0f505e0b172cc212c96a3789f3845220695e792157"}, + {file = "pyiceberg-0.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:d8bee5aa4b34e6028f0465cf405bc4e963e160ac52efbe4bdbc499bb55bc2780"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_12_0_x86_64.whl", hash = "sha256:9ae56197db8570553491173adfd2e01a03ae116a1f9fa78ba5a1a1c4e2ad3dbf"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:e28adc58500ca72e45a07ee4dcd90b63699a8875f178001bd12ace37294c5814"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:1ae47f2d0e87dccd158ae8dafc47125f9739858068fc3add8940f5585ea40ead"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb94c3e11354f85daafb2b2f3e13a245bcb35848135b5ed4e8c83e61393c36ea"}, + {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4fe212b0594128d183711c6efb1a40ea5f17372e11595a84f4565eb9fe97c703"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_12_0_x86_64.whl", hash = "sha256:35ce27243b86f7057fbd4594dbe5d6b2a1ccd738ba6b65c2a4f3af249f1e8364"}, + {file = 
"pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_13_0_x86_64.whl", hash = "sha256:56e254623669ab03e779e4b696b7e36cd1c6973e8523200ccc232695742e269d"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_14_0_arm64.whl", hash = "sha256:e07b59a5998c6d4cac258763c6c160234e1e3362a2097808bd02e05e0c16208a"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde005aa075fc0e5ed0095438b0a4d39534e3cb84889b93d6aa265dd2e072eff"}, + {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:1950f2186f1c99e0d187ffee86e2f8d6bbbad9b0079573a7255b85ffaaa82e79"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_12_0_x86_64.whl", hash = "sha256:273b4b642168a5e64fedc0073e18fd481b11d6891f9e44ceb5ce27126fe418f7"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:9a2dbc621cdd4f0c92f5b2520f2b266b976317ff8a984aec2ce9240ee3d80471"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:34c2d6e9d027b66f8d531fcefeb5cda8b2a37e70170c01f6f1c977954d733c45"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e97fb65862db191685355e1eb8d97d41d00679a3df1fbd7a1c2560b9e3e6d8"}, + {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:98a0de3c2f194907b07522769facbcacdff0ec9577f9710273ba7e0aa8465652"}, + {file = "pyiceberg-0.7.1.tar.gz", hash = "sha256:2fd8f9717b02673cb9cabe7aed82fc38933241b2bd15cbdc1ff7371e70317a47"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0,<2025.1.0" +mmh3 = ">=4.0.0,<5.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = ">=2.20.0,<3.0.0" +rich = ">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<9.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0,<2024.8.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] +gcsfs = ["gcsfs (>=2023.1.0,<2024.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +pyarrow = ["numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +ray = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0,<2024.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9331,6 +9409,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -10610,7 +10702,7 @@ mssql = ["pyodbc"] parquet = ["pyarrow"] postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", 
"psycopg2cffi"] -pyiceberg = ["pyarrow", "sqlalchemy"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10624,4 +10716,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "68e1fbc32e7c354f70c4c39e575b0ef66f356f7f11b1d4b33da2e52d8232c74f" +content-hash = "68ecd75e0095f12efed208c2e630eda044b94d0014e7ca1f55bd9d023e7d4a59" diff --git a/pyproject.toml b/pyproject.toml index e7bdeb96fb..77c53647d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,12 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead +pyiceberg = { version = ">=0.7.1", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] From c129b9ee0e7f7b2fb653ce8cd464ae3a5730ab9c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 6 Dec 2024 12:33:52 +0100 Subject: [PATCH 65/71] enabled iceberg in dbt-duckdb --- dlt/helpers/dbt/profiles.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index a2a0014e4e..fd114478fb 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -83,6 +83,7 @@ duckdb: extensions: - httpfs - parquet + - iceberg # TODO: emit the config of duck db motherduck: From 6992d56887b2b6c432af7fbbfcd566bed71f3a6f Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 10 Dec 2024 11:36:22 +0400 Subject: [PATCH 66/71] upgrade pyiceberg version --- .../destinations/delta-iceberg.md | 2 +- poetry.lock | 110 ++++++++---------- pyproject.toml | 2 +- 3 files changed, 52 insertions(+), 62 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md index d062317f1c..6c0dfa50ee 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -35,7 +35,7 @@ pip install 'pyarrow>=17.0.0' ## Iceberg dependencies -You need the `pyiceberg` package to use this format: +You need Python version 3.9 or higher and the `pyiceberg` package to use this format: ```sh pip install "dlt[pyiceberg]" diff --git a/poetry.lock b/poetry.lock index 4aa3296a88..e239c2b13c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1543,13 +1543,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.1" +version = "5.5.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = 
"sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] [[package]] @@ -7527,77 +7527,67 @@ plugins = ["importlib-metadata"] [[package]] name = "pyiceberg" -version = "0.7.1" +version = "0.8.1" description = "Apache Iceberg is an open table format for huge analytic datasets" optional = true -python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" -files = [ - {file = "pyiceberg-0.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:9e0cc837d41e100df81f1f5e580a89668aade694d8c616941d6e11c3a27e49cb"}, - {file = "pyiceberg-0.7.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:71c053c2d484505d1beabd7d5167fe2e835ca865f52ad91ef4852f0d91fa4a25"}, - {file = "pyiceberg-0.7.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0549ab1843bc07037a7d212c2db527ff1755f5d8f80420907952b5b080eb3663"}, - {file = "pyiceberg-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec4a8000f0bb6ce6ec47f3368ca99f3191e9105662eeef7be2fbb493363cba96"}, - {file = "pyiceberg-0.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ef6636d3cf370b796529f9a8dbd84e892a2151f0310a8015b9a1e702647ad90"}, - {file = "pyiceberg-0.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:9b49320f3e9624075879a4ddb4fa5ddff7d4a03f6561ad6fd73d514c63095367"}, - {file = "pyiceberg-0.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:27e9b4033691411ef7c49d93df7b3b7f3ed85fe8019cbf0dab5a5ba888b27f34"}, - {file = "pyiceberg-0.7.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:7262ba4f95e05a1421567e24c0db57288dc59974c94676aba34afef121544694"}, - {file = "pyiceberg-0.7.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3eb1fc1d47085b16973999c2111d252fab2a394625c0f25da6515b8c3233c853"}, - {file = "pyiceberg-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1856c5d64197c9335817b8cf7081e490b601385623e5178cb094ee645d4fb24c"}, - {file = "pyiceberg-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b6b64006c361220ce103b5bb2f50381a3f851452668adf5a6c61d39f5611e832"}, - {file = "pyiceberg-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:57a0b1fb390d26a5b7155de011300300058343e5c2561f4839d69c1775df1d7e"}, - {file = "pyiceberg-0.7.1-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:84f2119705e31929aa75beb9a8ce97210e56f498e863e31dc499a2120c2842bd"}, - {file = "pyiceberg-0.7.1-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:f99ab8d71a2968af0b512fff1d3dcbd145705a95a26b05121c0df712683c9e0c"}, - {file = "pyiceberg-0.7.1-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:5dc17aa1f53f5b8be12eae35dbcb9885b2534138bdecd31a0088680651fbb98e"}, - {file = "pyiceberg-0.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:917fdfd372922f9534fe9b6652881a79f82f94d7d3645ddb1925688e3d9aaf4d"}, - {file = "pyiceberg-0.7.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:910fab27c039d62a1fe4a199aaea63d08ada30ead6fd27d56bf038c487837691"}, - {file = "pyiceberg-0.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:98db6d18dca335682c32b25406d7ab5afad8f1baea4fbdafda166cbc6557409c"}, - {file = "pyiceberg-0.7.1-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:c76ea44cc1b02c15b65e1b0cc81b5b3f813ba40a4e262416d7a1e84345f44cf1"}, - {file = "pyiceberg-0.7.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:57485e9331c7e8b1771ea1b2ecdc417dc7a13c7a9a538d74f3f00de98676958b"}, - {file = "pyiceberg-0.7.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:bbc79698292482360be86f8d728237b78ef8eb416e21aea9d53e4a1b4f429ce7"}, - {file = 
"pyiceberg-0.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f84d912fc12866f22882f5de157cbbfab3dcbad8e0a4378557e5b84a0c3f360"}, - {file = "pyiceberg-0.7.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f86c535735e57f1a0c76fd0f505e0b172cc212c96a3789f3845220695e792157"}, - {file = "pyiceberg-0.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:d8bee5aa4b34e6028f0465cf405bc4e963e160ac52efbe4bdbc499bb55bc2780"}, - {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_12_0_x86_64.whl", hash = "sha256:9ae56197db8570553491173adfd2e01a03ae116a1f9fa78ba5a1a1c4e2ad3dbf"}, - {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_13_0_x86_64.whl", hash = "sha256:e28adc58500ca72e45a07ee4dcd90b63699a8875f178001bd12ace37294c5814"}, - {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-macosx_14_0_arm64.whl", hash = "sha256:1ae47f2d0e87dccd158ae8dafc47125f9739858068fc3add8940f5585ea40ead"}, - {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb94c3e11354f85daafb2b2f3e13a245bcb35848135b5ed4e8c83e61393c36ea"}, - {file = "pyiceberg-0.7.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4fe212b0594128d183711c6efb1a40ea5f17372e11595a84f4565eb9fe97c703"}, - {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_12_0_x86_64.whl", hash = "sha256:35ce27243b86f7057fbd4594dbe5d6b2a1ccd738ba6b65c2a4f3af249f1e8364"}, - {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_13_0_x86_64.whl", hash = "sha256:56e254623669ab03e779e4b696b7e36cd1c6973e8523200ccc232695742e269d"}, - {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-macosx_14_0_arm64.whl", hash = "sha256:e07b59a5998c6d4cac258763c6c160234e1e3362a2097808bd02e05e0c16208a"}, - {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde005aa075fc0e5ed0095438b0a4d39534e3cb84889b93d6aa265dd2e072eff"}, - {file = "pyiceberg-0.7.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:1950f2186f1c99e0d187ffee86e2f8d6bbbad9b0079573a7255b85ffaaa82e79"}, - {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_12_0_x86_64.whl", hash = "sha256:273b4b642168a5e64fedc0073e18fd481b11d6891f9e44ceb5ce27126fe418f7"}, - {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_13_0_x86_64.whl", hash = "sha256:9a2dbc621cdd4f0c92f5b2520f2b266b976317ff8a984aec2ce9240ee3d80471"}, - {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-macosx_14_0_arm64.whl", hash = "sha256:34c2d6e9d027b66f8d531fcefeb5cda8b2a37e70170c01f6f1c977954d733c45"}, - {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e97fb65862db191685355e1eb8d97d41d00679a3df1fbd7a1c2560b9e3e6d8"}, - {file = "pyiceberg-0.7.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:98a0de3c2f194907b07522769facbcacdff0ec9577f9710273ba7e0aa8465652"}, - {file = "pyiceberg-0.7.1.tar.gz", hash = "sha256:2fd8f9717b02673cb9cabe7aed82fc38933241b2bd15cbdc1ff7371e70317a47"}, -] - -[package.dependencies] +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" +files = [ + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c121d1d3baf64510db94740ad870ae4b6eb9eb59a5ff7ecb4e96f7510666b2f"}, + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6f14aa588a3883fc7fddc136ca75b75660b4abb0b55b4c541619953f8971e7"}, + {file = "pyiceberg-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c720c2a191ac6faf01fe4c0f4c01c64b94bf064185b0292003d42939049277c"}, + {file = 
"pyiceberg-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d421d6e51ac1c581cba9fce96aa6b9118cf4a02270066a7fdc9490ab5d57ece9"}, + {file = "pyiceberg-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:ae11fb0515ea0a046370e09a7f6039a7e86622ab910360eaa732f0106b8f00c7"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9488954c9eb5ce42ca6b816fc61873f219414cfdb9e9928d1c4a302702be1d89"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:44179e0fb844887b440c162279ba526dfe0e0f72d32945236528838518b55af0"}, + {file = "pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e121c6f5505d8ec711a1dd1690e07156cd54fb3d0844d5d991e02f1593f2708"}, + {file = "pyiceberg-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5961a288f2d4bbb2ab300c803da1bf0e70cea837e3f14b14108827cc821af252"}, + {file = "pyiceberg-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbe192324a6fb552c2fd29cab51086e21fa248ea2a0b95fbab921dede49e5a69"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:60430f0d8f6d650ed7d1893d038b847565a8e9ac135a1cc812e57d24f0482f6c"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0f697977dac672d8b00e125836423585a97ebf59a28b865b1296a2b6ee81c51"}, + {file = "pyiceberg-0.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370de7c230970ff858f713d150164d492ba8450e771e59a0c520520b13ea6226"}, + {file = "pyiceberg-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3036ed226020d50e30648a71f968cf78bde5d6b609294508e60754e100e5ef36"}, + {file = "pyiceberg-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ac9555f3bd25a31059229089ae639cf738a8e8286a175cea128561ac1ed9452"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51da3a553d3a881042bf436e66a91cc2b6c4a3fea0e174cd73af2eb6ed255323"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:863f1dce7340e6ed870706a3fa4a73457178dae8529725bb80522ddcd4253afb"}, + {file = "pyiceberg-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dbf52b39080a6a2cda6a5126a74e3a88d5b206f609c128d001a728b36b81075"}, + {file = "pyiceberg-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb77d65e8efbb883c163817e4a9c373d907110ab6343c1b816b48f336955d4d7"}, + {file = "pyiceberg-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:1fcd35b7de0eddc3fd8fd0c38b98741217ef6de4eeb0e72b798b4007692aa76c"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0f56f8fc61bcd795f6a3d03e8ce6bee09ebaa64425eb08327e975f906d98be"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7099c6631743ad29c451de2bebd9ed3c96c42bcb1fe5d5d5c93aec895858e3f"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6436f5a782491115f64131882a737d77c9dc0040493e1b7f9b3081ea8cf6a26"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1d75b40a98a327f7436eb0d6187c51834c44b79adf61c6945b33645f4afbf17"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8de988fa2363e6a51b40b85b5ff1e8261cda5bfc14ac54dd4ebe58391b95acae"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:dd06c5b606011155aa0b76e7b001e30f1c40ab2fb3eeb8a0652b88629259c2bb"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash 
= "sha256:e8142f0dbc12dda0e6d7aaf564a3fbb0f17fc934630e7cf866773c8caaebf666"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6126ee3a46ff975f15abf2085f184591d21643bffb96330907e003eea0b63005"}, + {file = "pyiceberg-0.8.1.tar.gz", hash = "sha256:4502f0cfddf6f7cd48b9cd54016bce0ab94052b0ab01efcfa515879074f4c8e3"}, +] + +[package.dependencies] +cachetools = ">=5.5.0,<6.0.0" click = ">=7.1.1,<9.0.0" -fsspec = ">=2023.1.0,<2025.1.0" -mmh3 = ">=4.0.0,<5.0.0" +fsspec = ">=2023.1.0" +mmh3 = ">=4.0.0,<6.0.0" pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" pyparsing = ">=3.1.0,<4.0.0" requests = ">=2.20.0,<3.0.0" rich = ">=10.11.0,<14.0.0" sortedcontainers = "2.4.0" strictyaml = ">=1.7.0,<2.0.0" -tenacity = ">=8.2.3,<9.0.0" +tenacity = ">=8.2.3,<10.0.0" [package.extras] -adlfs = ["adlfs (>=2023.1.0,<2024.8.0)"] +adlfs = ["adlfs (>=2023.1.0)"] daft = ["getdaft (>=0.2.12)"] -duckdb = ["duckdb (>=0.5.0,<2.0.0)", "numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] dynamodb = ["boto3 (>=1.24.59)"] -gcsfs = ["gcsfs (>=2023.1.0,<2024.1.0)"] +gcsfs = ["gcsfs (>=2023.1.0)"] glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] hive = ["thrift (>=0.13.0,<1.0.0)"] -pandas = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] -pyarrow = ["numpy (>=1.22.4,<2.0.0)", "pyarrow (>=9.0.0,<18.0.0)"] -ray = ["numpy (>=1.22.4,<2.0.0)", "pandas (>=1.0.0,<3.0.0)", "pyarrow (>=9.0.0,<18.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] -s3fs = ["s3fs (>=2023.1.0,<2024.1.0)"] +pandas = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +pyarrow = ["pyarrow (>=14.0.0,<19.0.0)"] +ray = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0)"] snappy = ["python-snappy (>=0.6.0,<1.0.0)"] sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] @@ -10716,4 +10706,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "68ecd75e0095f12efed208c2e630eda044b94d0014e7ca1f55bd9d023e7d4a59" +content-hash = "10be8914f675d06db2b2d65b7f981bc538976238e99ad3b9eaef1844cb1f1b68" diff --git a/pyproject.toml b/pyproject.toml index 77c53647d6..c669aa23af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,7 @@ db-dtypes = { version = ">=1.2.0", optional = true } # https://github.com/apache/airflow/issues/28723 # pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } # we will rely on manual installation of `sqlalchemy>=2.0.18` instead -pyiceberg = { version = ">=0.7.1", optional = true } +pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] From aa19f1365d97993f89b28d492f7194eebd7e96d5 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 10 Dec 2024 16:58:25 +0400 Subject: [PATCH 67/71] remove pyiceberg mypy errors across python version --- mypy.ini | 3 +++ tests/load/pipeline/test_filesystem_pipeline.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mypy.ini b/mypy.ini index 769e84b13a..5d59c72792 100644 --- a/mypy.ini +++ b/mypy.ini @@ -135,3 +135,6 @@ ignore_missing_imports = True [mypy-time_machine.*] ignore_missing_imports = True + +[mypy-pyiceberg.*] +ignore_missing_imports = True diff --git 
diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py
index 05c8541e73..c70fa5ab5d 100644
--- a/tests/load/pipeline/test_filesystem_pipeline.py
+++ b/tests/load/pipeline/test_filesystem_pipeline.py
@@ -1005,6 +1005,7 @@ def test_table_format_get_tables_helper(
     destination_config: DestinationTestConfiguration,
 ) -> None:
     """Tests `get_delta_tables` / `get_iceberg_tables` helper functions."""
+    get_tables: Any
     if destination_config.table_format == "delta":
         from dlt.common.libs.deltalake import DeltaTable, get_delta_tables
 
@@ -1013,7 +1014,7 @@ def test_table_format_get_tables_helper(
     elif destination_config.table_format == "iceberg":
         from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables
 
-        get_tables = get_iceberg_tables  # type: ignore[assignment]
+        get_tables = get_iceberg_tables
         get_num_rows = lambda table: table.scan().to_arrow().num_rows
 
     @dlt.resource(table_format=destination_config.table_format)
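The test change above replaces a `# type: ignore[assignment]` with an upfront `get_tables: Any` declaration so that both branches may assign helpers with different signatures. A small, self-contained sketch of that technique; all names below are illustrative and not taken from the test:

```py
# Illustrative only: pre-declaring a variable with a wide type lets mypy accept
# assignments from branches that would otherwise produce incompatible types.
from typing import Any


def pick_formatter(kind: str) -> str:
    formatter: Any  # without this annotation, the second assignment can trip `assignment`
    if kind == "upper":
        formatter = str.upper
    else:
        formatter = len  # a callable with a completely different signature
    return str(formatter("abc"))


print(pick_formatter("upper"))  # ABC
print(pick_formatter("other"))  # 3
```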
From b7f6dbf87f5e067a3b16e92956058c69e3300f68 Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Tue, 10 Dec 2024 23:23:04 +0100
Subject: [PATCH 68/71] does not install airflow group for dev

---
 Makefile                                                    | 2 +-
 dlt/common/configuration/specs/config_providers_context.py | 7 +------
 dlt/helpers/airflow_helper.py                               | 4 ++--
 mypy.ini                                                    | 3 +++
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 2a7f6dac0a..0ca8a2e0c3 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,7 @@ has-poetry:
 	poetry --version
 
 dev: has-poetry
-	poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow
+	poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk
 
 lint:
 	./tools/check-package.sh
diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py
index 5d1a5b7f26..29280dc7a8 100644
--- a/dlt/common/configuration/specs/config_providers_context.py
+++ b/dlt/common/configuration/specs/config_providers_context.py
@@ -1,5 +1,4 @@
 import contextlib
-import dataclasses
 import io
 from typing import ClassVar, List
 
@@ -8,10 +7,6 @@
     ConfigProvider,
     ContextProvider,
 )
-from dlt.common.configuration.specs.base_configuration import (
-    ContainerInjectableContext,
-    NotResolved,
-)
 from dlt.common.configuration.specs import (
     GcpServiceAccountCredentials,
     BaseConfiguration,
@@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]:
         # check if we are in task context and provide more info
         from airflow.operators.python import get_current_context  # noqa
 
-        ti: TaskInstance = get_current_context()["ti"]  # type: ignore
+        ti: TaskInstance = get_current_context()["ti"]
 
         # log outside of stderr/out redirect
         if secrets_toml_var is None:
diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py
index 99458a3949..6f38fd657c 100644
--- a/dlt/helpers/airflow_helper.py
+++ b/dlt/helpers/airflow_helper.py
@@ -18,7 +18,7 @@
     from airflow.configuration import conf
     from airflow.models import TaskInstance
     from airflow.utils.task_group import TaskGroup
-    from airflow.operators.dummy import DummyOperator  # type: ignore
+    from airflow.operators.dummy import DummyOperator
     from airflow.operators.python import PythonOperator, get_current_context
 except ModuleNotFoundError:
     raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"])
@@ -255,7 +255,7 @@ def _run(
 
         # use task logger
        if self.use_task_logger:
-            ti: TaskInstance = get_current_context()["ti"]  # type: ignore
+            ti: TaskInstance = get_current_context()["ti"]
             logger.LOGGER = ti.log
 
         # set global number of buffered items
diff --git a/mypy.ini b/mypy.ini
index 5d59c72792..fdf0ceb1e6 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -138,3 +138,6 @@ ignore_missing_imports = True
 
 [mypy-pyiceberg.*]
 ignore_missing_imports = True
+
+[mypy-airflow.*]
+ignore_missing_imports = True

From 9cad3ec7ae173834c1d5d8d024e4ce04544e3437 Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Tue, 10 Dec 2024 23:24:01 +0100
Subject: [PATCH 69/71] fixes gcp oauth iceberg credentials handling

---
 dlt/common/configuration/specs/gcp_credentials.py | 6 ++++++
 tests/load/utils.py                               | 9 +--------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py
index 21ae2587ed..17519b032a 100644
--- a/dlt/common/configuration/specs/gcp_credentials.py
+++ b/dlt/common/configuration/specs/gcp_credentials.py
@@ -356,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None:
         except NativeValueError:
             pass
         GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value)
+
+    def to_pyiceberg_fileio_config(self) -> Dict[str, Any]:
+        if self.has_default_credentials():
+            return GcpDefaultCredentials.to_pyiceberg_fileio_config(self)
+        else:
+            return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self)
diff --git a/tests/load/utils.py b/tests/load/utils.py
index 7835927f5c..5660202ec3 100644
--- a/tests/load/utils.py
+++ b/tests/load/utils.py
@@ -636,14 +636,7 @@ def destinations_configs(
                 table_format="iceberg",
                 supports_merge=False,
                 file_format="parquet",
-                credentials=(
-                    resolve_configuration(
-                        GcpOAuthCredentialsWithoutDefaults(),
-                        sections=("destination", "fsgcpoauth"),
-                    )
-                    if bucket == GCS_BUCKET
-                    else None
-                ),
+                destination_name="fsgcpoauth" if bucket == GCS_BUCKET else None,
             )
         ]
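The override added to `gcp_credentials.py` above picks whichever parent implementation matches the credentials that are actually attached at runtime. A hedged usage sketch follows; the import paths and the shape of the returned mapping are assumptions rather than facts from this patch series:

```py
# Sketch only: exercising the new dispatch on resolved GCP OAuth credentials.
# Import paths are assumed; the `to_pyiceberg_fileio_config` API appears in the patch above.
from dlt.common.configuration import resolve_configuration  # assumed re-export
from dlt.common.configuration.specs import GcpOAuthCredentialsWithoutDefaults  # assumed export

creds = resolve_configuration(
    GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth")
)
# Per the patch, credentials that carry attached default (ADC) credentials delegate to
# GcpDefaultCredentials, while plain OAuth credentials use the OAuth implementation.
fileio_config = creds.to_pyiceberg_fileio_config()
print(fileio_config)  # a dict of pyiceberg FileIO properties; exact keys not shown in the patch
```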
From 346b270da077f6a43a2822794560cc6f2a6bc7dc Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Tue, 10 Dec 2024 23:24:48 +0100
Subject: [PATCH 70/71] fixes ca cert bundle duckdb azure on ci

---
 .github/workflows/test_destinations.yml                      | 4 +++-
 dlt/destinations/impl/filesystem/sql_client.py               | 3 ++-
 .../website/docs/dlt-ecosystem/destinations/delta-iceberg.md | 5 +++--
 .../docs/general-usage/dataset-access/ibis-backend.md        | 3 +--
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml
index f6204ab0aa..c2c66f539e 100644
--- a/.github/workflows/test_destinations.yml
+++ b/.github/workflows/test_destinations.yml
@@ -77,9 +77,11 @@ jobs:
         #   key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift
 
       - name: Install dependencies
-        # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake -E pyiceberg
 
+      - name: enable certificates for azure and duckdb
+        run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
       - name: Upgrade sqlalchemy
         run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`
diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py
index 6409167e29..d39f4c3431 100644
--- a/dlt/destinations/impl/filesystem/sql_client.py
+++ b/dlt/destinations/impl/filesystem/sql_client.py
@@ -13,6 +13,7 @@
 from dlt.common.destination.reference import DBApiCursor
 
+from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS
 from dlt.destinations.sql_client import raise_database_error
 from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient
 
@@ -193,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:
 
         # the line below solves problems with certificate path lookup on linux
         # see duckdb docs
-        if self.fs_client.config.protocol in ["az", "abfss"]:
+        if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS:
             self._conn.sql("SET azure_transport_option_type = 'curl';")
 
         return self._conn
diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
index 6c0dfa50ee..7a056d6b40 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
@@ -10,12 +10,13 @@ keywords: [delta, iceberg, destination, data warehouse]
 ## How it works
 
 `dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`.
 
-## Iceberg catalog
+## Iceberg single-user ephemeral catalog
 
 `dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalogs](https://iceberg.apache.org/concepts/catalog/). These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog.
 :::caution
 While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations:
 - concurrent writes are not handled and may lead to corrupt table state
+- we cannot guarantee that reads concurrent with writes are clean
 - the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores
 :::
@@ -69,7 +70,7 @@ pipeline.run(my_resource, table_format="delta")
 ```
 
 ## Table format partitioning
 
-Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: 
+Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column:
 ```py
 @dlt.resource(
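The documentation above describes the write path (Parquet files handed to `pyiceberg`) and the ephemeral catalog. A hedged end-to-end sketch of what that looks like from user code; it assumes the filesystem destination is configured with a `bucket_url`, and that `get_iceberg_tables` mirrors the `get_delta_tables` signature used elsewhere in the tests:

```py
# Sketch under the assumptions stated above; not an excerpt from the docs page.
import dlt
from dlt.common.libs.pyiceberg import get_iceberg_tables


@dlt.resource(table_format="iceberg")
def events():
    # a tiny in-memory resource written as an Iceberg table on the filesystem destination
    yield [{"id": 1, "foo": "a"}, {"id": 2, "foo": "b"}]


pipeline = dlt.pipeline("iceberg_demo", destination="filesystem", dataset_name="demo")
pipeline.run(events())

# the helper registers existing tables in the ephemeral catalog and is assumed to return
# a mapping of table name -> pyiceberg table object
tables = get_iceberg_tables(pipeline, "events")
print(tables["events"].scan().to_arrow().num_rows)
```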
diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
index 8f4b0fb6b6..9f9b65e9c0 100644
--- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md
+++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
@@ -6,7 +6,7 @@ keywords: [data, dataset, ibis]
 
 # Ibis
 
-Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). 
+Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).
 
 `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection.
@@ -46,4 +46,3 @@ print(table.limit(10).execute())
 
 # Visit the ibis docs to learn more about the available methods
 ```
-

From accb62defa79e1f5ed754571ddd6247ea5c6a62b Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Wed, 11 Dec 2024 01:33:37 +0100
Subject: [PATCH 71/71] allow for airflow dep to be present during type check

---
 dlt/common/configuration/specs/config_providers_context.py | 2 +-
 dlt/helpers/airflow_helper.py                               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py
index 29280dc7a8..a244ab571f 100644
--- a/dlt/common/configuration/specs/config_providers_context.py
+++ b/dlt/common/configuration/specs/config_providers_context.py
@@ -132,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]:
         # check if we are in task context and provide more info
         from airflow.operators.python import get_current_context  # noqa
 
-        ti: TaskInstance = get_current_context()["ti"]
+        ti: TaskInstance = get_current_context()["ti"]  # type: ignore[assignment,unused-ignore]
 
         # log outside of stderr/out redirect
         if secrets_toml_var is None:
diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py
index 6f38fd657c..aaa19ea97d 100644
--- a/dlt/helpers/airflow_helper.py
+++ b/dlt/helpers/airflow_helper.py
@@ -255,7 +255,7 @@ def _run(
 
         # use task logger
         if self.use_task_logger:
-            ti: TaskInstance = get_current_context()["ti"]
+            ti: TaskInstance = get_current_context()["ti"]  # type: ignore[assignment,unused-ignore]
             logger.LOGGER = ti.log
 
         # set global number of buffered items
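Together with the `[mypy-airflow.*] ignore_missing_imports = True` override added in PATCH 68, this final patch appears to make the same line type-check cleanly whether or not Airflow is installed. A standalone sketch of the pattern; the reasoning about which error shows up in which environment is inferred from the patch, not stated in it:

```py
# Standalone sketch of the pattern from PATCH 68 + 71; it is meant to be type checked
# in two environments. With `ignore_missing_imports = True` for `airflow.*`:
#   * Airflow absent: the airflow imports resolve to Any, no `assignment` error is produced,
#     and `unused-ignore` stops mypy's warn-unused-ignores from failing the run.
#   * Airflow present: the `assignment` code suppresses the error mypy reports here.
from airflow.models import TaskInstance
from airflow.operators.python import get_current_context


def task_instance_from_context() -> TaskInstance:
    # must be called from inside a running Airflow task
    ti: TaskInstance = get_current_context()["ti"]  # type: ignore[assignment,unused-ignore]
    return ti
```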