allows naming conventions to be changed #998

Merged (114 commits, Jun 26, 2024)

Commits
93995b9
allows to decorate async function with dlt.source
rudolfix Feb 20, 2024
4444878
adds pytest-async and updates pytest to 7.x
rudolfix Feb 20, 2024
b3b70f6
fixes forked teardown issue 7.x
rudolfix Feb 20, 2024
f5d7a0a
bumps deps for py 3.12
rudolfix Feb 21, 2024
83dc38a
adds py 12 common tests
rudolfix Feb 21, 2024
21ebfee
fixes typings after deps bump
rudolfix Feb 21, 2024
7985f9d
bumps airflow, yanks duckdb to 0.9.2
rudolfix Feb 21, 2024
07f285e
fixes tests
rudolfix Feb 21, 2024
06e441e
fixes pandas version
rudolfix Feb 21, 2024
3e846a1
adds 3.12 duckdb dep
rudolfix Feb 22, 2024
37b4a31
Merge branch 'devel' into rfix/enables-async-source
rudolfix Feb 22, 2024
934c167
Merge branch 'devel' into rfix/enables-async-source
rudolfix Feb 24, 2024
7fa574d
adds right hand pipe operator
rudolfix Feb 24, 2024
8c7942d
fixes docker ci build
rudolfix Feb 24, 2024
f951fc0
adds docs on async sources and resources
rudolfix Feb 24, 2024
387a7c7
normalizes default hints and preferred types in schema
rudolfix Feb 25, 2024
88728e1
defines pipeline state table in utils, column normalization in simple…
rudolfix Feb 25, 2024
1a53425
normalizes all identifiers used by relational normalizer, fixes other…
rudolfix Feb 25, 2024
8835023
fixes sql job client to use normalized identifiers in queries
rudolfix Feb 25, 2024
f4c504f
runs state sync tests for lower and upper case naming conventions
rudolfix Feb 25, 2024
874cc29
fixes weaviate to use normalized identifiers in queries
rudolfix Feb 25, 2024
c4e9f35
partially fixes qdrant incorrect state and version retrieval queries
rudolfix Feb 25, 2024
6345377
initial sql uppercase naming convention
rudolfix Feb 25, 2024
96a02ff
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Mar 8, 2024
aef8cc2
adds native df readers to databricks and bigquery
rudolfix Mar 9, 2024
a53c00b
adds casing identifier capability to support different casing in nami…
rudolfix Mar 9, 2024
91f5780
cleans typing for relational normalizer
rudolfix Mar 9, 2024
5984824
renames escape functions
rudolfix Mar 18, 2024
3458441
destination capabilities for case fold and case sensitivity
rudolfix Mar 18, 2024
55362b0
drops supports naming module and allows naming to be instance in conf…
rudolfix Mar 18, 2024
b836dfe
checks all tables in information schema in one go, observes case fold…
rudolfix Mar 18, 2024
e50bfaa
moves schema verification to destination utils
rudolfix Mar 18, 2024
42d149f
adds method to remove processing hints from schema, helper functions …
rudolfix Mar 18, 2024
c53808f
accepts naming convention instances when resolving configs
rudolfix Mar 18, 2024
b97ae53
fixes the cloning of schema in decorator, removes processing hints
rudolfix Mar 18, 2024
0132c2f
removes processing hints when saving imported schema
rudolfix Mar 18, 2024
2a7c5dd
adds docs on naming conventions, removes technical docs
rudolfix Mar 18, 2024
d502c7c
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Jun 4, 2024
3e7504b
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Jun 6, 2024
3bb929f
adds casing info to databrick caps, makes caps an instance attr
rudolfix Jun 11, 2024
9f0920c
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Jun 11, 2024
724dc15
adjusts destination casing in caps from schema naming and config
rudolfix Jun 11, 2024
b58a118
raises detailed schema identifier clash exceptions
rudolfix Jun 11, 2024
d190ea1
adds is_case_sensitive and name to NamingConvention
rudolfix Jun 11, 2024
b445654
adds sanity check if _dlt prefix is preserved
rudolfix Jun 11, 2024
ee8a95b
finds genric types in non generic classes deriving from generic
rudolfix Jun 11, 2024
eb30838
uses casefold INSERT VALUES job column names
rudolfix Jun 11, 2024
558db91
adds a method make_qualified_table_name_path that calculates componen…
rudolfix Jun 11, 2024
dea9669
adds casing info to destinations, caps as instance attrs, custom tabl…
rudolfix Jun 11, 2024
b1e2b09
adds naming convention to restore state tests, make them essential
rudolfix Jun 11, 2024
210be70
fixes table builder tests
rudolfix Jun 11, 2024
95b703d
removes processing hints when exporting schema to import folder, warn…
rudolfix Jun 12, 2024
4b72b77
allows to subclass INFO SCHEMA query generation and uses specialized …
rudolfix Jun 12, 2024
ab39e06
uses correct schema escaping function in sql jobs
rudolfix Jun 12, 2024
2ae3ad2
passes pipeline state to package state via extract
rudolfix Jun 12, 2024
09b7731
fixes optional normalizers module
rudolfix Jun 12, 2024
cfd3e5f
excludes version_hash from pipeline state SELECT
rudolfix Jun 12, 2024
0edbbfd
passes pipeline state to package state pt.2
rudolfix Jun 12, 2024
5769ba1
re-enables sentry tests
rudolfix Jun 12, 2024
1f17a44
bumps qdrant client, makes test running for local version
rudolfix Jun 12, 2024
71e418b
makes weaviate running
rudolfix Jun 12, 2024
ce414e1
uses schemata to find databases on athena
rudolfix Jun 13, 2024
bde61a9
uses api get_table for hidden dataset on bigquery to reflect schemas,…
rudolfix Jun 13, 2024
036e3dd
adds naming conventions to two restore state tests
rudolfix Jun 13, 2024
8546763
fixes escape identifiers to column escape
rudolfix Jun 13, 2024
f57e286
fix conflicts in docs
rudolfix Jun 13, 2024
cf50bd4
adjusts capabilities in capabilities() method, uses config and naming…
rudolfix Jun 15, 2024
72969ce
allows to add props to classes without vectorizer in weaviate
rudolfix Jun 15, 2024
656d5fc
moves caps function into factories, cleansup adapters and custom dest…
rudolfix Jun 15, 2024
bbd7fe6
sentry_dsn
rudolfix Jun 15, 2024
a671508
adds basic destination reference tests
rudolfix Jun 15, 2024
81e0db9
fixes table builder tests
rudolfix Jun 15, 2024
8a32793
fix deps and docs
rudolfix Jun 15, 2024
0dc6dc8
fixes more tests
rudolfix Jun 16, 2024
4a39795
case sensitivity docs stubs
rudolfix Jun 17, 2024
43d6d5f
fixes drop_pipeline fixture
rudolfix Jun 17, 2024
e3d998c
improves partial config generation for capabilities
rudolfix Jun 17, 2024
3aef3fd
adds snowflake csv support
rudolfix Jun 17, 2024
6df7a34
creates separate csv tests
rudolfix Jun 17, 2024
57aec2e
allows to import files into extract storage, adds import file writer …
rudolfix Jun 19, 2024
fee7af5
handles ImportFileMeta in extractor
rudolfix Jun 19, 2024
96c7222
adds import file item normalizer and router to normalize
rudolfix Jun 19, 2024
116add0
supports csv format config for snowflake
rudolfix Jun 19, 2024
42eacaf
removes realpath wherever possible and adds fast make_full_path to Fi…
rudolfix Jun 20, 2024
3793d06
adds additional methods to load_package storage to make listings faster
rudolfix Jun 20, 2024
88eec9c
adds file_format to dlt.resource, uses preferred file format for dlt …
rudolfix Jun 21, 2024
8e0f0a8
docs for importing files, file_format
rudolfix Jun 21, 2024
b1c095c
code improvements and tests
rudolfix Jun 21, 2024
46ec732
docs hard links note
rudolfix Jun 21, 2024
2194b18
Merge pull request #1479 from dlt-hub/feat/snowflake-csv-support
rudolfix Jun 21, 2024
1384ed3
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Jun 21, 2024
b00cbb2
moves loader parallelism test to pipeliens, solves duckdb ci test err…
rudolfix Jun 23, 2024
a530345
fixes tests
rudolfix Jun 23, 2024
4271895
moves drop_pipeline fixture level up
rudolfix Jun 23, 2024
abd02df
drops default naming convention from caps so naming in saved schema p…
rudolfix Jun 24, 2024
14b4b0e
unifies all representations of pipeline state
rudolfix Jun 24, 2024
60e45b1
tries to decompress text file first in fs_client
rudolfix Jun 24, 2024
a84be2a
tests get stored state in test_job_client
rudolfix Jun 24, 2024
1dc7a09
removes credentials from dlt.attach, addes destination and staging fa…
rudolfix Jun 24, 2024
ab69b76
cleans up env variables and pipeline dropping fixutere precedence
rudolfix Jun 24, 2024
0eeb21d
Merge branch 'devel' into rfix/allows-naming-conventions
rudolfix Jun 24, 2024
f1097d8
removes dev_mode from dlt.attach
rudolfix Jun 24, 2024
3855fcc
adds missing arguments to filesystem factory
rudolfix Jun 24, 2024
651412e
fixes tests
rudolfix Jun 24, 2024
aab36e1
updates destination and naming convention docs
rudolfix Jun 25, 2024
7294aae
removes is_case_sensitive from naming convention initializer
rudolfix Jun 26, 2024
dc10473
simplifies with_file_import mark
rudolfix Jun 26, 2024
727a35e
adds case sensitivity tests
rudolfix Jun 26, 2024
4cb2646
uses dev_mode everywhere
rudolfix Jun 26, 2024
f098e5a
improves csv docs
rudolfix Jun 26, 2024
1521778
fixes encodings in fsspec
rudolfix Jun 26, 2024
796483e
improves naming convention docs
rudolfix Jun 26, 2024
534c7f8
fixes tests and renames clash to collision
rudolfix Jun 26, 2024
5f4cb4c
fixes getting original bases from instance
rudolfix Jun 26, 2024
Files changed
3 changes: 2 additions & 1 deletion dlt/destinations/impl/athena/__init__.py
@@ -11,7 +11,8 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.preferred_staging_file_format = "parquet"
caps.supported_staging_file_formats = ["parquet", "jsonl"]
caps.escape_identifier = escape_athena_identifier
caps.case_identifier = str.lower
caps.casefold_identifier = str.lower
caps.has_case_sensitive_identifiers = False
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 255
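The hunk above shows the pattern this PR applies to every destination module: the old caps.case_identifier attribute is replaced by caps.casefold_identifier, and a new caps.has_case_sensitive_identifiers flag records whether the engine distinguishes identifier casing. A minimal sketch of that pattern, using only the attributes visible in this diff (the function name is hypothetical):

from dlt.common.destination import DestinationCapabilitiesContext

def athena_like_capabilities() -> DestinationCapabilitiesContext:
    # sketch only: mirrors the capability fields set in the hunk above
    caps = DestinationCapabilitiesContext()
    caps.casefold_identifier = str.lower          # how the engine folds unquoted identifiers
    caps.has_case_sensitive_identifiers = False   # folded identifiers cannot collide by case
    return caps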
8 changes: 6 additions & 2 deletions dlt/destinations/impl/bigquery/__init__.py
@@ -1,4 +1,4 @@
from dlt.common.data_writers.escape import escape_bigquery_identifier
from dlt.common.data_writers.escape import escape_hive_identifier
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE

@@ -9,8 +9,11 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_loader_file_formats = ["jsonl", "parquet"]
caps.preferred_staging_file_format = "parquet"
caps.supported_staging_file_formats = ["parquet", "jsonl"]
caps.escape_identifier = escape_bigquery_identifier
caps.escape_identifier = escape_hive_identifier
caps.escape_literal = None
caps.has_case_sensitive_identifiers = (
True # there are case insensitive identifiers but dlt does not use them
)
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (76, 38)
caps.max_identifier_length = 1024
@@ -21,5 +24,6 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.is_max_text_data_type_length_in_bytes = True
caps.supports_ddl_transactions = False
caps.supports_clone_table = True
caps.schema_supports_numeric_precision = False # no precision information in BigQuery

return caps
43 changes: 10 additions & 33 deletions dlt/destinations/impl/bigquery/bigquery.py
@@ -43,17 +43,18 @@
from dlt.destinations.job_impl import NewReferenceJob
from dlt.destinations.sql_jobs import SqlMergeJob
from dlt.destinations.type_mapping import TypeMapper
from dlt.destinations.utils import parse_db_data_type_str_with_precision


class BigQueryTypeMapper(TypeMapper):
sct_to_unbound_dbt = {
"complex": "JSON",
"text": "STRING",
"double": "FLOAT64",
"bool": "BOOLEAN",
"bool": "BOOL",
"date": "DATE",
"timestamp": "TIMESTAMP",
"bigint": "INTEGER",
"bigint": "INT64",
"binary": "BYTES",
"wei": "BIGNUMERIC", # non-parametrized should hold wei values
"time": "TIME",
@@ -66,11 +67,11 @@ class BigQueryTypeMapper(TypeMapper):

dbt_to_sct = {
"STRING": "text",
"FLOAT": "double",
"BOOLEAN": "bool",
"FLOAT64": "double",
"BOOL": "bool",
"DATE": "date",
"TIMESTAMP": "timestamp",
"INTEGER": "bigint",
"INT64": "bigint",
"BYTES": "binary",
"NUMERIC": "decimal",
"BIGNUMERIC": "decimal",
@@ -89,9 +90,10 @@ def to_db_decimal_type(self, precision: Optional[int], scale: Optional[int]) ->
def from_db_type(
self, db_type: str, precision: Optional[int], scale: Optional[int]
) -> TColumnType:
if db_type == "BIGNUMERIC" and precision is None:
# precision is present in the type name
if db_type == "BIGNUMERIC":
return dict(data_type="wei")
return super().from_db_type(db_type, precision, scale)
return super().from_db_type(*parse_db_data_type_str_with_precision(db_type))


class BigQueryLoadJob(LoadJob, FollowupJob):
@@ -231,7 +233,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
reason = BigQuerySqlClient._get_reason_from_errors(gace)
if reason == "notFound":
# google.api_core.exceptions.NotFound: 404 – table not found
raise UnknownTableException(table["name"]) from gace
raise UnknownTableException(self.schema.name, table["name"]) from gace
elif (
reason == "duplicate"
): # google.api_core.exceptions.Conflict: 409 PUT – already exists
@@ -337,31 +339,6 @@ def _get_column_def_sql(self, column: TColumnSchema, table_format: TTableFormat
column_def_sql += " OPTIONS (rounding_mode='ROUND_HALF_AWAY_FROM_ZERO')"
return column_def_sql

def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]:
schema_table: TTableSchemaColumns = {}
try:
table = self.sql_client.native_connection.get_table(
self.sql_client.make_qualified_table_name(table_name, escape=False),
retry=self.sql_client._default_retry,
timeout=self.config.http_timeout,
)
partition_field = table.time_partitioning.field if table.time_partitioning else None
for c in table.schema:
schema_c: TColumnSchema = {
"name": c.name,
"nullable": c.is_nullable,
"unique": False,
"sort": False,
"primary_key": False,
"foreign_key": False,
"cluster": c.name in (table.clustering_fields or []),
"partition": c.name == partition_field,
**self._from_db_type(c.field_type, c.precision, c.scale),
}
schema_table[c.name] = schema_c
return True, schema_table
except gcp_exceptions.NotFound:
return False, schema_table

def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.LoadJob:
# append to table for merge loads (append to stage) and regular appends.
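In the BigQueryTypeMapper hunk above, from_db_type now delegates to parse_db_data_type_str_with_precision (imported from dlt.destinations.utils), because reflected BigQuery types can carry precision inside the type name, e.g. BIGNUMERIC(76, 38). The helper's actual behavior is not shown in this diff; the sketch below is only an assumption about what such a parser does:

import re
from typing import Optional, Tuple

def parse_type_with_precision_sketch(db_type: str) -> Tuple[str, Optional[int], Optional[int]]:
    # "BIGNUMERIC(76, 38)" -> ("BIGNUMERIC", 76, 38); "STRING" -> ("STRING", None, None)
    match = re.match(r"^\s*([A-Za-z0-9_]+)\s*(?:\((\d+)(?:\s*,\s*(\d+))?\))?\s*$", db_type)
    if not match:
        return db_type, None, None
    name, precision, scale = match.groups()
    return name, int(precision) if precision else None, int(scale) if scale else None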
4 changes: 2 additions & 2 deletions dlt/destinations/impl/bigquery/sql_client.py
@@ -234,8 +234,8 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
conn.close()

def fully_qualified_dataset_name(self, escape: bool = True) -> str:
project_id = self.capabilities.case_identifier(self.credentials.project_id)
dataset_name = self.capabilities.case_identifier(self.dataset_name)
project_id = self.capabilities.casefold_identifier(self.credentials.project_id)
dataset_name = self.capabilities.casefold_identifier(self.dataset_name)
if escape:
project_id = self.capabilities.escape_identifier(project_id)
dataset_name = self.capabilities.escape_identifier(dataset_name)
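fully_qualified_dataset_name now folds the project and dataset names with casefold_identifier before optionally escaping them. The order matters: escaping produces a quoted, case-preserving identifier, so folding has to happen first to match how the engine stores unquoted names. A small illustration of that two-step, with stand-in callables for the two capabilities:

def qualify(name: str, casefold=str.lower, escape=lambda s: f'"{s}"', should_escape: bool = True) -> str:
    folded = casefold(name)                             # 1. fold casing first
    return escape(folded) if should_escape else folded  # 2. only then quote for SQL

# qualify("My_Dataset") == '"my_dataset"'; escaping before folding would keep the mixed
# case inside the quotes and could miss the dataset the engine actually created.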
3 changes: 1 addition & 2 deletions dlt/destinations/impl/databricks/__init__.py
@@ -2,8 +2,6 @@
from dlt.common.data_writers.escape import escape_databricks_identifier, escape_databricks_literal
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE

from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration


def capabilities() -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext()
@@ -13,6 +11,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_staging_file_formats = ["jsonl", "parquet"]
caps.escape_identifier = escape_databricks_identifier
caps.escape_literal = escape_databricks_literal
caps.has_case_sensitive_identifiers = False
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 255
2 changes: 1 addition & 1 deletion dlt/destinations/impl/databricks/databricks.py
@@ -316,7 +316,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non

def _get_storage_table_query_columns(self) -> List[str]:
fields = super()._get_storage_table_query_columns()
fields[1] = ( # Override because this is the only way to get data type with precision
fields[2] = ( # Override because this is the only way to get data type with precision
"full_data_type"
)
return fields
4 changes: 2 additions & 2 deletions dlt/destinations/impl/databricks/sql_client.py
@@ -134,8 +134,8 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB
yield DatabricksCursorImpl(curr) # type: ignore[abstract]

def fully_qualified_dataset_name(self, escape: bool = True) -> str:
catalog = self.capabilities.case_identifier(self.credentials.catalog)
dataset_name = self.capabilities.case_identifier(self.dataset_name)
catalog = self.capabilities.casefold_identifier(self.credentials.catalog)
dataset_name = self.capabilities.casefold_identifier(self.dataset_name)
if escape:
catalog = self.capabilities.escape_identifier(catalog)
dataset_name = self.capabilities.escape_identifier(dataset_name)
1 change: 1 addition & 0 deletions dlt/destinations/impl/duckdb/__init__.py
@@ -11,6 +11,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_staging_file_formats = []
caps.escape_identifier = escape_postgres_identifier
caps.escape_literal = escape_duckdb_literal
caps.has_case_sensitive_identifiers = False
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 65536
1 change: 1 addition & 0 deletions dlt/destinations/impl/dummy/__init__.py
@@ -28,6 +28,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_loader_file_formats = additional_formats + [config.loader_file_format]
caps.preferred_staging_file_format = None
caps.supported_staging_file_formats = additional_formats + [config.loader_file_format]
caps.has_case_sensitive_identifiers = True
caps.max_identifier_length = 127
caps.max_column_identifier_length = 127
caps.max_query_length = 8 * 1024 * 1024
1 change: 1 addition & 0 deletions dlt/destinations/impl/motherduck/__init__.py
@@ -9,6 +9,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_loader_file_formats = ["parquet", "insert_values", "jsonl"]
caps.escape_identifier = escape_postgres_identifier
caps.escape_literal = escape_duckdb_literal
caps.has_case_sensitive_identifiers = False
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 65536
2 changes: 1 addition & 1 deletion dlt/destinations/impl/motherduck/sql_client.py
@@ -31,7 +31,7 @@ def __init__(self, dataset_name: str, credentials: MotherDuckCredentials) -> Non

def fully_qualified_dataset_name(self, escape: bool = True) -> str:
dataset_name = super().fully_qualified_dataset_name(escape)
database_name = self.capabilities.case_identifier(self.database_name)
database_name = self.capabilities.casefold_identifier(self.database_name)
if escape:
database_name = self.capabilities.escape_identifier(database_name)
return f"{database_name}.{dataset_name}"
4 changes: 2 additions & 2 deletions dlt/destinations/impl/mssql/sql_client.py
@@ -95,14 +95,14 @@ def drop_dataset(self) -> None:
# Drop all views
rows = self.execute_sql(
"SELECT table_name FROM information_schema.views WHERE table_schema = %s;",
self.capabilities.case_identifier(self.dataset_name),
self.capabilities.casefold_identifier(self.dataset_name),
)
view_names = [row[0] for row in rows]
self._drop_views(*view_names)
# Drop all tables
rows = self.execute_sql(
"SELECT table_name FROM information_schema.tables WHERE table_schema = %s;",
self.capabilities.case_identifier(self.dataset_name),
self.capabilities.casefold_identifier(self.dataset_name),
)
table_names = [row[0] for row in rows]
self.drop_tables(*table_names)
3 changes: 1 addition & 2 deletions dlt/destinations/impl/postgres/__init__.py
@@ -1,6 +1,5 @@
from dlt.common.data_writers.escape import escape_postgres_identifier, escape_postgres_literal
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
from dlt.common.wei import EVM_DECIMAL_PRECISION

@@ -14,7 +13,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_staging_file_formats = []
caps.escape_identifier = escape_postgres_identifier
caps.escape_literal = escape_postgres_literal
caps.case_identifier = str.lower
caps.has_case_sensitive_identifiers = True
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (2 * EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION)
caps.max_identifier_length = 63
2 changes: 1 addition & 1 deletion dlt/destinations/impl/qdrant/__init__.py
@@ -6,7 +6,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext()
caps.preferred_loader_file_format = "jsonl"
caps.supported_loader_file_formats = ["jsonl"]

caps.has_case_sensitive_identifiers = True
caps.max_identifier_length = 200
caps.max_column_identifier_length = 1024
caps.max_query_length = 8 * 1024 * 1024
9 changes: 7 additions & 2 deletions dlt/destinations/impl/qdrant/qdrant_client.py
@@ -3,7 +3,11 @@

from dlt.common import json, pendulum, logger
from dlt.common.schema import Schema, TTableSchema, TSchemaTables
from dlt.common.schema.utils import get_columns_names_with_prop, pipeline_state_table
from dlt.common.schema.utils import (
get_columns_names_with_prop,
normalize_table_identifiers,
pipeline_state_table,
)
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync
from dlt.common.storages import FileStorage
@@ -152,7 +156,8 @@ def __init__(self, schema: Schema, config: QdrantClientConfiguration) -> None:
)
# get definition of state table (may not be present in the schema)
state_table = schema.tables.get(
schema.state_table_name, schema.normalize_table_identifiers(pipeline_state_table())
schema.state_table_name,
normalize_table_identifiers(pipeline_state_table(), schema.naming),
)
# column names are pipeline properties
self.pipeline_state_properties = list(state_table["columns"].keys())
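In the Qdrant client hunk above, normalize_table_identifiers is now imported from dlt.common.schema.utils and receives the naming convention explicitly (schema.naming) instead of being called as a Schema method. A sketch of the call site as this diff uses it; anything beyond the two arguments shown here is an assumption:

from dlt.common.schema.utils import normalize_table_identifiers, pipeline_state_table

def state_table_definition(schema):
    # fall back to a freshly normalized state table when the schema does not define one
    return schema.tables.get(
        schema.state_table_name,
        normalize_table_identifiers(pipeline_state_table(), schema.naming),
    )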
3 changes: 2 additions & 1 deletion dlt/destinations/impl/redshift/__init__.py
@@ -11,7 +11,8 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.supported_staging_file_formats = ["jsonl", "parquet"]
caps.escape_identifier = escape_redshift_identifier
caps.escape_literal = escape_redshift_literal
caps.case_identifier = str.lower
caps.casefold_identifier = str.lower
caps.has_case_sensitive_identifiers = False
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 127
3 changes: 2 additions & 1 deletion dlt/destinations/impl/snowflake/__init__.py
@@ -10,7 +10,8 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.preferred_staging_file_format = "jsonl"
caps.supported_staging_file_formats = ["jsonl", "parquet"]
caps.escape_identifier = escape_snowflake_identifier
caps.case_identifier = str.upper
caps.casefold_identifier = str.upper
caps.has_case_sensitive_identifiers = True
caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE)
caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0)
caps.max_identifier_length = 255
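Snowflake is the one destination in this diff that both folds unquoted identifiers up (casefold_identifier = str.upper) and reports case-sensitive identifiers, whereas Redshift and Athena fold down and are case insensitive. A tiny illustration of how those folding directions affect dlt's internal table names (the system table names below are the usual dlt ones and are only used as examples):

# Snowflake: fold up, quoted identifiers stay case sensitive
# Redshift/Athena: fold down, no case sensitivity
snowflake_fold, redshift_fold = str.upper, str.lower
print(snowflake_fold("_dlt_pipeline_state"))  # _DLT_PIPELINE_STATE
print(redshift_fold("_DLT_PIPELINE_STATE"))   # _dlt_pipeline_state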
1 change: 1 addition & 0 deletions dlt/destinations/impl/synapse/__init__.py
@@ -18,6 +18,7 @@ def capabilities() -> DestinationCapabilitiesContext:

caps.escape_identifier = escape_postgres_identifier
caps.escape_literal = escape_mssql_literal
caps.has_case_sensitive_identifiers = False

# Synapse has a max precision of 38
# https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#DataTypes
2 changes: 1 addition & 1 deletion dlt/destinations/impl/weaviate/__init__.py
@@ -6,7 +6,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps = DestinationCapabilitiesContext()
caps.preferred_loader_file_format = "jsonl"
caps.supported_loader_file_formats = ["jsonl"]

caps.has_case_sensitive_identifiers = False
caps.max_identifier_length = 200
caps.max_column_identifier_length = 1024
caps.max_query_length = 8 * 1024 * 1024
15 changes: 12 additions & 3 deletions dlt/destinations/impl/weaviate/weaviate_client.py
@@ -29,7 +29,11 @@
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.schema import Schema, TTableSchema, TSchemaTables, TTableSchemaColumns
from dlt.common.schema.typing import TColumnSchema, TColumnType
from dlt.common.schema.utils import get_columns_names_with_prop, pipeline_state_table
from dlt.common.schema.utils import (
get_columns_names_with_prop,
normalize_table_identifiers,
pipeline_state_table,
)
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync
from dlt.common.data_types import TDataType
@@ -243,7 +247,8 @@ def __init__(self, schema: Schema, config: WeaviateClientConfiguration) -> None:
)
# get definition of state table (may not be present in the schema)
state_table = schema.tables.get(
schema.state_table_name, schema.normalize_table_identifiers(pipeline_state_table())
schema.state_table_name,
normalize_table_identifiers(pipeline_state_table(), schema.naming),
)
# column names are pipeline properties
self.pipeline_state_properties = list(state_table["columns"].keys())
@@ -453,7 +458,11 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None:
for table_name in only_tables or self.schema.tables:
exists, existing_columns = self.get_storage_table(table_name)
# TODO: detect columns where vectorization was added or removed and modify it. currently we ignore change of hints
new_columns = self.schema.get_new_table_columns(table_name, existing_columns)
new_columns = self.schema.get_new_table_columns(
table_name,
existing_columns,
case_sensitive=self.capabilities.has_case_sensitive_identifiers,
)
logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}")
if len(new_columns) > 0:
if exists:
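The last hunk passes case_sensitive=self.capabilities.has_case_sensitive_identifiers into schema.get_new_table_columns, so a schema column that differs from an existing Weaviate property only by case is not treated as new. The actual comparison lives inside Schema; the sketch below only illustrates what a case-insensitive column diff of that kind could look like:

from typing import Any, Dict

def new_columns_sketch(
    schema_columns: Dict[str, Any],
    existing_columns: Dict[str, Any],
    case_sensitive: bool,
) -> Dict[str, Any]:
    # fold names with str.casefold when the destination cannot tell casings apart
    fold = (lambda name: name) if case_sensitive else str.casefold
    existing = {fold(name) for name in existing_columns}
    return {name: col for name, col in schema_columns.items() if fold(name) not in existing}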