From 39c0a01f58058f0f6df0ed3d4e16079ec84196c9 Mon Sep 17 00:00:00 2001
From: Julian Alves <28436330+donotpush@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:26:05 +0100
Subject: [PATCH] add databricks oauth authentication (#2138)

* add databricks oauth authentication

* improve auth databricks test

* force token-based auth for azure external location tests
---
 .../impl/databricks/configuration.py          | 12 ++++
 .../impl/databricks/sql_client.py             | 20 ++++++-
 .../dlt-ecosystem/destinations/databricks.md  | 25 ++++++++-
 poetry.lock                                   | 26 ++++++++-
 pyproject.toml                                |  3 +-
 .../test_databricks_configuration.py          | 10 ++++
 .../load/pipeline/test_databricks_pipeline.py | 56 +++++++++++++++++++
 7 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py
index c95b6eba4c..21338bd310 100644
--- a/dlt/destinations/impl/databricks/configuration.py
+++ b/dlt/destinations/impl/databricks/configuration.py
@@ -4,6 +4,7 @@
 from dlt.common.typing import TSecretStrValue
 from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec
 from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
+from dlt.common.configuration.exceptions import ConfigurationValueError
 
 DATABRICKS_APPLICATION_ID = "dltHub_dlt"
 
@@ -15,6 +16,8 @@ class DatabricksCredentials(CredentialsConfiguration):
     server_hostname: str = None
     http_path: str = None
     access_token: Optional[TSecretStrValue] = None
+    client_id: Optional[TSecretStrValue] = None
+    client_secret: Optional[TSecretStrValue] = None
     http_headers: Optional[Dict[str, str]] = None
     session_configuration: Optional[Dict[str, Any]] = None
     """Dict of session parameters that will be passed to `databricks.sql.connect`"""
@@ -27,9 +30,18 @@
         "server_hostname",
         "http_path",
         "catalog",
+        "client_id",
+        "client_secret",
         "access_token",
     ]
 
+    def on_resolved(self) -> None:
+        if not ((self.client_id and self.client_secret) or self.access_token):
+            raise ConfigurationValueError(
+                "No valid authentication method detected. Provide either 'client_id' and"
+                " 'client_secret' for OAuth, or 'access_token' for token-based authentication."
+            )
+
     def to_connector_params(self) -> Dict[str, Any]:
         conn_params = dict(
             catalog=self.catalog,
diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py
index 8bff4e0d73..16e1e73d93 100644
--- a/dlt/destinations/impl/databricks/sql_client.py
+++ b/dlt/destinations/impl/databricks/sql_client.py
@@ -11,10 +11,12 @@
     Tuple,
     Union,
     Dict,
+    cast,
+    Callable,
 )
-
-from databricks import sql as databricks_lib
+from databricks.sdk.core import Config, oauth_service_principal
+from databricks import sql as databricks_lib  # type: ignore[attr-defined]
 from databricks.sql.client import (
     Connection as DatabricksSqlConnection,
     Cursor as DatabricksSqlCursor,
@@ -73,8 +75,22 @@ def __init__(
         self._conn: DatabricksSqlConnection = None
         self.credentials = credentials
 
+    def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]:
+        config = Config(
+            host=f"https://{self.credentials.server_hostname}",
+            client_id=self.credentials.client_id,
+            client_secret=self.credentials.client_secret,
+        )
+        return cast(Callable[[], Dict[str, str]], oauth_service_principal(config))
+
     def open_connection(self) -> DatabricksSqlConnection:
         conn_params = self.credentials.to_connector_params()
+
+        if self.credentials.client_id and self.credentials.client_secret:
+            conn_params["credentials_provider"] = self._get_oauth_credentials
+        else:
+            conn_params["access_token"] = self.credentials.access_token
+
         self._conn = databricks_lib.connect(
             **conn_params, schema=self.dataset_name, use_inline_params="silent"
         )
diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index 513a3b792f..dd046ce28a 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -90,6 +90,29 @@ If you already have your Databricks workspace set up, you can skip to the [Loade
 
 Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens". Generate a new token and save it. You will use it in your `dlt` configuration.
 
+## OAuth M2M (Machine-to-Machine) Authentication
+
+You can authenticate to Databricks using a service principal via OAuth M2M. This method allows for secure, programmatic access to Databricks resources without requiring a user-managed personal access token.
+
+### Create a Service Principal in Databricks
+Follow the instructions in the Databricks documentation to create a service principal and retrieve the `client_id` and `client_secret`:
+
+[Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html)
+
+Once you have the service principal credentials, update your `secrets.toml` as shown below.
+
+### Configuration
+
+Add the following fields to your `.dlt/secrets.toml` file:
+```toml
+[destination.databricks.credentials]
+server_hostname = "MY_DATABRICKS.azuredatabricks.net"
+http_path = "/sql/1.0/warehouses/12345"
+catalog = "my_catalog"
+client_id = "XXX"
+client_secret = "XXX"
+```
+
 ## Loader setup guide
 
 **1. Initialize a project with a pipeline that loads to Databricks by running**
@@ -118,7 +141,7 @@ Example:
 
 [destination.databricks.credentials]
 server_hostname = "MY_DATABRICKS.azuredatabricks.net"
 http_path = "/sql/1.0/warehouses/12345"
-access_token = "MY_ACCESS_TOKEN"
+access_token = "MY_ACCESS_TOKEN" # Replace with client_id and client_secret when using OAuth
 catalog = "my_catalog"
 ```
diff --git a/poetry.lock b/poetry.lock
index 83090360b0..82d9bf90f8 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "about-time"
@@ -2208,6 +2208,26 @@ nr-date = ">=2.0.0,<3.0.0"
 typeapi = ">=2.0.1,<3.0.0"
 typing-extensions = ">=3.10.0"
 
+[[package]]
+name = "databricks-sdk"
+version = "0.39.0"
+description = "Databricks SDK for Python (Beta)"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "databricks_sdk-0.39.0-py3-none-any.whl", hash = "sha256:915fbf12b249264f74ddae2ca739530e3c4a9c5a454617ac403115d6466c2f99"},
+    {file = "databricks_sdk-0.39.0.tar.gz", hash = "sha256:2e04edbb9e050f4362da804fb5dad07637c5adecfcffb4d0ca8abb5aefa36d06"},
+]
+
+[package.dependencies]
+google-auth = ">=2.0,<3.0"
+requests = ">=2.28.1,<3"
+
+[package.extras]
+dev = ["autoflake", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist", "requests-mock", "wheel", "yapf"]
+notebook = ["ipython (>=8,<9)", "ipywidgets (>=8,<9)"]
+openai = ["httpx", "langchain-openai", "openai"]
+
 [[package]]
 name = "databricks-sql-connector"
 version = "2.9.6"
@@ -10680,7 +10700,7 @@ az = ["adlfs"]
 bigquery = ["db-dtypes", "gcsfs", "google-cloud-bigquery", "grpcio", "pyarrow"]
 cli = ["cron-descriptor", "pipdeptree"]
 clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyarrow", "s3fs"]
-databricks = ["databricks-sql-connector"]
+databricks = ["databricks-sdk", "databricks-sql-connector"]
 deltalake = ["deltalake", "pyarrow"]
 dremio = ["pyarrow"]
 duckdb = ["duckdb"]
@@ -10707,4 +10727,4 @@ weaviate = ["weaviate-client"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<3.13"
-content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4"
+content-hash = "5513aca05ae04d7941f2a890d0fefa86a08371508a2d319c1e558c29ff8a45f3"
diff --git a/pyproject.toml b/pyproject.toml
index bfa830cd06..d12073601d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,6 +95,7 @@ db-dtypes = { version = ">=1.2.0", optional = true }
 # pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] }
 # we will rely on manual installation of `sqlalchemy>=2.0.18` instead
 pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true }
+databricks-sdk = {version = ">=0.38.0", optional = true}
 
 [tool.poetry.extras]
 gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"]
@@ -117,7 +118,7 @@ weaviate = ["weaviate-client"]
 mssql = ["pyodbc"]
 synapse = ["pyodbc", "adlfs", "pyarrow"]
 qdrant = ["qdrant-client"]
-databricks = ["databricks-sql-connector"]
+databricks = ["databricks-sql-connector", "databricks-sdk"]
 clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"]
 dremio = ["pyarrow"]
 lancedb = ["lancedb", "pyarrow", "tantivy"]
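(An illustration, not part of the patch: the `databricks-sdk` pieces the updated extra pulls in are exactly what `_get_oauth_credentials` above uses. A minimal standalone sketch, with a hypothetical workspace hostname and service principal values:)

```python
from databricks.sdk.core import Config, oauth_service_principal

# Hypothetical workspace and service principal values.
config = Config(
    host="https://MY_DATABRICKS.azuredatabricks.net",
    client_id="XXX",
    client_secret="XXX",
)

# oauth_service_principal(config) returns a credentials provider; the
# databricks-sql-connector accepts it through the `credentials_provider`
# connection parameter, which is how open_connection() wires it up above.
provider = oauth_service_principal(config)
```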
diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py
index e27da4db2a..8b3beed2b3 100644
--- a/tests/load/databricks/test_databricks_configuration.py
+++ b/tests/load/databricks/test_databricks_configuration.py
@@ -4,6 +4,7 @@
 pytest.importorskip("databricks")
 
 from dlt.common.exceptions import TerminalValueError
+from dlt.common.configuration.exceptions import ConfigurationValueError
 from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob
 from dlt.common.configuration import resolve_configuration
 
@@ -86,3 +87,12 @@ def test_databricks_abfss_converter() -> None:
         abfss_url
         == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet"
     )
+
+
+def test_databricks_auth_invalid() -> None:
+    with pytest.raises(ConfigurationValueError, match="No valid authentication method detected.*"):
+        os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = ""
+        os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = ""
+        os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = ""
+        bricks = databricks()
+        bricks.configuration(None, accept_partial=True)
diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py
index e802cde693..078dce3a7f 100644
--- a/tests/load/pipeline/test_databricks_pipeline.py
+++ b/tests/load/pipeline/test_databricks_pipeline.py
@@ -2,6 +2,7 @@
 import os
 
 from dlt.common.utils import uniq_id
+from dlt.destinations import databricks
 from tests.load.utils import (
     GCS_BUCKET,
     DestinationTestConfiguration,
@@ -23,6 +24,10 @@
     ids=lambda x: x.name,
 )
 def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None:
+    # force token-based authentication
+    os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = ""
+    os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = ""
+
     # do not interfere with state
     os.environ["RESTORE_FROM_DESTINATION"] = "False"
     # let the package complete even with failed jobs
@@ -145,3 +150,54 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon
     assert (
         "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message
     )
+
+
+@pytest.mark.parametrize(
+    "destination_config",
+    destinations_configs(default_sql_configs=True, subset=("databricks",)),
+    ids=lambda x: x.name,
+)
+def test_databricks_auth_oauth(destination_config: DestinationTestConfiguration) -> None:
+    os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = ""
+    bricks = databricks()
+    config = bricks.configuration(None, accept_partial=True)
+    assert config.credentials.client_id and config.credentials.client_secret
+    assert not config.credentials.access_token
+
+    dataset_name = "test_databricks_oauth" + uniq_id()
+    pipeline = destination_config.setup_pipeline(
+        "test_databricks_oauth", dataset_name=dataset_name, destination=bricks
+    )
+
+    info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs)
+    assert info.has_failed_jobs is False
+
+    with pipeline.sql_client() as client:
+        rows = client.execute_sql(f"select * from {dataset_name}.digits")
+        assert len(rows) == 3
+
+
+@pytest.mark.parametrize(
+    "destination_config",
+    destinations_configs(default_sql_configs=True, subset=("databricks",)),
+    ids=lambda x: x.name,
+)
+def test_databricks_auth_token(destination_config: DestinationTestConfiguration) -> None:
+    os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = ""
os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.access_token + assert not (config.credentials.client_secret and config.credentials.client_id) + + dataset_name = "test_databricks_token" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_token", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3