[Databricks] Supporting OAuth & Serverless compute #127
```diff
@@ -57,6 +57,7 @@ spark_connect_method.spark_method_databricks_connect <- function(
     config,
     app_name,
     version = NULL,
+    serverless = FALSE,
     hadoop_version,
     extensions,
     scala_version,
```
```diff
@@ -71,22 +72,28 @@ spark_connect_method.spark_method_databricks_connect <- function(
   method <- method[[1]]
   token <- databricks_token(token, fail = FALSE)
```

Review thread on `token <- databricks_token(token, fail = FALSE)`:

Reviewer: Based on your comment on line 137, I think we should remove this line, and have […]

Author: My thinking for leaving this was that users explicitly setting the […] I was expecting the hierarchy to be: […] Where 1-4 are handled by […] (see the sketch after this thread).
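The numbered hierarchy in the author's comment is truncated in this capture, so the concrete levels are unknown. As a rough illustration only, a resolution chain of that shape could look like the sketch below; the order and the sources (explicit argument, environment variable, IDE API) are assumptions, informed by the `.rs.api.getDatabricks*` functions mentioned later in this review:

```r
# Illustrative sketch only -- the actual 1-4 items are truncated above.
resolve_token <- function(token = NULL) {
  # 1. token passed explicitly by the user
  if (!is.null(token) && nzchar(token)) return(token)
  # 2. DATABRICKS_TOKEN environment variable
  env_token <- Sys.getenv("DATABRICKS_TOKEN")
  if (nzchar(env_token)) return(env_token)
  # 3. IDE-managed credentials (e.g. RStudio's .rs.api.getDatabricksToken)
  if (exists(".rs.api.getDatabricksToken")) {
    ide_token <- get(".rs.api.getDatabricksToken")(Sys.getenv("DATABRICKS_HOST"))
    if (!is.null(ide_token) && nzchar(ide_token)) return(ide_token)
  }
  # 4+. fall through and let the Databricks SDK's own detection run
  ""
}
```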
```diff
   cluster_id <- cluster_id %||% Sys.getenv("DATABRICKS_CLUSTER_ID")
+  cli_path <- Sys.getenv("DATABRICKS_CLI_PATH", "databricks")
   master <- databricks_host(master, fail = FALSE)
   if (host_sanitize && master != "") {
     master <- sanitize_host(master, silent)
   }

+  # if serverless is TRUE, cluster_id is overruled (set to NULL)
   cluster_info <- NULL
-  if (cluster_id != "" && master != "" && token != "") {
-    cluster_info <- databricks_dbr_version_name(
-      cluster_id = cluster_id,
-      host = master,
-      token = token,
-      silent = silent
-    )
-    if (is.null(version)) {
-      version <- cluster_info$version
+  if (!serverless) {
+    if (cluster_id != "" && master != "" && token != "") {
+      cluster_info <- databricks_dbr_version_name(
+        cluster_id = cluster_id,
+        host = master,
+        token = token,
+        silent = silent
+      )
+      if (is.null(version)) {
+        version <- cluster_info$version
+      }
     }
+  } else {
+    cluster_id <- NULL
   }

   envname <- use_envname(
```
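For context, here is how a connection might be requested once the new argument lands. This is a hypothetical call: the `"databricks_connect"` method string follows the existing `spark_connect()` interface, and `serverless = TRUE` is the parameter this hunk adds:

```r
library(sparklyr)

# Hypothetical usage of the new argument added in this PR; with
# serverless = TRUE, the branch above sets cluster_id to NULL entirely.
sc <- spark_connect(
  method     = "databricks_connect",
  master     = Sys.getenv("DATABRICKS_HOST"),
  serverless = TRUE
)
```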
```diff
@@ -102,34 +109,61 @@ spark_connect_method.spark_method_databricks_connect <- function(
     return(invisible)
   }

-  db <- import_check("databricks.connect", envname, silent)
+  dbc <- import_check("databricks.connect", envname, silent)
+  db_sdk <- import_check("databricks.sdk", envname, silent = TRUE)

   if (!is.null(cluster_info)) {
     msg <- "{.header Connecting to} {.emph '{cluster_info$name}'}"
     msg_done <- "{.header Connected to:} {.emph '{cluster_info$name}'}"
     master_label <- glue("{cluster_info$name} ({cluster_id})")
-  } else {
+  } else if (!serverless) {
     msg <- "{.header Connecting to} {.emph '{cluster_id}'}"
     msg_done <- "{.header Connected to:} '{.emph '{cluster_id}'}'"
     master_label <- glue("Databricks Connect - Cluster: {cluster_id}")
+  } else if (serverless) {
+    msg <- "{.header Connecting to} {.emph 'serverless'}"
+    msg_done <- "{.header Connected to:} '{.emph 'serverless'}'"
+    master_label <- glue("Databricks Connect - Cluster: serverless")
   }

   if (!silent) {
     cli_div(theme = cli_colors())
     cli_progress_step(msg, msg_done)
   }

-  remote_args <- list()
-  if (master != "") remote_args$host <- master
-  if (token != "") remote_args$token <- token
-  if (cluster_id != "") remote_args$cluster_id <- cluster_id
+  # sdk config
+  conf_args <- list(host = master)
+  # if token is found, propagate;
+  # otherwise trust the sdk to detect and do what it can?
```

Review thread on the `conf_args` comment:

Reviewer: Yes, if we remove line 72, then this […]
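If the explicit token plumbing were removed as discussed, the `Config()` call would presumably be reduced to the host alone, leaving credential detection to the SDK's unified auth. A minimal sketch of that branch (assuming `databricks.sdk` is installed in the active Python environment):

```r
library(reticulate)

# Reticulate handle to databricks-sdk-py, as the diff imports it.
db_sdk <- import("databricks.sdk")

# No token/auth_type supplied: Config() then runs the SDK's own credential
# detection (env vars, ~/.databrickscfg profiles, cached OAuth tokens).
sdk_config <- db_sdk$core$Config(host = Sys.getenv("DATABRICKS_HOST"))
```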
```diff
+  if (token != "") {
+    conf_args$token <- token
+    conf_args$auth_type <- "pat"
+  }

+  if (serverless) {
+    conf_args$serverless_compute_id <- "auto"
+  } else {
+    conf_args$cluster_id <- cluster_id
+  }

-  databricks_session <- function(...) {
-    user_agent <- build_user_agent()
-    db$DatabricksSession$builder$remote(...)$userAgent(user_agent)
-  }
+  sdk_config <- db_sdk$core$Config(!!!conf_args)

+  # unsure if this is still needed?
```

Review thread on the Databricks CLI fallback below:

Reviewer: I think we need to remove this from here, especially since we can't use […]

Author: I don't think this is required, will do some testing without.
```diff
+  if (!httr2:::is_hosted_session() && nchar(Sys.which(cli_path)) != 0) {
+    # When on desktop, try using the Databricks CLI for auth.
+    output <- suppressWarnings(
+      system2(
+        cli_path,
+        c("auth", "login", "--host", master),
+        stdout = TRUE,
+        stderr = TRUE
+      )
+    )
+  }

-  conn <- exec(databricks_session, !!!remote_args)
+  user_agent <- build_user_agent()
+  conn <- dbc$DatabricksSession$builder$sdkConfig(sdk_config)$userAgent(user_agent)

   if (!silent) {
     cli_progress_done()
```
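If the CLI shell-out is dropped, as the thread above suggests, one alternative is to let the SDK drive the browser-based OAuth flow itself. A sketch, assuming databricks-sdk-py's `external-browser` auth type:

```r
library(reticulate)
db_sdk <- import("databricks.sdk")

# "external-browser" is databricks-sdk-py's interactive U2M OAuth auth_type;
# it opens a browser for login instead of shelling out to the Databricks CLI.
sdk_config <- db_sdk$core$Config(
  host      = Sys.getenv("DATABRICKS_HOST"),
  auth_type = "external-browser"
)
```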
```diff
@@ -141,6 +175,7 @@ spark_connect_method.spark_method_databricks_connect <- function(
     master_label = master_label,
     con_class = "connect_databricks",
     cluster_id = cluster_id,
+    serverless = serverless,
     method = method,
     config = config
   )
```
```diff
@@ -151,6 +186,7 @@ initialize_connection <- function(
     master_label,
     con_class,
     cluster_id = NULL,
+    serverless = NULL,
     method = NULL,
     config = NULL) {
   warnings <- import("warnings")
```
```diff
@@ -173,12 +209,17 @@ initialize_connection <- function(
     "ignore",
     message = "Index.format is deprecated and will be removed in a future version"
   )

+  assign("conn", conn, .GlobalEnv)

   session <- conn$getOrCreate()
   get_version <- try(session$version, silent = TRUE)
   if (inherits(get_version, "try-error")) databricks_dbr_error(get_version)
-  session$conf$set("spark.sql.session.localRelationCacheThreshold", 1048576L)
-  session$conf$set("spark.sql.execution.arrow.pyspark.enabled", "true")
-  session$conf$set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
+  if (!serverless) {
+    session$conf$set("spark.sql.session.localRelationCacheThreshold", 1048576L)
+    session$conf$set("spark.sql.execution.arrow.pyspark.enabled", "true")
+    session$conf$set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
+  }

+  # do we need this `spark_context` object?
   spark_context <- list(spark_context = session)
```
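The new guard presumably exists because serverless compute restricts which Spark session confs may be set. If finer-grained behavior were ever wanted, a defensive wrapper could attempt each conf individually instead of skipping all three; a hypothetical sketch (`safe_set` is not part of this PR):

```r
# Hypothetical helper: try each conf and ignore failures, rather than
# skipping every setting whenever serverless = TRUE.
safe_set <- function(session, key, value) {
  tryCatch(session$conf$set(key, value), error = function(e) invisible(NULL))
}
```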
Review thread on the token handling:

Reviewer: This should be handled by the SDK config component.

Author: Hey, are we talking about this SDK? https://github.com/databricks/databricks-sdk-py/ And if so, can you point me to where it handles the RStudio token? I can't seem to find it.

Reviewer: The SDK won't detect the `.rs.api.getDatabricks*` functions, but maybe there's a gap in my understanding. I thought Connect would also write to a config file, which the SDK should pick up?
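On that last point: `databricks auth login` can save a named profile to `~/.databrickscfg`, and the SDK's `Config()` can resolve it by name. A sketch, where `"DEFAULT"` is a hypothetical profile name:

```r
library(reticulate)
db_sdk <- import("databricks.sdk")

# Pick up a CLI-written profile from ~/.databrickscfg instead of passing
# a token explicitly; "DEFAULT" is a placeholder profile name.
sdk_config <- db_sdk$core$Config(profile = "DEFAULT")
```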