
Commit

Add a helper class for using the Databricks ODBC driver. (#615)
The official Databricks ODBC driver supports a number of authentication
methods, but they are difficult to use correctly and require users to
pass credentials manually rather than picking up ambient credentials
automatically, as other Databricks SDKs do.

This commit adds a high-level helper class that implements (a subset of)
the Databricks client unified authentication model [0] in its
dbConnect() method and should make connection code that runs in
Databricks-aware environments work automatically, without manual
configuration.

Compare the interface from this commit:

    DBI::dbConnect(
      odbc::databricks(),
      http_path = "sql/protocolv1/o/4425955464597947/1026-023828-vn51jugj"
    )

with filling out all of the required parameters manually:

    DBI::dbConnect(
      odbc::odbc(),
      driver = "/opt/simba/spark/lib/64/libsparkodbc_sb64.so",
      host = gsub("https://", "", Sys.getenv("DATABRICKS_HOST")),
      HTTPPath = "sql/protocolv1/o/4425955464597947/1026-023828-vn51jugj",
      ThriftTransport = 2,
      UserAgentEntry = Sys.getenv("SPARK_CONNECT_USER_AGENT"),
      port = 443,
      Protocol = "https",
      SSL = 1,
      AuthMech = 11,
      Auth_Flow = 0,
      Auth_AccessToken = "<your OAuth token>"
    )

[0]: https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication
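
As a sketch of what picking up ambient credentials looks like in practice
(all values below are placeholders), a personal access token exposed through
the standard environment variables is detected without being passed to
dbConnect() at all:

    # Placeholder values; in Databricks-aware environments these variables
    # are typically already set.
    Sys.setenv(
      DATABRICKS_HOST = "https://example.cloud.databricks.com",
      DATABRICKS_TOKEN = "<your personal access token>"
    )
    DBI::dbConnect(
      odbc::databricks(),
      http_path = "sql/protocolv1/o/4425955464597947/1026-023828-vn51jugj"
    )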

Signed-off-by: Aaron Jacobs <[email protected]>
atheriel authored Nov 14, 2023
1 parent 5c69008 commit 1ed1faf
Showing 5 changed files with 232 additions and 0 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -52,6 +52,7 @@ Collate:
'Result.R'
'Table.R'
'Viewer.R'
'databricks.R'
'db.R'
'hidden.R'
'utils.R'
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -21,6 +21,7 @@ S3method(odbcListColumns,OdbcConnection)
S3method(odbcListObjectTypes,default)
S3method(odbcListObjects,OdbcConnection)
S3method(odbcPreviewObject,OdbcConnection)
export(databricks)
export(isTempTable)
export(odbc)
export(odbcConnectionActions)
@@ -34,6 +35,7 @@ export(odbcListObjectTypes)
export(odbcListObjects)
export(odbcPreviewObject)
export(odbcSetTransactionIsolationLevel)
exportClasses(DatabricksOdbcDriver)
exportClasses(OdbcConnection)
exportClasses(OdbcDriver)
exportClasses(OdbcResult)
3 changes: 3 additions & 0 deletions NEWS.md
@@ -10,6 +10,9 @@
* Oracle: Fix regression when falling back to odbcConnectionColumns to
describe column data types (@detule, #587)

* Add a new, specialised `odbc::databricks()` class with its own `dbConnect()`
method to make using Databricks's ODBC driver easier (@atheriel, #615).

# odbc 1.3.5

* Various fixes for `R CMD check`.
166 changes: 166 additions & 0 deletions R/databricks.R
@@ -0,0 +1,166 @@
#' Helper for Connecting to Databricks via ODBC
#'
#' @description
#'
#' Connect to Databricks clusters and SQL warehouses via the [Databricks ODBC
#' driver](https://www.databricks.com/spark/odbc-drivers-download).
#'
#' In particular, the custom `dbConnect()` method for the Databricks ODBC driver
#' implements a subset of the [Databricks client unified authentication](https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication)
#' model, with support for personal access tokens, OAuth machine-to-machine
#' credentials, and OAuth user-to-machine credentials supplied via Posit
#' Workbench or the Databricks CLI on desktop.
#'
#' All of these credentials are detected automatically if present using
#' [standard environment variables](https://docs.databricks.com/en/dev-tools/auth.html#environment-variables-and-fields-for-client-unified-authentication).
#'
#' @inheritParams DBI::dbConnect
#' @param http_path To query a cluster, use the HTTP Path value found under
#' `Advanced Options > JDBC/ODBC` in the Databricks UI. For SQL warehouses,
#' this is found under `Connection Details` instead.
#' @param workspace The URL of a Databricks workspace, e.g.
#' `"https://example.cloud.databricks.com"`.
#' @param driver The name of the Databricks ODBC driver, or `NULL` to use the
#' default name.
#' @param ... Further arguments passed on to [`dbConnect()`].
#'
#' @returns An `OdbcConnection` object with an active connection to a Databricks
#' cluster or SQL warehouse.
#'
#' @examples
#' \dontrun{
#' DBI::dbConnect(
#' odbc::databricks(),
#' http_path = "sql/protocolv1/o/4425955464597947/1026-023828-vn51jugj"
#' )
#' }
#' @export
databricks <- function() {
new("DatabricksOdbcDriver")
}

#' @rdname databricks
#' @export
setClass("DatabricksOdbcDriver", contains = "OdbcDriver")

#' @rdname databricks
#' @export
setMethod(
"dbConnect", "DatabricksOdbcDriver",
function(drv, http_path, workspace = Sys.getenv("DATABRICKS_HOST"),
driver = NULL, ...) {
args <- databricks_args(
http_path = http_path,
workspace = workspace,
driver = driver
)
args <- c(args, ...)
inject(dbConnect(odbc(), !!!args))
}
)

databricks_args <- function(http_path,
workspace = Sys.getenv("DATABRICKS_HOST"),
driver = NULL) {
if (nchar(workspace) == 0) {
stop("No Databricks workspace URL provided")
}
hostname <- gsub("https://", "", workspace)
driver <- driver %||% default_databricks_driver()

# Check some standard Databricks environment variables. This is used to
# implement a subset of the "Databricks client unified authentication" model.
token <- Sys.getenv("DATABRICKS_TOKEN")
client_id <- Sys.getenv("DATABRICKS_CLIENT_ID")
client_secret <- Sys.getenv("DATABRICKS_CLIENT_SECRET")
cli_path <- Sys.getenv("DATABRICKS_CLI_PATH", "databricks")

user_agent <- paste0("r-odbc/", utils::packageVersion("odbc"))
if (nchar(Sys.getenv("SPARK_CONNECT_USER_AGENT")) != 0) {
# Respect the existing user-agent if present. Normally we'd append, but the
# Databricks ODBC driver does not yet support space-separated entries in
# this field.
user_agent <- Sys.getenv("SPARK_CONNECT_USER_AGENT")
}

args <- list(
driver = driver,
Host = hostname,
HTTPPath = http_path,
ThriftTransport = 2,
UserAgentEntry = user_agent,
# Connections to Databricks are always over HTTPS.
Port = 443,
Protocol = "https",
SSL = 1
)

# Check for Workbench-provided credentials.
wb_token <- NULL
if (exists(".rs.api.getDatabricksToken")) {
getDatabricksToken <- get(".rs.api.getDatabricksToken")
wb_token <- getDatabricksToken(workspace)
}

if (nchar(token) != 0) {
# An explicit PAT takes precedence over everything else.
args <- c(args, AuthMech = 3, uid = "token", pwd = token)
} else if (nchar(client_id) != 0) {
# Next up are explicit OAuth2 M2M credentials.
args <- c(
args,
AuthMech = 11,
Auth_Flow = 1,
Auth_Client_ID = client_id,
Auth_Client_Secret = client_secret
)
} else if (!is.null(wb_token)) {
# Next up are Workbench-provided credentials.
args <- c(args, AuthMech = 11, Auth_Flow = 0, Auth_AccessToken = wb_token)
} else if (!is_hosted_session() && nchar(Sys.which(cli_path)) != 0) {
# When on desktop, try using the Databricks CLI for auth.
output <- suppressWarnings(
system2(
cli_path,
c("auth", "token", "--host", workspace),
stdout = TRUE,
stderr = TRUE
)
)
output <- paste(output, collapse = "\n")
# If we don't get an error message, try to extract the token from the JSON-
# formatted output.
if (grepl("access_token", output, fixed = TRUE)) {
token <- gsub(".*access_token\":\\s?\"([^\"]+).*", "\\1", output)
args <- c(args, AuthMech = 11, Auth_Flow = 0, Auth_AccessToken = token)
}
}

args
}

# Try to determine whether we can redirect the user's browser to a server on
# localhost, which isn't possible if we are running on a hosted platform.
#
# This is based on the strategy pioneered by the {gargle} package and {httr2}.
is_hosted_session <- function() {
# If RStudio Server or Posit Workbench is running locally (which is possible,
# though unusual), it's not acting as a hosted environment.
Sys.getenv("RSTUDIO_PROGRAM_MODE") == "server" &&
!grepl("localhost", Sys.getenv("RSTUDIO_HTTP_REFERER"), fixed = TRUE)
}

# Returns a sensible driver name even if odbc.ini and odbcinst.ini do not
# contain an entry for the Databricks ODBC driver.
default_databricks_driver <- function() {
# For Linux and macOS we can default to the shared library paths used by the
# official installers. On Windows we use the official driver name instead.
default_path <- ""
if (Sys.info()["sysname"] == "Linux") {
default_path <- "/opt/simba/spark/lib/64/libsparkodbc_sb64.so"
} else if (Sys.info()["sysname"] == "Darwin") {
default_path <- "/Library/simba/spark/lib/libsparkodbc_sbu.dylib"
}

if (file.exists(default_path)) default_path else "Simba Spark ODBC Driver"
}
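
To illustrate the credential precedence implemented by databricks_args() above
(an explicit personal access token wins, followed by OAuth machine-to-machine
credentials, then Workbench-provided tokens, then the Databricks CLI on
desktop), here is a sketch of supplying M2M credentials purely through the
standard environment variables; the values are placeholders:

    # Placeholder values for an OAuth machine-to-machine (M2M) client.
    Sys.setenv(
      DATABRICKS_HOST = "https://example.cloud.databricks.com",
      DATABRICKS_CLIENT_ID = "<your OAuth client ID>",
      DATABRICKS_CLIENT_SECRET = "<your OAuth client secret>"
    )
    # With no DATABRICKS_TOKEN set, the connection falls through to the
    # OAuth M2M branch (AuthMech = 11, Auth_Flow = 1) automatically.
    DBI::dbConnect(
      odbc::databricks(),
      http_path = "sql/protocolv1/o/4425955464597947/1026-023828-vn51jugj"
    )

The Workbench and Databricks CLI paths need no configuration at all: if either
credential source is available, dbConnect() picks it up on its own.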
60 changes: 60 additions & 0 deletions man/databricks.Rd

Some generated files are not rendered by default.
