From 21b20304e9870f1ba7f903c5a08fc19c05c1101c Mon Sep 17 00:00:00 2001 From: Hong Ooi Date: Tue, 4 Dec 2018 15:57:26 +0800 Subject: [PATCH] Adlsv2 (#1) * adls_endpoint * adls client * typo * cleanup * more adls funcs * adls_endpoint * adls client * typo * cleanup * more adls funcs * merge break * Revert "merge break" This reverts commit c451b2e1da980ed78b120819050c091a204d30f1. * more adls fs ops * kibozing for error messages * 1st cut, adls file ops * more adls ops * adls upload working * adls download working * everything seems to be working * adls doc * doc update * more doc update * clarify hns setting * adls testing * more doc update * adls tests working * update docs for end of limited access preview * readme edit * reword readme * update vignette * oopsie * oopsie again --- NAMESPACE | 23 +++ R/add_methods.R | 21 +- R/adls_client_funcs.R | 369 ++++++++++++++++++++++++++++++++++ R/blob_client_funcs.R | 2 + R/client.R | 54 ++++- R/storage.R | 5 + R/storage_utils.R | 25 ++- README.md | 102 +++------- man/adls.Rd | 69 +++++++ man/adls_filesystem.Rd | 106 ++++++++++ man/blob_container.Rd | 2 + man/create_storage_account.Rd | 11 +- man/storage_endpoint.Rd | 23 ++- tests/testthat/test04_adls.R | 109 ++++++++++ vignettes/intro.rmd | 46 +++-- 15 files changed, 852 insertions(+), 115 deletions(-) create mode 100644 R/adls_client_funcs.R create mode 100644 man/adls.Rd create mode 100644 man/adls_filesystem.Rd create mode 100644 tests/testthat/test04_adls.R diff --git a/NAMESPACE b/NAMESPACE index 7531bc5..e9cfec0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,21 @@ # Generated by roxygen2: do not edit by hand +S3method(adls_filesystem,adls_endpoint) +S3method(adls_filesystem,character) S3method(blob_container,blob_endpoint) S3method(blob_container,character) +S3method(create_adls_filesystem,adls_endpoint) +S3method(create_adls_filesystem,adls_filesystem) +S3method(create_adls_filesystem,character) S3method(create_blob_container,blob_container) S3method(create_blob_container,blob_endpoint) S3method(create_blob_container,character) S3method(create_file_share,character) S3method(create_file_share,file_endpoint) S3method(create_file_share,file_share) +S3method(delete_adls_filesystem,adls_endpoint) +S3method(delete_adls_filesystem,adls_filesystem) +S3method(delete_adls_filesystem,character) S3method(delete_blob_container,blob_container) S3method(delete_blob_container,blob_endpoint) S3method(delete_blob_container,character) @@ -19,27 +27,39 @@ S3method(file_share,file_endpoint) S3method(get_storage_properties,blob_container) S3method(get_storage_properties,file_share) S3method(get_storage_properties,storage_endpoint) +S3method(list_adls_filesystems,adls_endpoint) +S3method(list_adls_filesystems,character) S3method(list_blob_containers,blob_endpoint) S3method(list_blob_containers,character) S3method(list_file_shares,character) S3method(list_file_shares,file_endpoint) +S3method(print,adls_endpoint) +S3method(print,adls_filesystem) S3method(print,blob_container) S3method(print,file_share) S3method(print,storage_endpoint) export(acquire_lease) +export(adls_endpoint) +export(adls_filesystem) export(az_storage) export(blob_container) export(blob_endpoint) export(break_lease) export(change_lease) +export(create_adls_dir) +export(create_adls_filesystem) export(create_azure_dir) export(create_blob_container) export(create_file_share) +export(delete_adls_dir) +export(delete_adls_file) +export(delete_adls_filesystem) export(delete_azure_dir) export(delete_azure_file) export(delete_blob) 
export(delete_blob_container) export(delete_file_share) +export(download_adls_file) export(download_azure_file) export(download_blob) export(download_from_url) @@ -49,6 +69,8 @@ export(get_azure_dir_properties) export(get_azure_file_properties) export(get_blob_properties) export(get_storage_properties) +export(list_adls_files) +export(list_adls_filesystems) export(list_azure_files) export(list_blob_containers) export(list_blobs) @@ -56,6 +78,7 @@ export(list_file_shares) export(release_lease) export(renew_lease) export(storage_endpoint) +export(upload_adls_file) export(upload_azure_file) export(upload_blob) export(upload_to_url) diff --git a/R/add_methods.R b/R/add_methods.R index 0cbdee6..a19cbc5 100644 --- a/R/add_methods.R +++ b/R/add_methods.R @@ -20,7 +20,7 @@ #' - `replication`: The replication strategy for the account. The default is locally-redundant storage (LRS). #' - `access_tier`: The access tier, either `"hot"` or `"cool"`, for blobs. #' - `https_only`: Whether a HTTPS connection is required to access the storage. -#' - `hierarchical_namespace_enabled`: Whether to enable hierarchical namespaces, which are a feature of Azure Data Lake Storage Gen 2 and provide more a efficient way to manage storage. Note that ADLS Gen2 is currently (as of November 2018) in limited-access public preview. if you are not enrolled in the preview program, this argument has no effect. +#' - `hierarchical_namespace_enabled`: Whether to enable hierarchical namespaces, which are a feature of Azure Data Lake Storage Gen 2 and provide a more efficient way to manage storage. ADLS Gen2 is currently (as of December 2018) in general-access public preview. #' - `properties`: A list of other properties for the storage account. #' - ... Other named arguments to pass to the [az_storage] initialization function. #' @@ -33,12 +33,17 @@ #' #' Accounts created with `kind = "BlobStorage"` can only host blob storage, while those with `kind = "FileStorage"` can only host file storage. Accounts with `kind = "StorageV2"` can host all types of storage. Currently, AzureStor provides an R interface only to blob and file storage. #' +#' If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the _contents_ of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. Full interoperability between blobs and ADLS is planned for 2019. +#' #' @section Value: #' An object of class `az_storage` representing the created storage account.
#' #' @seealso -#' [get_storage_account], [delete_storage_account], [az_storage], -#' [Azure Storage Provider API reference](https://docs.microsoft.com/en-us/rest/api/storagerp/) +#' [get_storage_account], [delete_storage_account], [az_storage] +#' +#' [Azure Storage documentation](https://docs.microsoft.com/en-us/azure/storage/), +#' [Azure Storage Provider API reference](https://docs.microsoft.com/en-us/rest/api/storagerp/), +#' [Azure Data Lake Storage hierarchical namespaces](https://docs.microsoft.com/en-us/azure/storage/data-lake-storage/namespace) #' #' @examples #' \dontrun{ @@ -146,20 +151,22 @@ NULL .onLoad <- function(libname, pkgname) { api <- "2018-03-28" + adls_api <- "2018-06-17" options(azure_storage_api_version=api) + options(azure_adls_api_version=adls_api) ## extending AzureRMR classes AzureRMR::az_resource_group$set("public", "create_storage_account", overwrite=TRUE, function(name, location=self$location, kind="StorageV2", replication="Standard_LRS", - access_tier="hot", https_only=TRUE, hierarchical_namespace_enabled=FALSE, + access_tier="hot", https_only=TRUE, hierarchical_namespace_enabled=TRUE, properties=list(), ...) { properties <- modifyList(properties, - list(accessTier=access_tier, supportsHttpsTrafficOnly=https_only)) - if(isTRUE(hierarchical_namespace_enabled)) - properties <- modifyList(properties, list(isHnsEnabled=TRUE)) + list(accessTier=access_tier, + supportsHttpsTrafficOnly=https_only, + isHnsEnabled=hierarchical_namespace_enabled)) az_storage$new(self$token, self$subscription, self$name, type="Microsoft.Storage/storageAccounts", name=name, location=location, diff --git a/R/adls_client_funcs.R b/R/adls_client_funcs.R new file mode 100644 index 0000000..f8ce2c3 --- /dev/null +++ b/R/adls_client_funcs.R @@ -0,0 +1,369 @@ +#' Operations on an Azure Data Lake Storage Gen2 endpoint +#' +#' Get, list, create, or delete ADLSgen2 filesystems. Currently (as of December 2018) ADLSgen2 is in general-access public preview. +#' +#' @param endpoint Either an ADLSgen2 endpoint object as created by [storage_endpoint] or [adls_endpoint], or a character string giving the URL of the endpoint. +#' @param key,sas If an endpoint object is not supplied, authentication details. Currently the `sas` argument is unused. +#' @param api_version If an endpoint object is not supplied, the storage API version to use when interacting with the host. Currently defaults to `"2018-06-17"`. +#' @param name The name of the filesystem to get, create, or delete. +#' @param confirm For deleting a filesystem, whether to ask for confirmation. +#' @param x For the print method, an ADLSgen2 filesystem object. +#' @param ... Further arguments passed to lower-level functions. +#' +#' @details +#' You can call these functions in a couple of ways: by passing the full URL of the filesystem, or by passing the endpoint object and the name of the filesystem as a string. +#' +#' If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the _contents_ of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. Full interoperability between blobs and ADLS is planned for 2019. +#' +#' @return +#' For `adls_filesystem` and `create_adls_filesystem`, an S3 object representing an existing or created filesystem respectively.
+#' +#' For `list_adls_filesystems`, a list of such objects. +#' +#' @seealso [storage_endpoint], [az_storage] +#' +#' @examples +#' \dontrun{ +#' +#' endp <- adls_endpoint("https://mystorage.dfs.core.windows.net/", key="access_key") +#' +#' # list ADLSgen2 filesystems +#' list_adls_filesystems(endp) +#' +#' # get, create, and delete a filesystem +#' adls_filesystem(endp, "myfs") +#' create_adls_filesystem(endp, "newfs") +#' delete_adls_filesystem(endp, "newfs") +#' +#' # alternative way to do the same +#' adls_filesystem("https://mystorage.dfs.core.windows.net/myfs", key="access_key") +#' create_adls_filesystem("https://mystorage.dfs.core.windows.net/newfs", key="access_key") +#' delete_adls_filesystem("https://mystorage.dfs.core.windows.net/newfs", key="access_key") +#' +#' } +#' @rdname adls_filesystem +#' @export +adls_filesystem <- function(endpoint, ...) +{ + UseMethod("adls_filesystem") +} + +#' @rdname adls_filesystem +#' @export +adls_filesystem.character <- function(endpoint, key=NULL, sas=NULL, + api_version=getOption("azure_adls_api_version"), + ...) +{ + do.call(adls_filesystem, generate_endpoint_container(endpoint, key, sas, api_version)) +} + +#' @rdname adls_filesystem +#' @export +adls_filesystem.adls_endpoint <- function(endpoint, name, ...) +{ + obj <- list(name=name, endpoint=endpoint) + class(obj) <- "adls_filesystem" + obj +} + +#' @rdname adls_filesystem +#' @export +print.adls_filesystem <- function(x, ...) +{ + cat("Azure Data Lake Storage Gen2 filesystem '", x$name, "'\n", sep="") + cat(sprintf("URL: %s\n", paste0(x$endpoint$url, x$name))) + if(!is_empty(x$endpoint$key)) + cat("Access key: <hidden>\n") + else cat("Access key: <none>\n") + if(!is_empty(x$endpoint$sas)) + cat("Account shared access signature: <hidden>\n") + else cat("Account shared access signature: <none>\n") + cat(sprintf("Storage API version: %s\n", x$endpoint$api_version)) + invisible(x) +} + + + +#' @rdname adls_filesystem +#' @export +list_adls_filesystems <- function(endpoint, ...) +{ + UseMethod("list_adls_filesystems") +} + +#' @rdname adls_filesystem +#' @export +list_adls_filesystems.character <- function(endpoint, key=NULL, sas=NULL, + api_version=getOption("azure_adls_api_version"), + ...) +{ + do.call(list_adls_filesystems, generate_endpoint_container(endpoint, key, sas, api_version)) +} + +#' @rdname adls_filesystem +#' @export +list_adls_filesystems.adls_endpoint <- function(endpoint, ...) +{ + lst <- do_storage_call(endpoint$url, "/", options=list(resource="account"), + key=endpoint$key, sas=endpoint$sas, api_version=endpoint$api_version) + + sapply(lst$filesystems$name, function(fs) adls_filesystem(endpoint, fs), simplify=FALSE) +} + + + +#' @rdname adls_filesystem +#' @export +create_adls_filesystem <- function(endpoint, ...) +{ + UseMethod("create_adls_filesystem") +} + +#' @rdname adls_filesystem +#' @export +create_adls_filesystem.character <- function(endpoint, key=NULL, sas=NULL, + api_version=getOption("azure_adls_api_version"), + ...) +{ + endp <- generate_endpoint_container(endpoint, key, sas, api_version) + create_adls_filesystem(endp$endpoint, endp$name, ...) +} + +#' @rdname adls_filesystem +#' @export +create_adls_filesystem.adls_filesystem <- function(endpoint, ...) +{ + create_adls_filesystem(endpoint$endpoint, endpoint$name, ...) +} + +#' @rdname adls_filesystem +#' @export +create_adls_filesystem.adls_endpoint <- function(endpoint, name, ...)
+{ + obj <- adls_filesystem(endpoint, name) + do_container_op(obj, options=list(resource="filesystem"), http_verb="PUT") + obj +} + + + +#' @rdname adls_filesystem +#' @export +delete_adls_filesystem <- function(endpoint, ...) +{ + UseMethod("delete_adls_filesystem") +} + +#' @rdname adls_filesystem +#' @export +delete_adls_filesystem.character <- function(endpoint, key=NULL, sas=NULL, + api_version=getOption("azure_adls_api_version"), + ...) +{ + endp <- generate_endpoint_container(endpoint, key, sas, api_version) + delete_adls_filesystem(endp$endpoint, endp$name, ...) +} + +#' @rdname adls_filesystem +#' @export +delete_adls_filesystem.adls_filesystem <- function(endpoint, ...) +{ + delete_adls_filesystem(endpoint$endpoint, endpoint$name, ...) +} + +#' @rdname adls_filesystem +#' @export +delete_adls_filesystem.adls_endpoint <- function(endpoint, name, confirm=TRUE, ...) +{ + if(confirm && interactive()) + { + path <- paste0(endpoint$url, name) + yn <- readline(paste0("Are you sure you really want to delete the filesystem '", path, "'? (y/N) ")) + if(tolower(substr(yn, 1, 1)) != "y") + return(invisible(NULL)) + } + + obj <- adls_filesystem(endpoint, name) + do_container_op(obj, options=list(resource="filesystem"), http_verb="DELETE") +} + + +#' Operations on an Azure Data Lake Storage Gen2 filesystem +#' +#' Upload, download, or delete a file; list files in a directory; create or delete directories. +#' +#' @param filesystem An ADLSgen2 filesystem object. +#' @param dir,file A string naming a directory or file respectively. +#' @param info Whether to return names only, or all information in a directory listing. +#' @param src,dest The source and destination filenames for uploading and downloading. Paths are allowed. +#' @param confirm Whether to ask for confirmation on deleting a file or directory. +#' @param blocksize The number of bytes to upload per HTTP(S) request. +#' @param lease The lease for a file, if present. +#' @param overwrite When downloading, whether to overwrite an existing destination file. +#' @param recursive For `list_adls_files` and `delete_adls_dir`, whether the operation should recurse through subdirectories. For `delete_adls_dir`, this must be TRUE to delete a non-empty directory. +#' +#' @return +#' For `list_adls_files`, if `info="name"`, a vector of file/directory names. If `info="all"`, a data frame giving the file size and whether each object is a file or directory.
+#' +#' @seealso +#' [adls_filesystem], [az_storage] +#' +#' @examples +#' \dontrun{ +#' +#' fs <- adls_filesystem("https://mystorage.dfs.core.windows.net/myfilesystem", key="access_key") +#' +#' list_adls_files(fs, "/") +#' +#' create_adls_dir(fs, "/newdir") +#' +#' upload_adls_file(fs, "~/bigfile.zip", dest="/newdir/bigfile.zip") +#' download_adls_file(fs, "/newdir/bigfile.zip", dest="~/bigfile_downloaded.zip") +#' +#' delete_adls_file(fs, "/newdir/bigfile.zip") +#' delete_adls_dir(fs, "/newdir") +#' +#' } +#' @rdname adls +#' @export +list_adls_files <- function(filesystem, dir="/", info=c("all", "name"), + recursive=FALSE) +{ + info <- match.arg(info) + + opts <- list(recursive=tolower(as.character(recursive)), resource="filesystem") + opts <- c(opts, directory=as.character(dir)) + + lst <- do_container_op(filesystem, "", options=opts) + if(info == "all") + { + out <- lst$paths + + # cater for null output + if(is_empty(out)) + return(data.frame( + name=character(0), + contentLength=numeric(0), + isDirectory=logical(0), + lastModified=numeric(0))) + + # normalise output + if(is.null(out$isDirectory)) + out$isDirectory <- FALSE + else out$isDirectory <- !is.na(out$isDirectory) + if(is.null(out$contentLength)) + out$contentLength <- 0 + else out$contentLength[is.na(out$contentLength)] <- 0 + if(is.null(out$etag)) + out$etag <- "" + else out$etag[is.na(out$etag)] <- "" + if(is.null(out$permissions)) + out$permissions <- "" + else out$permissions[is.na(out$permissions)] <- "" + out <- out[c("name", "contentLength", "isDirectory", "lastModified", "permissions", "etag")] + + if(all(out$permissions == "")) + out$permissions <- NULL + if(all(out$etag == "")) + out$etag <- NULL + out + } + else as.character(lst$paths$name) +} + + +#' @rdname adls +#' @export +upload_adls_file <- function(filesystem, src, dest, blocksize=2^24, lease=NULL) +{ + con <- if(inherits(src, "textConnection")) + rawConnection(charToRaw(paste0(readLines(src), collapse="\n"))) + else file(src, open="rb") + on.exit(close(con)) + + # step 1 of the upload protocol: create a zero-length file + content_type <- mime::guess_type(src) + headers <- list(`x-ms-content-type`=content_type) + #if(!is.null(lease)) + #headers[["x-ms-lease-id"]] <- as.character(lease) + do_container_op(filesystem, dest, options=list(resource="file"), headers=headers, http_verb="PUT") + + # step 2: append the contents in blocks, tracking the byte position + pos <- 0 + while(1) + { + body <- readBin(con, "raw", blocksize) + thisblock <- length(body) + if(thisblock == 0) + break + + headers <- list( + `content-type`="application/octet-stream", + `content-length`=sprintf("%.0f", thisblock) + ) + opts <- list(action="append", position=sprintf("%.0f", pos)) + + do_container_op(filesystem, dest, options=opts, headers=headers, body=body, http_verb="PATCH") + pos <- pos + thisblock + } + + # step 3: flush (commit) the uploaded contents + do_container_op(filesystem, dest, + options=list(action="flush", position=sprintf("%.0f", pos)), + http_verb="PATCH") +} + + +#' @rdname adls +#' @export +download_adls_file <- function(filesystem, src, dest, overwrite=FALSE) +{ + do_container_op(filesystem, src, config=httr::write_disk(dest, overwrite)) +} + + + +#' @rdname adls +#' @export +delete_adls_file <- function(filesystem, file, confirm=TRUE) +{ + if(confirm && interactive()) + { + endp <- filesystem$endpoint + path <- paste0(endp$url, filesystem$name, "/", file) + yn <- readline(paste0("Are you sure you really want to delete '", path, "'?
(y/N) ")) + if(tolower(substr(yn, 1, 1)) != "y") + return(invisible(NULL)) + } + + do_container_op(filesystem, file, http_verb="DELETE") +} + + + +#' @rdname adls +#' @export +create_adls_dir <- function(filesystem, dir) +{ + do_container_op(filesystem, dir, options=list(resource="directory"), http_verb="PUT") +} + + +#' @rdname adls +#' @export +delete_adls_dir <- function(filesystem, dir, recursive=FALSE, confirm=TRUE) +{ + if(confirm && interactive()) + { + endp <- filesystem$endpoint + path <- paste0(endp$url, filesystem$name, "/", dir) + yn <- readline(paste0("Are you sure you really want to delete directory '", path, "'? (y/N) ")) + if(tolower(substr(yn, 1, 1)) != "y") + return(invisible(NULL)) + } + + opts <- list(recursive=tolower(as.character(recursive))) + do_container_op(filesystem, dir, options=opts, http_verb="DELETE") +} + diff --git a/R/blob_client_funcs.R b/R/blob_client_funcs.R index 9c5fa63..80a3d5f 100644 --- a/R/blob_client_funcs.R +++ b/R/blob_client_funcs.R @@ -15,6 +15,8 @@ #' @details #' You can call these functions in a couple of ways: by passing the full URL of the share, or by passing the endpoint object and the name of the container as a string. #' +#' If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the _contents_ of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. +#' #' @return #' For `blob_container` and `create_blob_container`, an S3 object representing an existing or created container respectively. #' diff --git a/R/client.R b/R/client.R index b075e85..4918b39 100644 --- a/R/client.R +++ b/R/client.R @@ -1,20 +1,24 @@ #' Create a storage endpoint object #' -#' @param endpoint The URL (hostname) for the endpoint. This must be of the form `http[s]://{account-name}.{type}.{core-host-name}`, where `type` is one of `"blob"`, `"file"`, `"queue"` or `"table"`. On the public Azure cloud, endpoints will be of the form `https://{account-name}.{type}.core.windows.net`. +#' Create a storage endpoint object, for interacting with blob, file, table, queue or ADLSgen2 storage. Currently (as of December 2018) ADLSgen2 is in general-access public preview. +#' +#' @param endpoint The URL (hostname) for the endpoint. This must be of the form `http[s]://{account-name}.{type}.{core-host-name}`, where `type` is one of `"dfs"` (corresponding to ADLSgen2), `"blob"`, `"file"`, `"queue"` or `"table"`. On the public Azure cloud, endpoints will be of the form `https://{account-name}.{type}.core.windows.net`. #' @param key The access key for the storage account. -#' @param sas A shared access signature (SAS) for the account. If `key` is also provided, the SAS is not used. If neither `key` nor `sas` are provided, only public (anonymous) access to the endpoint is possible. -#' @param api_version The storage API version to use when interacting with the host. Currently defaults to `"2018-03-28"`. +#' @param sas A shared access signature (SAS) for the account. If `key` is also provided, the SAS is not used. If neither `key` nor `sas` are provided, only public (anonymous) access to the endpoint is possible. Note that authentication with a SAS is not supported by ADLSgen2. +#' @param api_version The storage API version to use when interacting with the host. 
Defaults to `"2018-06-17"` for the ADLSgen2 endpoint, and `"2018-03-28"` for the others. #' @param x For the print method, a storage endpoint object. #' @param ... For the print method, further arguments passed to lower-level functions. #' #' @details -#' This is the starting point for the client-side storage interface in AzureRMR. `storage_endpoint` is a generic function to create an endpoint for any type of Azure storage while `blob_endpoint` and `file_endpoint` create endpoints for those types. +#' This is the starting point for the client-side storage interface in AzureStor. `storage_endpoint` is a generic function to create an endpoint for any type of Azure storage while `adls_endpoint`, `blob_endpoint` and `file_endpoint` create endpoints for those types. #' #' @return -#' `storage_endpoint` returns an object of S3 class `"blob_endpoint"`, `"file_endpoint"`, `"queue_endpoint"` or `"table_endpoint"` depending on the type of endpoint. All of these also inherit from class `"storage_endpoint"`. `blob_endpoint` and `file_endpoint` return an object of the respective type. +#' `storage_endpoint` returns an object of S3 class `"adls_endpoint"`, `"blob_endpoint"`, `"file_endpoint"`, `"queue_endpoint"` or `"table_endpoint"` depending on the type of endpoint. All of these also inherit from class `"storage_endpoint"`. `adls_endpoint`, `blob_endpoint` and `file_endpoint` return an object of the respective class. +#' +#' Currently AzureStor only includes methods for interacting with ADLSgen2 (experimental), blob and file storage. #' #' @seealso -#' [az_storage], [file_share], [create_file_share], [blob_container], [create_blob_container] +#' [az_storage], [adls_filesystem], [create_adls_filesystem], [file_share], [create_file_share], [blob_container], [create_blob_container] #' #' @examples #' \dontrun{ @@ -30,12 +34,15 @@ #' @export storage_endpoint <- function(endpoint, key=NULL, sas=NULL, api_version=getOption("azure_storage_api_version")) { - type <- sapply(c("blob", "file", "queue", "table"), + type <- sapply(c("blob", "file", "queue", "table", "adls"), function(x) is_endpoint_url(endpoint, x)) if(!any(type)) stop("Unknown endpoint type", call.=FALSE) type <- names(type)[type] + if(type == "adls" && !is_empty(sas)) + warning("ADLSgen2 does not support authentication with a shared access signature") + obj <- list(url=endpoint, key=key, sas=sas, api_version=api_version) class(obj) <- c(paste0(type, "_endpoint"), "storage_endpoint") obj @@ -65,6 +72,21 @@ file_endpoint <- function(endpoint, key=NULL, sas=NULL, api_version=getOption("a obj } +#' @rdname storage_endpoint +#' @export +adls_endpoint <- function(endpoint, key=NULL, sas=NULL, api_version=getOption("azure_adls_api_version")) +{ + if(!is_endpoint_url(endpoint, "adls")) + stop("Not an ADLS Gen2 endpoint", call.=FALSE) + + if(!is_empty(sas)) + warning("ADLSgen2 does not support authentication with a shared access signature") + + obj <- list(url=endpoint, key=key, sas=sas, api_version=api_version) + class(obj) <- c("adls_endpoint", "storage_endpoint") + obj +} + #' @rdname storage_endpoint #' @export @@ -84,6 +106,24 @@ print.storage_endpoint <- function(x, ...) } +#' @rdname storage_endpoint +#' @export +print.adls_endpoint <- function(x, ...)
+{ + cat("Azure Data Lake Storage Gen2 endpoint\n") + cat(sprintf("URL: %s\n", x$url)) + if(!is_empty(x$key)) + cat("Access key: \n") + else cat("Access key: \n") + if(!is_empty(x$sas)) + cat("Account shared access signature: \n") + else cat("Account shared access signature: \n") + cat(sprintf("Storage API version: %s\n", x$api_version)) + invisible(x) +} + + + #' Generic upload and download #' #' @param src,dest The source and destination files/URLs. Paths are allowed. diff --git a/R/storage.R b/R/storage.R index 792c9fb..7c0ccbf 100644 --- a/R/storage.R +++ b/R/storage.R @@ -116,6 +116,11 @@ public=list( sapply(keys, `[[`, "value") }, + get_adls_endpoint=function(key=self$list_keys()[1], sas=NULL) + { + adls_endpoint(self$properties$primaryEndpoints$dfs, key=key, sas=sas) + }, + print=function(...) { cat("\n", sep="") diff --git a/R/storage_utils.R b/R/storage_utils.R index c3b4ac2..746de48 100644 --- a/R/storage_utils.R +++ b/R/storage_utils.R @@ -1,7 +1,12 @@ do_container_op <- function(container, path="", options=list(), headers=list(), http_verb="GET", ...) { endp <- container$endpoint - path <- sub("//", "/", paste0(container$name, "/", path)) + + # don't add trailing / if no within-container path supplied: ADLS will complain + path <- if(nchar(path) > 0) + sub("//", "/", paste0(container$name, "/", path)) + else container$name + invisible(do_storage_call(endp$url, path, options=options, headers=headers, key=endp$key, sas=endp$sas, api_version=endp$api_version, http_verb=http_verb, ...)) @@ -43,7 +48,7 @@ do_storage_call <- function(endpoint_url, path, options=list(), headers=list(), return(NULL) # silence message about missing encoding - cont <- suppressMessages(httr::content(response)) + cont <- suppressMessages(httr::content(response, simplifyVector=TRUE)) if(is_empty(cont)) NULL else if(inherits(cont, "xml_node")) @@ -118,7 +123,14 @@ storage_error_message <- function(response, for_httr=TRUE) cont <- xml_to_list(cont) paste0(unlist(cont), collapse="\n") } - else NULL + else if(is.character(cont)) + cont + else if(is.list(cont) && is.character(cont$message)) + cont$message + else if(is.list(cont) && is.list(cont$error) && is.character(cont$error$message)) + cont$error$message + else "" + if(for_httr) paste0("complete Storage Services operation. Message:\n", sub("\\.$", "", msg)) else msg @@ -144,8 +156,15 @@ parse_storage_url <- function(url) is_endpoint_url <- function(url, type) { + # handle cases where type != uri string + if(type == "adls") + type <- "dfs" + else if(type == "web") + type <- "z26\\.web" + # endpoint URL must be of the form {scheme}://{acctname}.{type}.{etc} type <- sprintf("^https?://[a-z0-9]+\\.%s\\.", type) + is_url(url) && grepl(type, url) } diff --git a/README.md b/README.md index 4d9a11c..4f99317 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,9 @@ library(AzureRMR) library(AzureStor) # authenticate with Resource Manager -az <- az_rm$new(tenant="xxx-xxx-xxx", app="yyy-yyy-yyy", secret="{secret goes here}") +az <- az_rm$new(tenant="myaadtenant.onmicrosoft.com", app="app_id", password="password") -sub1 <- az$get_subscription("5710aa44-281f-49fe-bfa6-69e66bb55b11") +sub1 <- az$get_subscription("subscription_id") rg <- sub1$get_resource_group("rdev1") @@ -26,20 +26,7 @@ rdevstor1 # queue: https://rdevstor1.queue.core.windows.net/ # table: https://rdevstor1.table.core.windows.net/ # file: https://rdevstor1.file.core.windows.net/ -#--- -# id: /subscriptions/5710aa44-281f-49fe-bfa6-69e66bb55b11/resourceGroups/rdev1/providers/Microsoft.Sto ... 
-# identity: NULL -# is_synced: TRUE -# location: australiasoutheast -# managed_by: NULL -# plan: NULL -# properties: list(networkAcls, trustedDirectories, supportsHttpsTrafficOnly, encryption, -# provisioningState, creationTime, primaryEndpoints, primaryLocation, statusOfPrimary) -# tags: list() -#--- -# Methods: -# check, delete, do_operation, get_account_sas, get_blob_endpoint, get_file_endpoint, list_keys, -# set_api_version, sync_fields, update +# ... # retrieve admin keys rdevstor1$list_keys() @@ -65,6 +52,8 @@ blobstor2$delete() The user-side interface in AzureStor is implemented using S3 classes. This is for consistency with other data access packages in R, which mostly use S3. It also emphasises the distinction between Resource Manager (which is for interacting with the storage account itself) and the user client (which is for accessing files and data stored in the account). +AzureStor includes client methods for blob storage, file storage, and Azure Data Lake Storage Gen2 (experimental). + Accessing blob storage: ```r @@ -72,84 +61,49 @@ bl <- rdevstor1$get_blob_endpoint() # for users without ARM credentials, use the storage_endpoint() function and provide a key -bl <- storage_endpoint("https://rdevstor1.blob.core.windows.net", key="/Uq3rxh0lbYErt...") +bl <- storage_endpoint("https://rdevstor1.blob.core.windows.net", key="access_key") # can also provide a shared access signature # providing neither a key nor SAS allows only public access -bl <- storage_endpoint("https://rdevstor1.blob.core.windows.net", sas="sv=2015-04-05&ss=...") - +bl <- storage_endpoint("https://rdevstor1.blob.core.windows.net", sas="my_sas") +# list of blob containers in this account list_blob_containers(bl) -#$container2 -#Azure blob container -#Endpoint URL: https://rdevstor1.blob.core.windows.net/ -#Access key: <hidden> -#Account shared access signature: <none> -#Storage API version: 2017-07-29 -# -#$privcontainer -#Azure blob container -#Endpoint URL: https://rdevstor1.blob.core.windows.net/ -#Access key: <hidden> -#Account shared access signature: <none> -#Storage API version: 2017-07-29 +# using pipes library(magrittr) -priv <- bl %>% blob_container("privcontainer") -priv %>% upload_blob("../downloads/test.file.gz", "test.gz") - -priv %>% list_blobs() -#[1] "test.gz" - -priv %>% download_blob("test.gz", "../downloads/test.file2.gz") +# create a new blob container and transfer a file +cont <- bl %>% create_blob_container("newcontainer") +cont %>% upload_blob("../downloads/test.file.gz", "test.gz") +cont %>% list_blobs() +cont %>% download_blob("test.gz", "../downloads/test.file2.gz") # you can also do an authenticated download from a full URL download_from_url("https://rdevstor1.blob.core.windows.net/privcontainer/test.gz", "../downloads/test.file3.gz", - key="/Uq3rxh0lbYErt...") + key="access_key") ``` -Accessing file storage works much the same way: +Accessing ADLSgen2 and file storage works much the same way, but with the addition of being able to manipulate directories: ```r -# get the file endpoint, either from ARM or standalone -fs <- rdevstor1$get_file_endpoint() -fs <- storage_endpoint("https://rdevstor1.file.core.windows.net", key="/Uq3rxh0lbYErt...") -fs <- storage_endpoint("https://rdevstor1.file.core.windows.net", sas="sv=2015-04-05&ss=...") +# get the ADLSgen2 endpoint, either from the resource object or standalone +ad <- rdevstor1$get_adls_endpoint() +ad <- storage_endpoint("https://rdevstor1.dfs.core.windows.net", key="access_key") +ad <- storage_endpoint("https://rdevstor1.dfs.core.windows.net",
sas="my_sas") +# ADLS filesystems are analogous to blob containers +ad %>% list_adls_filesystems() -fs %>% list_file_shares() -#$share1 -#Azure file share -#Endpoint URL: https://rdevstor1.file.core.windows.net/ -#Access key: -#Account shared access signature: -#Storage API version: 2017-07-29 -# -#$share2 -#Azure file share -#Endpoint URL: https://rdevstor1.file.core.windows.net/ -#Access key: -#Account shared access signature: -#Storage API version: 2017-07-29 +# create a new filesystem and transfer some files +fs1 <- ad %>% create_file_filesystem("filesystem1") -sh1 <- fs %>% file_share("share1") +fs1 %>% list_adls_files("/") -sh1 %>% list_azure_files("/") -# name type size -#1 mydir Directory NA -#2 irisrf_dput.txt File 731930 -#3 storage.R File 3189 - -sh1 %>% download_azure_file("irisrf_dput.txt", "misc/file.txt") -sh1 %>% upload_azure_file("misc/file.txt", "foobar/upload.txt") -sh1 %>% delete_azure_file("/foobar/upload.txt") - -# authenticated file transfer to/from a URL also works with file shares -download_from_url("https://rdevstor1.file.core.windows.net/share1/irisrf_dput.txt", - "misc/file2.txt", - key="/Uq3rxh0lbYErt...") +fs1 %>% create_adls_directory("foobar") +fs1 %>% upload_adls_file("file.txt", "foobar/upload.txt") +fs1 %>% download_adls_file("foobar/upload.txt", "file_dl.txt") ``` --- diff --git a/man/adls.Rd b/man/adls.Rd new file mode 100644 index 0000000..b8849f1 --- /dev/null +++ b/man/adls.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/adls_client_funcs.R +\name{list_adls_files} +\alias{list_adls_files} +\alias{upload_adls_file} +\alias{download_adls_file} +\alias{delete_adls_file} +\alias{create_adls_dir} +\alias{delete_adls_dir} +\title{Operations on an Azure Data Lake Storage Gen2 filesystem} +\usage{ +list_adls_files(filesystem, dir = "/", info = c("all", "name"), + recursive = FALSE) + +upload_adls_file(filesystem, src, dest, blocksize = 2^24, lease = NULL) + +download_adls_file(filesystem, src, dest, overwrite = FALSE) + +delete_adls_file(filesystem, file, confirm = TRUE) + +create_adls_dir(filesystem, dir) + +delete_adls_dir(filesystem, dir, recursive = FALSE, confirm = TRUE) +} +\arguments{ +\item{filesystem}{An ADLSgen2 filesystem object.} + +\item{dir, file}{A string naming a directory or file respectively.} + +\item{info}{Whether to return names only, or all information in a directory listing.} + +\item{recursive}{For \code{list_adls_files}, and \code{delete_adls_dir}, whether the operation should recurse through subdirectories. For \code{delete_adls_dir}, this must be TRUE to delete a non-empty directory.} + +\item{src, dest}{The source and destination filenames for uploading and downloading. Paths are allowed.} + +\item{blocksize}{The number of bytes to upload per HTTP(S) request.} + +\item{lease}{The lease for a file, if present.} + +\item{overwrite}{When downloading, whether to overwrite an existing destination file.} + +\item{confirm}{Whether to ask for confirmation on deleting a file or directory.} +} +\value{ +For \code{list_adls_files}, if \code{info="name"}, a vector of file/directory names. If \code{info="all"}, a data frame giving the file size and whether each object is a file or directory. +} +\description{ +Upload, download, or delete a file; list files in a directory; create or delete directories. 
+} +\examples{ +\dontrun{ + +fs <- adls_filesystem("https://mystorage.dfs.core.windows.net/myfilesystem", key="access_key") + +list_adls_files(fs, "/") + +create_adls_dir(fs, "/newdir") + +upload_adls_file(fs, "~/bigfile.zip", dest="/newdir/bigfile.zip") +download_adls_file(fs, "/newdir/bigfile.zip", dest="~/bigfile_downloaded.zip") + +delete_adls_file(fs, "/newdir/bigfile.zip") +delete_adls_dir(fs, "/newdir") + +} +} +\seealso{ +\link{adls_filesystem}, \link{az_storage} +} diff --git a/man/adls_filesystem.Rd b/man/adls_filesystem.Rd new file mode 100644 index 0000000..c017abe --- /dev/null +++ b/man/adls_filesystem.Rd @@ -0,0 +1,106 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/adls_client_funcs.R +\name{adls_filesystem} +\alias{adls_filesystem} +\alias{adls_filesystem.character} +\alias{adls_filesystem.adls_endpoint} +\alias{print.adls_filesystem} +\alias{list_adls_filesystems} +\alias{list_adls_filesystems.character} +\alias{list_adls_filesystems.adls_endpoint} +\alias{create_adls_filesystem} +\alias{create_adls_filesystem.character} +\alias{create_adls_filesystem.adls_filesystem} +\alias{create_adls_filesystem.adls_endpoint} +\alias{delete_adls_filesystem} +\alias{delete_adls_filesystem.character} +\alias{delete_adls_filesystem.adls_filesystem} +\alias{delete_adls_filesystem.adls_endpoint} +\title{Operations on an Azure Data Lake Storage Gen2 endpoint} +\usage{ +adls_filesystem(endpoint, ...) + +\method{adls_filesystem}{character}(endpoint, key = NULL, sas = NULL, + api_version = getOption("azure_adls_api_version"), ...) + +\method{adls_filesystem}{adls_endpoint}(endpoint, name, ...) + +\method{print}{adls_filesystem}(x, ...) + +list_adls_filesystems(endpoint, ...) + +\method{list_adls_filesystems}{character}(endpoint, key = NULL, + sas = NULL, api_version = getOption("azure_adls_api_version"), ...) + +\method{list_adls_filesystems}{adls_endpoint}(endpoint, ...) + +create_adls_filesystem(endpoint, ...) + +\method{create_adls_filesystem}{character}(endpoint, key = NULL, + sas = NULL, api_version = getOption("azure_adls_api_version"), ...) + +\method{create_adls_filesystem}{adls_filesystem}(endpoint, ...) + +\method{create_adls_filesystem}{adls_endpoint}(endpoint, name, ...) + +delete_adls_filesystem(endpoint, ...) + +\method{delete_adls_filesystem}{character}(endpoint, key = NULL, + sas = NULL, api_version = getOption("azure_adls_api_version"), ...) + +\method{delete_adls_filesystem}{adls_filesystem}(endpoint, ...) + +\method{delete_adls_filesystem}{adls_endpoint}(endpoint, name, + confirm = TRUE, ...) +} +\arguments{ +\item{endpoint}{Either an ADLSgen2 endpoint object as created by \link{storage_endpoint} or \link{adls_endpoint}, or a character string giving the URL of the endpoint.} + +\item{...}{Further arguments passed to lower-level functions.} + +\item{key, sas}{If an endpoint object is not supplied, authentication details. Currently the \code{sas} argument is unused.} + +\item{api_version}{If an endpoint object is not supplied, the storage API version to use when interacting with the host. Currently defaults to \code{"2018-06-17"}.} + +\item{name}{The name of the filesystem to get, create, or delete.} + +\item{x}{For the print method, an ADLSgen2 filesystem object.} + +\item{confirm}{For deleting a filesystem, whether to ask for confirmation.} +} +\value{ +For \code{adls_filesystem} and \code{create_adls_filesystem}, an S3 object representing an existing or created filesystem respectively. + +For \code{list_adls_filesystems}, a list of such objects.
+} +\description{ +Get, list, create, or delete ADLSgen2 filesystems. Currently (as of December 2018) ADLSgen2 is in general-access public preview. +} +\details{ +You can call these functions in a couple of ways: by passing the full URL of the filesystem, or by passing the endpoint object and the name of the filesystem as a string. + +If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the \emph{contents} of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. Full interoperability between blobs and ADLS is planned for 2019. +} +\examples{ +\dontrun{ + +endp <- adls_endpoint("https://mystorage.dfs.core.windows.net/", key="access_key") + +# list ADLSgen2 filesystems +list_adls_filesystems(endp) + +# get, create, and delete a filesystem +adls_filesystem(endp, "myfs") +create_adls_filesystem(endp, "newfs") +delete_adls_filesystem(endp, "newfs") + +# alternative way to do the same +adls_filesystem("https://mystorage.dfs.core.windows.net/myfs", key="access_key") +create_adls_filesystem("https://mystorage.dfs.core.windows.net/newfs", key="access_key") +delete_adls_filesystem("https://mystorage.dfs.core.windows.net/newfs", key="access_key") + +} +} +\seealso{ +\link{storage_endpoint}, \link{az_storage} +} diff --git a/man/blob_container.Rd b/man/blob_container.Rd index 5ed578f..ad48390 100644 --- a/man/blob_container.Rd +++ b/man/blob_container.Rd @@ -86,6 +86,8 @@ Get, list, create, or delete blob containers. } \details{ You can call these functions in a couple of ways: by passing the full URL of the share, or by passing the endpoint object and the name of the container as a string. + +If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the \emph{contents} of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. } \examples{ \dontrun{ diff --git a/man/create_storage_account.Rd b/man/create_storage_account.Rd index 9e1a60f..9557964 100644 --- a/man/create_storage_account.Rd +++ b/man/create_storage_account.Rd @@ -22,7 +22,7 @@ Method for the \link[AzureRMR:az_resource_group]{AzureRMR::az_resource_group} cl \item \code{replication}: The replication strategy for the account. The default is locally-redundant storage (LRS). \item \code{access_tier}: The access tier, either \code{"hot"} or \code{"cool"}, for blobs. \item \code{https_only}: Whether a HTTPS connection is required to access the storage. -\item \code{hierarchical_namespace_enabled}: Whether to enable hierarchical namespaces, which are a feature of Azure Data Lake Storage Gen 2 and provide more a efficient way to manage storage. Note that ADLS Gen2 is currently (as of November 2018) in limited-access public preview. if you are not enrolled in the preview program, this argument has no effect. +\item \code{hierarchical_namespace_enabled}: Whether to enable hierarchical namespaces, which are a feature of Azure Data Lake Storage Gen 2 and provide a more efficient way to manage storage. ADLS Gen2 is currently (as of December 2018) in general-access public preview.
\item \code{properties}: A list of other properties for the storage account. \item ... Other named arguments to pass to the \link{az_storage} initialization function. } @@ -39,6 +39,8 @@ This method deploys a new storage account resource, with parameters given by the } Accounts created with \code{kind = "BlobStorage"} can only host blob storage, while those with \code{kind = "FileStorage"} can only host file storage. Accounts with \code{kind = "StorageV2"} can host all types of storage. Currently, AzureStor provides an R interface only to blob and file storage. + +If hierarchical namespaces are enabled, there is no interoperability of the blob and ADLSgen2 storage systems. Blob containers will show up in listings of ADLS filesystems, and vice-versa, but the \emph{contents} of the storage are independent: files that are uploaded as blobs cannot be accessed via ADLS methods, and similarly, files and directories created via ADLS will be invisible to blob methods. Full interoperability between blobs and ADLS is planned for 2019. } \section{Value}{ @@ -65,6 +67,9 @@ rg$create_storage_account("myblobstorage", } } \seealso{ -\link{get_storage_account}, \link{delete_storage_account}, \link{az_storage}, -\href{https://docs.microsoft.com/en-us/rest/api/storagerp/}{Azure Storage Provider API reference} +\link{get_storage_account}, \link{delete_storage_account}, \link{az_storage} + +\href{https://docs.microsoft.com/en-us/azure/storage/}{Azure Storage documentation}, +\href{https://docs.microsoft.com/en-us/rest/api/storagerp/}{Azure Storage Provider API reference}, +\href{https://docs.microsoft.com/en-us/azure/storage/data-lake-storage/namespace}{Azure Data Lake Storage hierarchical namespaces} } diff --git a/man/storage_endpoint.Rd b/man/storage_endpoint.Rd index 7d57a6f..f9db44d 100644 --- a/man/storage_endpoint.Rd +++ b/man/storage_endpoint.Rd @@ -7,7 +7,9 @@ \alias{file_endpoint} \alias{queue_endpoint} \alias{table_endpoint} +\alias{adls_endpoint} \alias{print.storage_endpoint} +\alias{print.adls_endpoint} \title{Create a storage endpoint object} \usage{ storage_endpoint(endpoint, key = NULL, sas = NULL, @@ -19,29 +21,36 @@ blob_endpoint(endpoint, key = NULL, sas = NULL, file_endpoint(endpoint, key = NULL, sas = NULL, api_version = getOption("azure_storage_api_version")) +adls_endpoint(endpoint, key = NULL, sas = NULL, + api_version = getOption("azure_adls_api_version")) + \method{print}{storage_endpoint}(x, ...) + +\method{print}{adls_endpoint}(x, ...) } \arguments{ -\item{endpoint}{The URL (hostname) for the endpoint. This must be of the form \code{http[s]://{account-name}.{type}.{core-host-name}}, where \code{type} is one of \code{"blob"}, \code{"file"}, \code{"queue"} or \code{"table"}. On the public Azure cloud, endpoints will be of the form \code{https://{account-name}.{type}.core.windows.net}.} +\item{endpoint}{The URL (hostname) for the endpoint. This must be of the form \code{http[s]://{account-name}.{type}.{core-host-name}}, where \code{type} is one of \code{"dfs"} (corresponding to ADLSgen2), \code{"blob"}, \code{"file"}, \code{"queue"} or \code{"table"}. On the public Azure cloud, endpoints will be of the form \code{https://{account-name}.{type}.core.windows.net}.} \item{key}{The access key for the storage account.} -\item{sas}{A shared access signature (SAS) for the account. If \code{key} is also provided, the SAS is not used. 
If neither \code{key} nor \code{sas} are provided, only public (anonymous) access to the endpoint is possible.} +\item{sas}{A shared access signature (SAS) for the account. If \code{key} is also provided, the SAS is not used. If neither \code{key} nor \code{sas} are provided, only public (anonymous) access to the endpoint is possible. Note that authentication with a SAS is not supported by ADLSgen2.} -\item{api_version}{The storage API version to use when interacting with the host. Currently defaults to \code{"2018-03-28"}.} +\item{api_version}{The storage API version to use when interacting with the host. Defaults to \code{"2018-06-17"} for the ADLSgen2 endpoint, and \code{"2018-03-28"} for the others.} \item{x}{For the print method, a storage endpoint object.} \item{...}{For the print method, further arguments passed to lower-level functions.} } \value{ -\code{storage_endpoint} returns an object of S3 class \code{"blob_endpoint"}, \code{"file_endpoint"}, \code{"queue_endpoint"} or \code{"table_endpoint"} depending on the type of endpoint. All of these also inherit from class \code{"storage_endpoint"}. \code{blob_endpoint} and \code{file_endpoint} return an object of the respective type. +\code{storage_endpoint} returns an object of S3 class \code{"adls_endpoint"}, \code{"blob_endpoint"}, \code{"file_endpoint"}, \code{"queue_endpoint"} or \code{"table_endpoint"} depending on the type of endpoint. All of these also inherit from class \code{"storage_endpoint"}. \code{adls_endpoint}, \code{blob_endpoint} and \code{file_endpoint} return an object of the respective class. + +Currently AzureStor only includes methods for interacting with ADLSgen2 (experimental), blob and file storage. } \description{ -Create a storage endpoint object +Create a storage endpoint object, for interacting with blob, file, table, queue or ADLSgen2 storage. Currently (as of December 2018) ADLSgen2 is in general-access public preview. } \details{ -This is the starting point for the client-side storage interface in AzureRMR. \code{storage_endpoint} is a generic function to create an endpoint for any type of Azure storage while \code{blob_endpoint} and \code{file_endpoint} create endpoints for those types. +This is the starting point for the client-side storage interface in AzureStor. \code{storage_endpoint} is a generic function to create an endpoint for any type of Azure storage while \code{adls_endpoint}, \code{blob_endpoint} and \code{file_endpoint} create endpoints for those types.
} \examples{ \dontrun{ @@ -55,5 +64,5 @@ endp <- blob_endpoint("https://mystorage.blob.core.windows.net/", key="access_ke } } \seealso{ -\link{az_storage}, \link{file_share}, \link{create_file_share}, \link{blob_container}, \link{create_blob_container} +\link{az_storage}, \link{adls_filesystem}, \link{create_adls_filesystem}, \link{file_share}, \link{create_file_share}, \link{blob_container}, \link{create_blob_container} } diff --git a/tests/testthat/test04_adls.R b/tests/testthat/test04_adls.R new file mode 100644 index 0000000..09e0de9 --- /dev/null +++ b/tests/testthat/test04_adls.R @@ -0,0 +1,109 @@ +context("ADLSgen2 client interface") + +tenant <- Sys.getenv("AZ_TEST_TENANT_ID") +app <- Sys.getenv("AZ_TEST_APP_ID") +password <- Sys.getenv("AZ_TEST_PASSWORD") +subscription <- Sys.getenv("AZ_TEST_SUBSCRIPTION") + +if(tenant == "" || app == "" || password == "" || subscription == "") + skip("Authentication tests skipped: ARM credentials not set") + + +sub <- az_rm$new(tenant=tenant, app=app, password=password)$get_subscription(subscription) +rgname <- paste(sample(letters, 20, replace=TRUE), collapse="") +rg <- sub$create_resource_group(rgname, location="australiaeast") + +test_that("ADLSgen2 client interface works", +{ + + storname <- paste(sample(letters, 20, replace=TRUE), collapse="") + stor <- rg$create_storage_account(storname, hierarchical_namespace_enabled=TRUE) + + # wait until provisioning is complete + for(i in 1:100) + { + Sys.sleep(5) + state <- stor$sync_fields() + if(state == "Succeeded") + break + } + if(state != "Succeeded") + stop("Unable to create storage account") + + ad <- stor$get_adls_endpoint() + ad2 <- adls_endpoint(stor$properties$primaryEndpoints$dfs, key=stor$list_keys()[1]) + expect_is(ad, "adls_endpoint") + expect_identical(ad, ad2) + + expect_true(is_empty(list_adls_filesystems(ad))) + + # listing files in a nonexistent filesystem + expect_error(list_adls_files(adls_filesystem(ad, "newfs"))) + + # ways of creating a filesystem + fs <- adls_filesystem(ad, "newfs1") + create_adls_filesystem(fs) + create_adls_filesystem(ad, "newfs2") + create_adls_filesystem(paste0(ad$url, "newfs3"), key=ad$key) + + lst <- list_adls_filesystems(ad) + # list will include $xns autogenerated filesystem if hierarchical namespace enabled + expect_true(is.list(lst) && inherits(lst[[1]], "adls_filesystem") && length(lst) == 4) + + expect_identical(fs, lst[["newfs1"]]) + + expect_true(is_empty(list_adls_files(fs, "/", info="name"))) + orig_file <- "../resources/iris.csv" + new_file <- file.path(tempdir(), "iris.csv") + upload_adls_file(fs, orig_file, "iris.csv", blocksize=1000) + + expect_is(list_adls_files(fs, "/"), "data.frame") + expect_is(list_adls_files(fs, "/", info="name"), "character") + + # download with and without overwrite + suppressWarnings(file.remove(new_file)) + download_adls_file(fs, "iris.csv", new_file) + expect_error(download_adls_file(fs, "iris.csv", new_file, overwrite=FALSE)) + writeLines("foo", new_file) + expect_silent(download_adls_file(fs, "iris.csv", new_file, overwrite=TRUE)) + expect_identical(readBin(orig_file, "raw", n=1e5), readBin(new_file, "raw", n=1e5)) + + # directory manipulation + create_adls_dir(fs, "dir1") + create_adls_dir(fs, "/dir_with_root") + create_adls_dir(fs, "dir1/dir2") + + upload_adls_file(fs, orig_file, "dir1/iris.csv") + upload_adls_file(fs, orig_file, "/dir_with_root/iris.csv") + upload_adls_file(fs, orig_file, "/dir1/dir2/iris.csv") + + suppressWarnings(file.remove(new_file)) + download_adls_file(fs, "/dir1/iris.csv", new_file) 
+ suppressWarnings(file.remove(new_file)) + download_adls_file(fs, "dir_with_root/iris.csv", new_file) + suppressWarnings(file.remove(new_file)) + download_adls_file(fs, "dir1/dir2/iris.csv", new_file) + expect_identical(readBin(orig_file, "raw", n=1e5), readBin(new_file, "raw", n=1e5)) + + # exact-100k upload: check content-size header works + file_100k <- "../resources/100k.out" + upload_adls_file(fs, file_100k, "100k_single.out") + upload_adls_file(fs, file_100k, "100k_blocked.out", blocksize=1e4) + single_dl <- file.path(tempdir(), "100k_single.out") + blocked_dl <- file.path(tempdir(), "100k_blocked.out") + suppressWarnings(file.remove(single_dl, blocked_dl)) + download_adls_file(fs, "100k_single.out", single_dl) + download_adls_file(fs, "100k_blocked.out", blocked_dl) + expect_identical(readBin(file_100k, "raw", n=2e5), readBin(single_dl, "raw", n=2e5)) + expect_identical(readBin(file_100k, "raw", n=2e5), readBin(blocked_dl, "raw", n=2e5)) + + # ways of deleting a filesystem + delete_adls_filesystem(fs, confirm=FALSE) + delete_adls_filesystem(ad, "newfs2", confirm=FALSE) + delete_adls_filesystem(paste0(ad$url, "newfs3"), key=ad$key, confirm=FALSE) + Sys.sleep(5) + # $xns autogenerated filesystem will remain if hierarchical namespace enabled + expect_true(length(list_adls_filesystems(ad)) == 1) +}) + +rg$delete(confirm=FALSE) diff --git a/vignettes/intro.rmd b/vignettes/intro.rmd index 3acb885..95c1267 100644 --- a/vignettes/intro.rmd +++ b/vignettes/intro.rmd @@ -103,7 +103,7 @@ rg$delete_storage_account("mynewblobstorage") Perhaps the more relevant part of AzureStor for most users is its client interface to storage. With this, you can upload and download files and blobs, create containers and shares, list files, and so on. Unlike the ARM interface, the client interface uses S3 classes. This is for a couple of reasons: it is more familiar to most R users, and it is consistent with most other data manipulation packages in R, in particular the [tidyverse](https://tidyverse.org/). -The starting point for client access is the `storage_endpoint` object, which stores information about the endpoint of a storage account: the URL that you use to access storage, along with any authentication information needed. The easiest way to obtain an endpoint object is via the storage account resource object's `get_blob_endpoint()` and `get_file_endpoint()` methods: +The starting point for client access is the `storage_endpoint` object, which stores information about the endpoint of a storage account: the URL that you use to access storage, along with any authentication information needed. The easiest way to obtain an endpoint object is via the storage account resource object's `get_blob_endpoint()`, `get_file_endpoint()` and `get_adls_endpoint()` methods: ```{r, eval=FALSE} # create the storage account @@ -126,11 +126,18 @@ stor$get_file_endpoint() # Access key: <hidden> # Account shared access signature: <none> # Storage API version: 2018-03-28 + +stor$get_adls_endpoint() +# Azure Data Lake Storage Gen2 endpoint +# URL: https://mynewstorage.dfs.core.windows.net/ +# Access key: <hidden> +# Account shared access signature: <none> +# Storage API version: 2018-06-17 ```
+This shows that the base URL to access blob storage is https://mynewstorage.blob.core.windows.net/, for file storage it is https://mynewstorage.file.core.windows.net/, and for Azure Data Lake Storage Gen2, it is https://mynewstorage.dfs.core.windows.net/. While it's not displayed, the endpoint objects also include the access key necessary for authenticated access to storage; this is obtained directly from the storage account resource. -More practically, you will usually want to work with a storage endpoint without having to go through the process of authenticating with Azure Resource Manager. Often, you may not have any ARM credentials to start with (a tenant ID and service principal details). In this case, you can create the endpoint object directly with the `blob_endpoint()` and `file_endpoint()` functions. When create the endpoint this way, you have to provide the access key explicitly (assuming you know what it is). +More practically, you will usually want to work with a storage endpoint without having to go through the process of authenticating with Azure Resource Manager. Often, you may not have any ARM credentials to start with (a tenant ID and/or service principal details). In this case, you can create the endpoint object directly with the `blob_endpoint()`, `file_endpoint()` and `adls_endpoint()` functions. When you create the endpoint this way, you have to provide the access key explicitly (assuming you know what it is). ```{r, eval=FALSE} # same as using the get_xxxx_endpoint() methods above @@ -138,6 +145,8 @@ blob_endpoint("https://mynewstorage.blob.core.windows.net/", key="mystorageaccesskey") file_endpoint("https://mynewstorage.file.core.windows.net/", key="mystorageaccesskey") +adls_endpoint("https://mynewstorage.dfs.core.windows.net/", + key="mystorageaccesskey") ``` Instead of an access key, you can provide a [shared access signature (SAS)](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1) to gain authenticated access. The main difference between using a key and a SAS is that the former unlocks access to the _entire_ storage account. A user who has a key can access all containers and files, and can transfer, modify and delete data without restriction. On the other hand, a user with a SAS can be limited to have access only to specific containers, or be limited to read access, or only for a given span of time, and so on. This is usually much better in terms of security. @@ -156,20 +165,22 @@ sas <- stor$get_account_sas(permissions="rw", blob_endp <- stor$get_blob_endpoint(key=NULL, sas=sas) ``` -If you don't have a key or a SAS, you will only have access to unauthenticated (public) containers and file shares. +If you don't have a key or a SAS, you will only have access to unauthenticated (public) containers. -### Container and object access: blob containers, file shares, blobs, files +### Container and object access: blob containers, file shares, ADLS filesystems, blobs, files -Given an endpoint object, AzureStor provides the following methods for working with containers: +The client interface for AzureStor supports blob storage, file storage, and Azure Data Lake Storage Gen 2. All of these types have some features in common with each other. In particular, the storage within each type is organised into containers: blob _containers_, file _shares_, and ADLSgen2 _filesystems_. 
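To preview how these parallel interfaces line up in practice, here is a minimal sketch covering all three types. This is illustrative only: the endpoint URLs, account key and container names are assumptions, not taken from a real account.

```{r, eval=FALSE}
# endpoints for the three storage types (illustrative account and key)
bl <- blob_endpoint("https://mynewstorage.blob.core.windows.net/", key="mystorageaccesskey")
fl <- file_endpoint("https://mynewstorage.file.core.windows.net/", key="mystorageaccesskey")
ad <- adls_endpoint("https://mynewstorage.dfs.core.windows.net/", key="mystorageaccesskey")

# the same create/list pattern applies to each type of container
cont <- create_blob_container(bl, "mycontainer")
share <- create_file_share(fl, "myshare")
fs <- create_adls_filesystem(ad, "myfilesystem")

list_blob_containers(bl)
list_file_shares(fl)
list_adls_filesystems(ad)
```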
Given an endpoint object, AzureStor provides the following methods for working with containers: - `blob_container, create_blob_container, delete_blob_container`: get an existing blob container, create a new container, and delete a container - `list_blob_containers`: return a list of blob container objects - `file_share, create_file_share, delete_file_share`: get an existing file share, create a new share, and delete a share - `list_file_shares`: return a list of file share objects +- `adls_filesystem, create_adls_filesystem, delete_adls_filesystem`: get an existing ADLSgen2 filesystem, create a new filesystem, and delete a filesystem +- `list_adls_filesystems`: return a list of ADLSgen2 filesystem objects -You can only use the methods corresponding to a given endpoint type. For example, it's an error to try to list the file shares for a blob endpoint. +You can only use the methods corresponding to a given endpoint type. For example, it's an error to try to list the file shares for a blob endpoint, or create a blob container within an ADLSgen2 endpoint. -Here is some example blob container code showing their use. The file share code is similar, except that it doesn't allow any form of unauthenticated access. +Here is some example blob container code showing their use. The file share and ADLSgen2 filesystem code is similar, except that they don't allow any form of unauthenticated access. ```{r, eval=FALSE} # an existing container @@ -211,9 +222,11 @@ blob_container("https://mynewstorage.blob.core.windows.net/mycontainer", key="mystorageaccountkey") file_share("https://mynewstorage.file.core.windows.net/myshare", key="mystorageaccountkey") +adls_filesystem("https://mynewstorage.dfs.core.windows.net/myfilesystem", + key="mystorageaccountkey") ``` -Given a blob container or file share object, use the `list_blobs()` and `list_azure_files()` functions to list the storage objects they contain. Note the "azure" in `list_azure_files` to avoid any confusion with R's regular `list.files` function. +The `list_blobs()`, `list_azure_files()` and `list_adls_files()` functions will list the storage objects within a container of the requisite type. Note the "azure" and "adls" in `list_azure_files` and `list_adls_files` to avoid confusion with R's regular `list.files` function. ```{r, eval=FALSE} # list blobs inside a blob container list_blobs(cont) @@ -227,11 +240,18 @@ list_blobs(cont, info="name") # [1] "fs.txt" "fs2.txt" -# and for files inside a file share +# files inside a file share list_azure_files(share, "/") # name type size # 1 100k.txt File 100000 # 2 fs.txt File 132 + + +# and files inside an ADLS filesystem +list_adls_files(fs, "/") +# name contentLength isDirectory lastModified permissions +# 1 blog.html 27128 FALSE Mon, 03 Dec 2018 15:20:31 GMT rw-r----- +# 2 newdir 0 TRUE Thu, 29 Nov 2018 03:42:56 GMT rwxr-x--- ``` To transfer files and blobs, use the following functions: @@ -265,7 +285,7 @@ download_from_url("https://mynewstorage.blob.core.windows.net/mycontainer/myblob", overwrite=TRUE) ``` -File shares have the additional feature of supporting directories. To create and delete directories, use `create_azure_dir()` and `delete_azure_dir()`: +File shares and ADLS filesystems have the additional feature of supporting directories. To create and delete directories, use `create_azure_dir()` and `delete_azure_dir()` for a file share, and `create_adls_dir()` and `delete_adls_dir()` for an ADLS filesystem.
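For an ADLS filesystem, the directory workflow might look like the following sketch (assuming an ADLS filesystem object `fs`, created for instance via `adls_filesystem()` as shown earlier; the file and directory names are illustrative):

```{r, eval=FALSE}
# create a directory, put a file in it, then clean up
create_adls_dir(fs, "newdir")
upload_adls_file(fs, "~/mydata.csv", "newdir/mydata.csv")
list_adls_files(fs, "newdir")
delete_adls_file(fs, "newdir/mydata.csv", confirm=FALSE)
delete_adls_dir(fs, "newdir", confirm=FALSE)
```

The corresponding file share workflow: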
```{r, eval=FALSE} list_azure_files(share, "/") @@ -287,7 +307,5 @@ list_azure_files(share, "/") delete_azure_dir(share, "newdir") ``` -### Limitations - -Currently, the client interface for AzureStor only supports blob and file storage; for blobs, only block-blob uploading is supported. Support for other storage types, in particular Data Lake Storage Gen2, is planned. +For more information about the different types of storage, see the [Microsoft Docs site](https://docs.microsoft.com/en-us/azure/storage/). Note that there are other types of storage (queue, table) that do not have a client interface exposed by AzureStor.
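Finally, as a recap, an end-to-end ADLSgen2 session with the client interface might look like the sketch below. The account, filesystem and file names are all illustrative, and `rg` is the resource group object from the ARM examples at the start of this vignette.

```{r, eval=FALSE}
# create an account with hierarchical namespaces enabled
# (in practice, you may need to wait for provisioning to complete)
stor <- rg$create_storage_account("myadlsstorage", hierarchical_namespace_enabled=TRUE)

# drill down: endpoint -> filesystem -> directory -> file
ad <- stor$get_adls_endpoint()
fs <- create_adls_filesystem(ad, "myfilesystem")
create_adls_dir(fs, "mydir")
upload_adls_file(fs, "~/mydata.csv", "mydir/mydata.csv")
download_adls_file(fs, "mydir/mydata.csv", "~/mydata_dl.csv", overwrite=TRUE)

# clean up
delete_adls_filesystem(fs, confirm=FALSE)
rg$delete_storage_account("myadlsstorage")
```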