diff --git a/NAMESPACE b/NAMESPACE index 3c8f5c5..6135e0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ S3method(adls_filesystem,adls_endpoint) S3method(adls_filesystem,character) S3method(blob_container,blob_endpoint) S3method(blob_container,character) +S3method(copy_url_to_storage,blob_container) S3method(create_adls_filesystem,adls_endpoint) S3method(create_adls_filesystem,adls_filesystem) S3method(create_adls_filesystem,character) @@ -90,6 +91,8 @@ export(blob_endpoint) export(break_lease) export(call_azcopy) export(change_lease) +export(copy_url_to_blob) +export(copy_url_to_storage) export(create_adls_dir) export(create_adls_filesystem) export(create_azure_dir) diff --git a/R/add_methods.R b/R/add_methods.R index d6f9d5d..7258af8 100644 --- a/R/add_methods.R +++ b/R/add_methods.R @@ -10,7 +10,7 @@ #' @section Usage: #' ``` #' create_storage_account(name, location, kind = "StorageV2", replication = "Standard_LRS", -#' access_tier = "hot"), https_only = TRUE, +#' access_tier = "hot"), https_only = TRUE, #' hierarchical_namespace_enabled = FALSE, properties = list(), ...) #' ``` #' @section Arguments: @@ -27,7 +27,7 @@ #' @section Details: #' This method deploys a new storage account resource, with parameters given by the arguments. A storage account can host multiple types of storage: #' - blob storage -#' - file storage +#' - file storage #' - table storage #' - queue storage #' - Azure Data Lake Storage Gen2 diff --git a/R/adls_client_funcs.R b/R/adls_client_funcs.R index 8b1b27b..0e0d584 100644 --- a/R/adls_client_funcs.R +++ b/R/adls_client_funcs.R @@ -118,7 +118,7 @@ list_adls_filesystems.character <- function(endpoint, key=NULL, token=NULL, sas= list_adls_filesystems.adls_endpoint <- function(endpoint, ...) 
{ lst <- do_storage_call(endpoint$url, "/", options=list(resource="account"), - key=endpoint$key, token=endpoint$token,, sas=endpoint$sas, + key=endpoint$key, token=endpoint$token, sas=endpoint$sas, api_version=endpoint$api_version) sapply(lst$filesystems$name, function(fs) adls_filesystem(endpoint, fs), simplify=FALSE) @@ -209,7 +209,7 @@ delete_adls_filesystem.adls_endpoint <- function(endpoint, name, confirm=TRUE, . #' @param filesystem An ADLSgen2 filesystem object. #' @param dir,file A string naming a directory or file respectively. #' @param info Whether to return names only, or all information in a directory listing. -#' @param src,dest The source and destination files for uploading and downloading. Paths are allowed. For uploading, `src` can also be a [textConnection] or [rawConnection] object to allow transferring in-memory R objects without creating a temporary file. +#' @param src,dest The source and destination paths/files for uploading and downloading. See 'Details' below. #' @param confirm Whether to ask for confirmation on deleting a file or directory. #' @param blocksize The number of bytes to upload/download per HTTP(S) request. #' @param lease The lease for a file, if present. 
diff --git a/R/adls_transfer_internal.R b/R/adls_transfer_internal.R index 4482c27..0cc9c6c 100644 --- a/R/adls_transfer_internal.R +++ b/R/adls_transfer_internal.R @@ -122,7 +122,7 @@ download_adls_file_internal <- function(filesystem, src, dest, blocksize=2^24, o } if(conn_dest) on.exit(seek(dest, 0)) - + # get file size (for progress bar) res <- do_container_op(filesystem, src, headers=headers, http_verb="HEAD", http_status_handler="pass") httr::stop_for_status(res, storage_error_message(res)) diff --git a/R/blob_client_funcs.R b/R/blob_client_funcs.R index 4709b55..3b5e77c 100644 --- a/R/blob_client_funcs.R +++ b/R/blob_client_funcs.R @@ -225,7 +225,7 @@ delete_blob_container.blob_endpoint <- function(endpoint, name, confirm=TRUE, le #' #' @param container A blob container object. #' @param blob A string naming a blob. -#' @param src,dest The source and destination files for uploading and downloading. See 'Details' below.For uploading, `src` can also be a [textConnection] or [rawConnection] object to allow transferring in-memory R objects without creating a temporary file. For downloading, +#' @param src,dest The source and destination files for uploading and downloading. See 'Details' below. #' @param info For `list_blobs`, level of detail about each blob to return: a vector of names only; the name, size and last-modified date (default); or all information. #' @param confirm Whether to ask for confirmation on deleting a blob. #' @param blocksize The number of bytes to upload/download per HTTP(S) request. 
#' @details
#' `copy_url_to_storage` transfers the contents of the file at the specified HTTP\[S\] URL directly to storage, without requiring a temporary local copy to be made. Currently this is only implemented for blob storage.
#' @rdname file_transfer
#' @export
copy_url_to_storage <- function(container, src, dest, ...)
{
    # S3 dispatch on the class of `container`. The string passed to UseMethod must be the
    # generic's own name: with any other name R searches for methods of that other generic
    # and copy_url_to_storage.blob_container below is never reached.
    UseMethod("copy_url_to_storage")
}


#' @rdname file_transfer
#' @export
copy_url_to_storage.blob_container <- function(container, src, dest, ...)
{
    # blob storage method: forward everything, including extra arguments, to the blob-specific function
    copy_url_to_blob(container, src, dest, ...)
}


#' @param async For `copy_url_to_blob`, whether the copy operation should be asynchronous (proceed in the background).
#' @details
#' `copy_url_to_blob` transfers the contents of the file at the specified HTTP\[S\] URL directly to blob storage, without requiring a temporary local copy to be made. This has a current file size limit of 256MB.
#' @rdname blob
#' @export
copy_url_to_blob <- function(container, src, dest, lease=NULL, async=FALSE)
{
    # only a URL source is accepted; anything else is rejected before any request is made
    if(!is_url(src))
        stop("Source must be a HTTP[S] url", call.=FALSE)

    # the source URL travels in a request header; x-ms-requires-sync is the negation of `async`
    headers <- list(
        `x-ms-copy-source`=src,
        `x-ms-requires-sync`=!async
    )
    # the lease ID header is only sent when the caller supplies a lease
    if(!is.null(lease))
        headers[["x-ms-lease-id"]] <- as.character(lease)

    # PUT against the destination blob name inside the container
    do_container_op(container, dest, headers=headers, http_verb="PUT")
}
@@ -195,7 +195,7 @@ delete_file_share.file_endpoint <- function(endpoint, name, confirm=TRUE, ...) #' @param share A file share object. #' @param dir,file A string naming a directory or file respectively. #' @param info Whether to return names only, or all information in a directory listing. -#' @param src,dest The source and destination files for uploading and downloading. For uploading, `src` can also be a [textConnection] or [rawConnection] object to allow transferring in-memory R objects without creating a temporary file. +#' @param src,dest The source and destination files for uploading and downloading. See 'Details' below. #' @param confirm Whether to ask for confirmation on deleting a file or directory. #' @param blocksize The number of bytes to upload/download per HTTP(S) request. #' @param overwrite When downloading, whether to overwrite an existing destination file. @@ -276,7 +276,7 @@ list_azure_files <- function(share, dir, info=c("all", "name"), name <- vapply(lst$Entries, function(ent) ent$Name[[1]], FUN.VALUE=character(1)) if(info == "name") return(name) - + type <- if(is_empty(name)) character(0) else names(name) size <- vapply(lst$Entries, function(ent) if(is_empty(ent$Properties)) NA_character_ diff --git a/R/pool.R b/R/pool.R index eb6e7ad..6bd633d 100644 --- a/R/pool.R +++ b/R/pool.R @@ -45,7 +45,7 @@ delete_pool <- function() { if(!exists("pool", envir=.AzureStor)) return() - + message("Deleting background pool") parallel::stopCluster(.AzureStor$pool) rm(pool, envir=.AzureStor) diff --git a/R/transfer_generics.R b/R/transfer_generics.R index 5d43eca..e225d72 100644 --- a/R/transfer_generics.R +++ b/R/transfer_generics.R @@ -13,7 +13,7 @@ #' #' `upload_to_url` and `download_to_url` allow you to transfer a file to or from Azure storage, given the URL of the source or destination. The storage details (endpoint, container name, and so on) are obtained from the URL. 
#' -#' By default, `storage_download` and `download_from_url` will display a progress bar while they are downloading. To turn this off, use `options(azure_storage_progress_bar=FALSE)`. To turn the progress bar back on, use `options(azure_storage_progress_bar=TRUE)`. +#' By default, the upload and download functions will display a progress bar while they are downloading. To turn this off, use `options(azure_storage_progress_bar=FALSE)`. To turn the progress bar back on, use `options(azure_storage_progress_bar=TRUE)`. #' #' @seealso #' [storage_container], [blob_container], [file_share], [adls_filesystem] diff --git a/man/adls.Rd b/man/adls.Rd index 2d4c636..d8c925a 100644 --- a/man/adls.Rd +++ b/man/adls.Rd @@ -42,7 +42,7 @@ delete_adls_dir(filesystem, dir, recursive = FALSE, confirm = TRUE) \item{recursive}{For \code{list_adls_files}, and \code{delete_adls_dir}, whether the operation should recurse through subdirectories. For \code{delete_adls_dir}, this must be TRUE to delete a non-empty directory.} -\item{src, dest}{The source and destination files for uploading and downloading. Paths are allowed. For uploading, \code{src} can also be a \link{textConnection} or \link{rawConnection} object to allow transferring in-memory R objects without creating a temporary file.} +\item{src, dest}{The source and destination paths/files for uploading and downloading. 
See 'Details' below.} \item{blocksize}{The number of bytes to upload/download per HTTP(S) request.} diff --git a/man/blob.Rd b/man/blob.Rd index 26424ef..5af3249 100644 --- a/man/blob.Rd +++ b/man/blob.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/blob_client_funcs.R +% Please edit documentation in R/blob_client_funcs.R, R/blob_copyurl.R \name{list_blobs} \alias{list_blobs} \alias{upload_blob} @@ -7,6 +7,7 @@ \alias{download_blob} \alias{multidownload_blob} \alias{delete_blob} +\alias{copy_url_to_blob} \title{Operations on a blob container or blob} \usage{ list_blobs(container, info = c("partial", "name", "all"), @@ -27,6 +28,8 @@ multidownload_blob(container, src, dest, blocksize = 2^24, max_concurrent_transfers = 10) delete_blob(container, blob, confirm = TRUE) + +copy_url_to_blob(container, src, dest, lease = NULL, async = FALSE) } \arguments{ \item{container}{A blob container object.} @@ -35,7 +38,7 @@ delete_blob(container, blob, confirm = TRUE) \item{prefix}{For \code{list_blobs}, filters the result to return only blobs whose name begins with this prefix.} -\item{src, dest}{The source and destination files for uploading and downloading. See 'Details' below.For uploading, \code{src} can also be a \link{textConnection} or \link{rawConnection} object to allow transferring in-memory R objects without creating a temporary file. For downloading,} +\item{src, dest}{The source and destination files for uploading and downloading. See 'Details' below.} \item{type}{When uploading, the type of blob to create. Currently only block blobs are supported.} @@ -52,6 +55,8 @@ delete_blob(container, blob, confirm = TRUE) \item{blob}{A string naming a blob.} \item{confirm}{Whether to ask for confirmation on deleting a blob.} + +\item{async}{For \code{copy_url_to_blob}, whether the copy operation should be asynchronous (proceed in the background).} } \value{ For \code{list_blobs}, details on the blobs in the container. 
For \code{download_blob}, if \code{dest=NULL}, the contents of the downloaded blob as a raw vector. @@ -67,6 +72,8 @@ Upload, download, or delete a blob; list blobs in a container. The file transfer functions also support working with connections to allow transferring R objects without creating temporary files. For uploading, \code{src} can be a \link{textConnection} or \link{rawConnection} object. For downloading, \code{dest} can be NULL or a \code{rawConnection} object. In the former case, the downloaded data is returned as a raw vector, and for the latter, it will be placed into the connection. See the examples below. By default, the upload and download functions will display a progress bar to track the file transfer. To turn this off, use \code{options(azure_storage_progress_bar=FALSE)}. To turn the progress bar back on, use \code{options(azure_storage_progress_bar=TRUE)}. + +\code{copy_url_to_blob} transfers the contents of the file at the specified HTTP[S] URL directly to blob storage, without requiring a temporary local copy to be made. This has a current file size limit of 256MB. } \examples{ \dontrun{ @@ -102,6 +109,11 @@ con <- rawConnection(raw(0), "r+") download_blob(cont, "iris.rds", con) unserialize(con) +# copy from a public URL: Iris data from UCI machine learning repository +copy_url_to_blob(cont, + "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", + "iris.csv") + } } \seealso{ diff --git a/man/create_storage_account.Rd b/man/create_storage_account.Rd index 12510d6..5982643 100644 --- a/man/create_storage_account.Rd +++ b/man/create_storage_account.Rd @@ -8,7 +8,7 @@ Method for the \link[AzureRMR:az_resource_group]{AzureRMR::az_resource_group} cl } \section{Usage}{ \preformatted{create_storage_account(name, location, kind = "StorageV2", replication = "Standard_LRS", - access_tier = "hot"), https_only = TRUE, + access_tier = "hot"), https_only = TRUE, hierarchical_namespace_enabled = FALSE, properties = list(), ...) 
} } diff --git a/man/file.Rd b/man/file.Rd index 63dd8b7..e91b19b 100644 --- a/man/file.Rd +++ b/man/file.Rd @@ -41,7 +41,7 @@ delete_azure_dir(share, dir, confirm = TRUE) \item{prefix}{For \code{list_azure_files}, filters the result to return only files and directories whose name begins with this prefix.} -\item{src, dest}{The source and destination files for uploading and downloading. For uploading, \code{src} can also be a \link{textConnection} or \link{rawConnection} object to allow transferring in-memory R objects without creating a temporary file.} +\item{src, dest}{The source and destination files for uploading and downloading. See 'Details' below.} \item{blocksize}{The number of bytes to upload/download per HTTP(S) request.} diff --git a/man/file_transfer.Rd b/man/file_transfer.Rd index 82aaf28..9531a73 100644 --- a/man/file_transfer.Rd +++ b/man/file_transfer.Rd @@ -1,6 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/transfer_generics.R -\name{storage_upload} +% Please edit documentation in R/blob_copyurl.R, R/transfer_generics.R +\name{copy_url_to_storage} +\alias{copy_url_to_storage} +\alias{copy_url_to_storage.blob_container} \alias{storage_upload} \alias{storage_upload.blob_container} \alias{storage_upload.file_share} @@ -21,6 +23,10 @@ \alias{upload_to_url} \title{Upload and download generics} \usage{ +copy_url_to_storage(container, src, dest, ...) + +\method{copy_url_to_storage}{blob_container}(container, src, dest, ...) + storage_upload(container, ...) \method{storage_upload}{blob_container}(container, src, dest, ...) @@ -61,10 +67,10 @@ upload_to_url(src, dest, key = NULL, token = NULL, sas = NULL, ...) 
\arguments{ \item{container}{A storage container object.} -\item{...}{Further arguments to pass to lower-level functions.} - \item{src, dest}{The source and destination files to transfer.} +\item{...}{Further arguments to pass to lower-level functions.} + \item{key, token, sas}{Authentication arguments: an access key, Azure Active Directory (AAD) token or a shared access signature (SAS). If multiple arguments are supplied, a key takes priority over a token, which takes priority over a SAS. For \code{upload_to_url} and \code{download_to_url}, you can also provide a SAS as part of the URL itself.} \item{overwrite}{For downloading, whether to overwrite any destination files that exist.} @@ -73,13 +79,15 @@ upload_to_url(src, dest, key = NULL, token = NULL, sas = NULL, ...) Upload and download generics } \details{ +\code{copy_url_to_storage} transfers the contents of the file at the specified HTTP[S] URL directly to storage, without requiring a temporary local copy to be made. Currently this is only implemented for blob storage. + These functions allow you to transfer files to and from a storage account. \code{storage_upload}, \code{storage_download}, \code{storage_multiupload} and \code{storage_multidownload} take as first argument a storage container, either for blob storage, file storage, or ADLSgen2. They dispatch to the corresponding file transfer functions for the given storage type. \code{upload_to_url} and \code{download_to_url} allow you to transfer a file to or from Azure storage, given the URL of the source or destination. The storage details (endpoint, container name, and so on) are obtained from the URL. -By default, \code{storage_download} and \code{download_from_url} will display a progress bar while they are downloading. To turn this off, use \code{options(azure_storage_progress_bar=FALSE)}. To turn the progress bar back on, use \code{options(azure_storage_progress_bar=TRUE)}. 
test_that("copy from url works",
{
    endp <- stor$get_blob_endpoint()
    url_cont <- create_blob_container(endp, "urltransfer")

    # reference copy from the test resources, and the same file as served from the GitHub repo
    local_copy <- "../resources/iris.csv"
    remote_copy <- "https://raw.githubusercontent.com/Azure/AzureStor/master/tests/resources/iris.csv"
    downloaded <- tempfile()

    # copy the URL contents into the container, then pull the blob back down for comparison
    copy_url_to_blob(url_cont, remote_copy, "iris.csv", async=FALSE)
    download_blob(url_cont, "iris.csv", downloaded)

    # compare line by line with readLines: GitHub auto-translates CRLF -> LF,
    # so the two files are not expected to match byte for byte
    expect_identical(readLines(local_copy), readLines(downloaded))
})