Adds hierarchy functionality to list_blobs mentioned in issue #55 (#56)

* Adds hierarchy functionality to list_blobs mentioned in issue #55

* documentation

* update documentation for list_blobs

* Formatting fixes for suggestions.

* linting

* style

* by_hierarchy -> recursive

* test flipped

* add tests

* document

* consistent behaviour for recursive arg

* simplifying logic

Co-authored-by: Hans Van Slooten <HansVanSlooten@twinsbaseball.com>
Co-authored-by: Hong Ooi <hongooi@microsoft.com>
This commit is contained in:
Hans Van Slooten 2020-06-30 11:21:51 -05:00 коммит произвёл GitHub
Родитель 9136e555f7
Коммит 1a30a9c054
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 130 добавлений и 17 удалений

Просмотреть файл

@ -1,6 +1,7 @@
# AzureStor 3.2.2
- Fixes to the directory detection logic of `list_blobs`. Note that since blob storage doesn't have true directories, the `isdir` column of the `list_blobs` output should be treated as a best guess. For best results, avoid uploading zero-length files to blob storage, as this can cause problems for the service as a whole (not just AzureStor).
- Implement recursive/non-recursive directory listings for `list_blobs`, thanks to @cantpitch. Note that since blob storage doesn't have true directories, there are some warts to be aware of; see the help for `list_blobs` for more details.
- Fixes to the directory detection logic of `list_blobs`. Again, since blob storage doesn't have true directories, the `isdir` column of the `list_blobs` output should be treated as a best guess.
# AzureStor 3.2.1

Просмотреть файл

@ -235,7 +235,7 @@ delete_blob_container.blob_endpoint <- function(endpoint, name, confirm=TRUE, le
#' @param use_azcopy Whether to use the AzCopy utility from Microsoft to do the transfer, rather than doing it in R.
#' @param max_concurrent_transfers For `multiupload_blob` and `multidownload_blob`, the maximum number of concurrent file transfers. Each concurrent file transfer requires a separate R process, so limit this if you are low on memory.
#' @param prefix For `list_blobs`, an alternative way to specify the directory.
#' @param recursive This argument is for consistency with the methods for the other storage types. It is not used for blob storage.
#' @param recursive For the multiupload/download functions, whether to recursively transfer files in subdirectories. For `list_blobs`, whether to include the contents of any subdirectories in the listing.
#'
#' @details
#' `upload_blob` and `download_blob` are the workhorse file transfer functions for blobs. They each take as inputs a _single_ filename as the source for uploading/downloading, and a single filename as the destination. Alternatively, for uploading, `src` can be a [textConnection] or [rawConnection] object; and for downloading, `dest` can be NULL or a `rawConnection` object. If `dest` is NULL, the downloaded data is returned as a raw vector, and if a raw connection, it will be placed into the connection. See the examples below.
@ -252,12 +252,12 @@ delete_blob_container.blob_endpoint <- function(endpoint, name, confirm=TRUE, le
#' Note that AzCopy only supports SAS and AAD (OAuth) tokens as authentication methods. AzCopy also expects a single filename or wildcard spec as its source/destination argument, not a vector of filenames or a connection.
#'
#' @section Directories:
#'
#' Blob storage does not have true directories, instead using filenames containing a separator character (typically '/') to mimic a directory structure. This has some consequences:
#'
#' - The `isdir` column in the data frame output of `list_blobs` is a best guess as to whether an object represents a file or directory, and may not always be correct.
#' - `create_storage_dir` and `delete_storage_dir` currently do not have methods for blob containers.
#' - The `isdir` column in the data frame output of `list_blobs` is a best guess as to whether an object represents a file or directory, and may not always be correct. Currently, `list_blobs` assumes that any object with a file size of zero is a directory.
#' - Zero-length files can cause problems for the blob storage service as a whole (not just AzureStor). Try to avoid uploading such files.
#' - The output of `list_blobs(recursive=TRUE)` can vary based on whether the storage account has hierarchical namespaces enabled.
#' - `create_storage_dir` and `delete_storage_dir` currently do not have methods for blob containers.
#'
#' @return
#' For `list_blobs`, details on the blobs in the container. For `download_blob`, if `dest=NULL`, the contents of the downloaded blob as a raw vector. For `blob_exists`, a flag indicating whether the blob exists.
@ -320,12 +320,21 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
info <- match.arg(info)
opts <- list(comp="list", restype="container")
# ensure last char is always '/', to get list of blobs in a subdir
if(dir != "/")
{
if(!grepl("/$", dir))
dir <- paste0(dir, "/")
prefix <- dir
}
if(!is_empty(prefix))
opts <- c(opts, prefix=as.character(prefix))
if(!recursive)
opts <- c(opts, delimiter="/")
res <- do_container_op(container, options=opts)
lst <- res$Blobs
while(length(res$NextMarker) > 0)
@ -336,9 +345,21 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
if(info != "name")
{
rows <- lapply(lst, function(blob)
prefixes <- lst[names(lst) == "BlobPrefix"]
blobs <- lst[names(lst) == "Blob"]
prefix_rows <- lapply(prefixes, function(prefix)
{
props <- c(Name=blob$Name, blob$Properties)
data.frame(Type="BlobPrefix",
Name=unlist(prefix$Name),
"Content-Length"=NA,
stringsAsFactors=FALSE,
check.names=FALSE)
})
blob_rows <- lapply(blobs, function(blob)
{
props <- c(Type="Blob", Name=blob$Name, blob$Properties)
props <- data.frame(lapply(props, function(p) if(!is_empty(p)) unlist(p) else NA),
stringsAsFactors=FALSE, check.names=FALSE)
@ -348,7 +369,22 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
props
})
df <- do.call(rbind, rows)
df_prefixes <- do.call(rbind, prefix_rows)
df_blobs <- do.call(rbind, blob_rows)
if(is.null(df_prefixes) & is.null(df_blobs))
return(data.frame())
else if(is.null(df_prefixes))
df <- df_blobs
else if(is.null(df_blobs))
df <- df_prefixes
else
{
missing_cols <- setdiff(colnames(df_blobs), intersect(colnames(df_prefixes), colnames(df_blobs)))
df_prefixes[, missing_cols] <- NA
df <- rbind(df_prefixes, df_blobs)
}
if(length(df) > 0)
{
row.names(df) <- NULL
@ -358,11 +394,11 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
namecol <- which(ndf == "Name")
sizecol <- which(ndf == "Content-Length")
names(df)[c(namecol, sizecol)] <- c("name", "size")
df$size <- as.numeric(df$size)
# assume zero-length entries are directories
df$isdir <- df$size == 0
df$size[df$isdir] <- NA
df$size <- if(!is.null(df$size)) as.numeric(df$size) else NA
df$size[df$size == 0] <- NA
df$isdir <- is.na(df$size)
dircol <- which(names(df) == "isdir")
if(info == "all")
@ -441,7 +477,7 @@ delete_blob <- function(container, blob, confirm=TRUE)
blob_exists <- function(container, blob)
{
res <- do_container_op(container, blob, headers = list(), http_verb = "HEAD", http_status_handler = "pass")
if (httr::status_code(res) == 404L)
if(httr::status_code(res) == 404L)
return(FALSE)
httr::stop_for_status(res, storage_error_message(res))

Просмотреть файл

@ -47,7 +47,7 @@ multicopy_url_to_blob(container, src, dest, lease = NULL, async = FALSE,
\item{prefix}{For \code{list_blobs}, an alternative way to specify the directory.}
\item{recursive}{This argument is for consistency with the methods for the other storage types. It is not used for blob storage.}
\item{recursive}{For the multiupload/download functions, whether to recursively transfer files in subdirectories. For \code{list_blobs}, whether to include the contents of any subdirectories in the listing.}
\item{src, dest}{The source and destination files for uploading and downloading. See 'Details' below.}
@ -97,12 +97,12 @@ Note that AzCopy only supports SAS and AAD (OAuth) token as authentication metho
\section{Directories}{
Blob storage does not have true directories, instead using filenames containing a separator character (typically '/') to mimic a directory structure. This has some consequences:
\itemize{
\item The \code{isdir} column in the data frame output of \code{list_blobs} is a best guess as to whether an object represents a file or directory, and may not always be correct.
\item \code{create_storage_dir} and \code{delete_storage_dir} currently do not have methods for blob containers.
\item The \code{isdir} column in the data frame output of \code{list_blobs} is a best guess as to whether an object represents a file or directory, and may not always be correct. Currently, \code{list_blobs} assumes that any object with a file size of zero is a directory.
\item Zero-length files can cause problems for the blob storage service as a whole (not just AzureStor). Try to avoid uploading such files.
\item The output of \code{list_blobs(recursive=TRUE)} can vary based on whether the storage account has hierarchical namespaces enabled.
\item \code{create_storage_dir} and \code{delete_storage_dir} currently do not have methods for blob containers.
}
}

Просмотреть файл

@ -0,0 +1,76 @@
context("Blob client interface, directories")

# ARM login details are taken from the environment; Sys.getenv yields "" for unset variables
tenant <- Sys.getenv("AZ_TEST_TENANT_ID")
app <- Sys.getenv("AZ_TEST_APP_ID")
password <- Sys.getenv("AZ_TEST_PASSWORD")
subscription <- Sys.getenv("AZ_TEST_SUBSCRIPTION")

# skip the whole file if any of the login details is missing
if(any(c(tenant, app, password, subscription) == ""))
    skip("Authentication tests skipped: ARM credentials not set")

# resource group and storage account to run against
# NOTE(review): AZ_TEST_STORAGE_HNS presumably names an account with hierarchical namespace enabled -- confirm
rgname <- Sys.getenv("AZ_TEST_STORAGE_RG")
storname <- Sys.getenv("AZ_TEST_STORAGE_HNS")

if(any(c(rgname, storname) == ""))
    skip("Blob client tests skipped: resource names not set")

# log in, then drill down: subscription -> resource group -> storage account
login <- AzureRMR::az_rm$new(tenant=tenant, app=app, password=password)
sub <- login$get_subscription(subscription)
stor <- sub$get_resource_group(rgname)$get_storage_account(storname)

# turn off the transfer progress bar for the duration of the tests
options(azure_storage_progress_bar=FALSE)
test_that("Blob recursive file listing works",
{
    endp <- stor$get_blob_endpoint()
    container <- create_blob_container(endp, make_name())

    # build a nested local directory tree, generating one file inside each directory
    make_dir_with_file <- function(d)
    {
        dir.create(d, recursive=TRUE)
        file.path(d, write_file(d))
    }
    local_dirs <- file.path(tempdir(), c("dir1", "dir1/dir2", "dir1/dir2/dir3"))
    local_files <- sapply(local_dirs, make_dir_with_file)

    # upload one file to each level of the blob "directory" tree
    blob_dests <- c("/dir1/file1", "/dir1/dir2/file2", "/dir1/dir2/dir3/file3")
    for(i in seq_along(blob_dests))
        expect_silent(upload_blob(container, local_files[i], blob_dests[i]))

    # listing of the whole container with default arguments
    # this is for hierarchical namespace enabled
    whole <- list_blobs(container)
    expect_is(whole, "data.frame")
    expect_identical(nrow(whole), 6L)

    # non-recursive listing of the container root: only the top-level entry is expected
    root <- list_blobs(container, recursive=FALSE)
    expect_is(root, "data.frame")
    expect_identical(nrow(root), 1L)
    # rows flagged as directories must have NA size, all other rows a non-NA size
    root_isdir <- root$isdir
    expect_true(all(is.na(root$size[root_isdir])))
    expect_false(any(is.na(root$size[!root_isdir])))

    # same listing, names only
    root_names <- list_blobs(container, recursive=FALSE, info="name")
    expect_is(root_names, "character")
    expect_identical(root_names, "dir1/")

    # non-recursive listing of a subdirectory given with a trailing slash
    expected_dir1 <- c("dir1/dir2/", "dir1/file1")
    sub1 <- list_blobs(container, "dir1/", recursive=FALSE)
    expect_identical(nrow(sub1), 2L)
    expect_identical(sub1$name, expected_dir1)
    sub1_isdir <- sub1$isdir
    expect_true(all(is.na(sub1$size[sub1_isdir])))
    expect_false(any(is.na(sub1$size[!sub1_isdir])))

    sub1_names <- list_blobs(container, "dir1/", recursive=FALSE, info="name")
    expect_identical(sub1_names, expected_dir1)

    # recursive listing of the same subdirectory
    sub1_rec <- list_blobs(container, "dir1/", recursive=TRUE)
    expect_identical(nrow(sub1_rec), 5L)

    # omitting the trailing slash must give the same non-recursive result
    sub1_noslash <- list_blobs(container, "dir1", recursive=FALSE)
    expect_identical(nrow(sub1_noslash), 2L)
    expect_identical(sub1_noslash$name, expected_dir1)
})
teardown(
{
    # remove the local directory tree created under tempdir() by the test
    local_root <- file.path(tempdir(), "dir1")
    unlink(local_root, recursive=TRUE)

    # delete every blob container listed on the test account's blob endpoint, without prompting
    endp <- stor$get_blob_endpoint()
    lapply(list_blob_containers(endp), function(cont) delete_blob_container(cont, confirm=FALSE))
})