зеркало из https://github.com/Azure/AzureStor.git
* Adds hierarchy functionality to list_blobs mentioned in issue #55 * documentation * update documentation for list_blobs * Formatting fixes for suggestions. * linting * style * by_hierarchy -> recursive * test flipped * add tests * document * consistent behaviour for recursive arg * simplifying logic Co-authored-by: Hans Van Slooten <HansVanSlooten@twinsbaseball.com> Co-authored-by: Hong Ooi <hongooi@microsoft.com>
This commit is contained in:
Родитель
9136e555f7
Коммит
1a30a9c054
3
NEWS.md
3
NEWS.md
|
@ -1,6 +1,7 @@
|
|||
# AzureStor 3.2.2
|
||||
|
||||
- Fixes to the directory detection logic of `list_blobs`. Note that since blob storage doesn't have true directories, the `isdir` column of the `list_blobs` output should be treated as a best guess. For best results, avoid uploading zero-length files to blob storage, as this can cause problems for the service as a whole (not just AzureStor).
|
||||
- Implement recursive/non-recursive directory listings for `list_blobs`, thanks to @cantpitch. Note that since blob storage doesn't have true directories, there are some warts to be aware of; see the help for `list_blobs` for more details.
|
||||
- Fixes to the directory detection logic of `list_blobs`. Again, since blob storage doesn't have true directories, the `isdir` column of the `list_blobs` output should be treated as a best guess.
|
||||
|
||||
# AzureStor 3.2.1
|
||||
|
||||
|
|
|
@ -235,7 +235,7 @@ delete_blob_container.blob_endpoint <- function(endpoint, name, confirm=TRUE, le
|
|||
#' @param use_azcopy Whether to use the AzCopy utility from Microsoft to do the transfer, rather than doing it in R.
|
||||
#' @param max_concurrent_transfers For `multiupload_blob` and `multidownload_blob`, the maximum number of concurrent file transfers. Each concurrent file transfer requires a separate R process, so limit this if you are low on memory.
|
||||
#' @param prefix For `list_blobs`, an alternative way to specify the directory.
|
||||
#' @param recursive This argument is for consistency with the methods for the other storage types. It is not used for blob storage.
|
||||
#' @param recursive For the multiupload/download functions, whether to recursively transfer files in subdirectories. For `list_blobs`, whether to include the contents of any subdirectories in the listing.
|
||||
#'
|
||||
#' @details
|
||||
#' `upload_blob` and `download_blob` are the workhorse file transfer functions for blobs. They each take as inputs a _single_ filename as the source for uploading/downloading, and a single filename as the destination. Alternatively, for uploading, `src` can be a [textConnection] or [rawConnection] object; and for downloading, `dest` can be NULL or a `rawConnection` object. If `dest` is NULL, the downloaded data is returned as a raw vector; if it is a raw connection, the data is written to that connection. See the examples below.
|
||||
|
@ -252,12 +252,12 @@ delete_blob_container.blob_endpoint <- function(endpoint, name, confirm=TRUE, le
|
|||
#' Note that AzCopy only supports SAS and AAD (OAuth) tokens as authentication methods. AzCopy also expects a single filename or wildcard spec as its source/destination argument, not a vector of filenames or a connection.
|
||||
#'
|
||||
#' @section Directories:
|
||||
#'
|
||||
#' Blob storage does not have true directories, instead using filenames containing a separator character (typically '/') to mimic a directory structure. This has some consequences:
|
||||
#'
|
||||
#' - The `isdir` column in the data frame output of `list_blobs` is a best guess as to whether an object represents a file or directory, and may not always be correct.
|
||||
#' - `create_storage_dir` and `delete_storage_dir` currently do not have methods for blob containers.
|
||||
#' - The `isdir` column in the data frame output of `list_blobs` is a best guess as to whether an object represents a file or directory, and may not always be correct. Currently, `list_blobs` assumes that any object with a file size of zero is a directory.
|
||||
#' - Zero-length files can cause problems for the blob storage service as a whole (not just AzureStor). Try to avoid uploading such files.
|
||||
#' - The output of `list_blobs(recursive=TRUE)` can vary based on whether the storage account has hierarchical namespaces enabled.
|
||||
#' - `create_storage_dir` and `delete_storage_dir` currently do not have methods for blob containers.
|
||||
#'
|
||||
#' @return
|
||||
#' For `list_blobs`, details on the blobs in the container. For `download_blob`, if `dest=NULL`, the contents of the downloaded blob as a raw vector. For `blob_exists`, a flag indicating whether the blob exists.
|
||||
|
@ -320,12 +320,21 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
|
|||
info <- match.arg(info)
|
||||
|
||||
opts <- list(comp="list", restype="container")
|
||||
|
||||
# ensure last char is always '/', to get list of blobs in a subdir
|
||||
if(dir != "/")
|
||||
{
|
||||
if(!grepl("/$", dir))
|
||||
dir <- paste0(dir, "/")
|
||||
prefix <- dir
|
||||
}
|
||||
|
||||
if(!is_empty(prefix))
|
||||
opts <- c(opts, prefix=as.character(prefix))
|
||||
|
||||
if(!recursive)
|
||||
opts <- c(opts, delimiter="/")
|
||||
|
||||
res <- do_container_op(container, options=opts)
|
||||
lst <- res$Blobs
|
||||
while(length(res$NextMarker) > 0)
|
||||
|
@ -336,9 +345,21 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
|
|||
|
||||
if(info != "name")
|
||||
{
|
||||
rows <- lapply(lst, function(blob)
|
||||
prefixes <- lst[names(lst) == "BlobPrefix"]
|
||||
blobs <- lst[names(lst) == "Blob"]
|
||||
|
||||
prefix_rows <- lapply(prefixes, function(prefix)
|
||||
{
|
||||
props <- c(Name=blob$Name, blob$Properties)
|
||||
data.frame(Type="BlobPrefix",
|
||||
Name=unlist(prefix$Name),
|
||||
"Content-Length"=NA,
|
||||
stringsAsFactors=FALSE,
|
||||
check.names=FALSE)
|
||||
})
|
||||
|
||||
blob_rows <- lapply(blobs, function(blob)
|
||||
{
|
||||
props <- c(Type="Blob", Name=blob$Name, blob$Properties)
|
||||
props <- data.frame(lapply(props, function(p) if(!is_empty(p)) unlist(p) else NA),
|
||||
stringsAsFactors=FALSE, check.names=FALSE)
|
||||
|
||||
|
@ -348,7 +369,22 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
|
|||
props
|
||||
})
|
||||
|
||||
df <- do.call(rbind, rows)
|
||||
df_prefixes <- do.call(rbind, prefix_rows)
|
||||
df_blobs <- do.call(rbind, blob_rows)
|
||||
|
||||
if(is.null(df_prefixes) & is.null(df_blobs))
|
||||
return(data.frame())
|
||||
else if(is.null(df_prefixes))
|
||||
df <- df_blobs
|
||||
else if(is.null(df_blobs))
|
||||
df <- df_prefixes
|
||||
else
|
||||
{
|
||||
missing_cols <- setdiff(colnames(df_blobs), intersect(colnames(df_prefixes), colnames(df_blobs)))
|
||||
df_prefixes[, missing_cols] <- NA
|
||||
df <- rbind(df_prefixes, df_blobs)
|
||||
}
|
||||
|
||||
if(length(df) > 0)
|
||||
{
|
||||
row.names(df) <- NULL
|
||||
|
@ -358,11 +394,11 @@ list_blobs <- function(container, dir="/", info=c("partial", "name", "all"),
|
|||
namecol <- which(ndf == "Name")
|
||||
sizecol <- which(ndf == "Content-Length")
|
||||
names(df)[c(namecol, sizecol)] <- c("name", "size")
|
||||
df$size <- as.numeric(df$size)
|
||||
|
||||
# assume zero-length entries are directories
|
||||
df$isdir <- df$size == 0
|
||||
df$size[df$isdir] <- NA
|
||||
df$size <- if(!is.null(df$size)) as.numeric(df$size) else NA
|
||||
df$size[df$size == 0] <- NA
|
||||
df$isdir <- is.na(df$size)
|
||||
|
||||
dircol <- which(names(df) == "isdir")
|
||||
|
||||
if(info == "all")
|
||||
|
@ -441,7 +477,7 @@ delete_blob <- function(container, blob, confirm=TRUE)
|
|||
blob_exists <- function(container, blob)
|
||||
{
|
||||
res <- do_container_op(container, blob, headers = list(), http_verb = "HEAD", http_status_handler = "pass")
|
||||
if (httr::status_code(res) == 404L)
|
||||
if(httr::status_code(res) == 404L)
|
||||
return(FALSE)
|
||||
|
||||
httr::stop_for_status(res, storage_error_message(res))
|
||||
|
|
|
@ -47,7 +47,7 @@ multicopy_url_to_blob(container, src, dest, lease = NULL, async = FALSE,
|
|||
|
||||
\item{prefix}{For \code{list_blobs}, an alternative way to specify the directory.}
|
||||
|
||||
\item{recursive}{This argument is for consistency with the methods for the other storage types. It is not used for blob storage.}
|
||||
\item{recursive}{For the multiupload/download functions, whether to recursively transfer files in subdirectories. For \code{list_blobs}, whether to include the contents of any subdirectories in the listing.}
|
||||
|
||||
\item{src, dest}{The source and destination files for uploading and downloading. See 'Details' below.}
|
||||
|
||||
|
@ -97,12 +97,12 @@ Note that AzCopy only supports SAS and AAD (OAuth) token as authentication metho
|
|||
|
||||
\section{Directories}{
|
||||
|
||||
|
||||
Blob storage does not have true directories, instead using filenames containing a separator character (typically '/') to mimic a directory structure. This has some consequences:
|
||||
\itemize{
|
||||
\item The \code{isdir} column in the data frame output of \code{list_blobs} is a best guess as to whether an object represents a file or directory, and may not always be correct.
|
||||
\item \code{create_storage_dir} and \code{delete_storage_dir} currently do not have methods for blob containers.
|
||||
\item The \code{isdir} column in the data frame output of \code{list_blobs} is a best guess as to whether an object represents a file or directory, and may not always be correct. Currently, \code{list_blobs} assumes that any object with a file size of zero is a directory.
|
||||
\item Zero-length files can cause problems for the blob storage service as a whole (not just AzureStor). Try to avoid uploading such files.
|
||||
\item The output of \code{list_blobs(recursive=TRUE)} can vary based on whether the storage account has hierarchical namespaces enabled.
|
||||
\item \code{create_storage_dir} and \code{delete_storage_dir} currently do not have methods for blob containers.
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
context("Blob client interface, directories")
|
||||
|
||||
tenant <- Sys.getenv("AZ_TEST_TENANT_ID")
|
||||
app <- Sys.getenv("AZ_TEST_APP_ID")
|
||||
password <- Sys.getenv("AZ_TEST_PASSWORD")
|
||||
subscription <- Sys.getenv("AZ_TEST_SUBSCRIPTION")
|
||||
|
||||
if(tenant == "" || app == "" || password == "" || subscription == "")
|
||||
skip("Authentication tests skipped: ARM credentials not set")
|
||||
|
||||
rgname <- Sys.getenv("AZ_TEST_STORAGE_RG")
|
||||
storname <- Sys.getenv("AZ_TEST_STORAGE_HNS")
|
||||
|
||||
if(rgname == "" || storname == "")
|
||||
skip("Blob client tests skipped: resource names not set")
|
||||
|
||||
sub <- AzureRMR::az_rm$new(tenant=tenant, app=app, password=password)$get_subscription(subscription)
|
||||
stor <- sub$get_resource_group(rgname)$get_storage_account(storname)
|
||||
options(azure_storage_progress_bar=FALSE)
|
||||
|
||||
|
||||
test_that("Blob recursive file listing works",
|
||||
{
|
||||
bl <- stor$get_blob_endpoint()
|
||||
cont <- create_blob_container(bl, make_name())
|
||||
|
||||
dirs <- file.path(tempdir(), c("dir1", "dir1/dir2", "dir1/dir2/dir3"))
|
||||
files <- sapply(dirs, function(d)
|
||||
{
|
||||
dir.create(d, recursive=TRUE)
|
||||
file.path(d, write_file(d))
|
||||
})
|
||||
expect_silent(upload_blob(cont, files[1], "/dir1/file1"))
|
||||
expect_silent(upload_blob(cont, files[2], "/dir1/dir2/file2"))
|
||||
expect_silent(upload_blob(cont, files[3], "/dir1/dir2/dir3/file3"))
|
||||
|
||||
# this is for hierarchical namespace enabled
|
||||
l <- list_blobs(cont)
|
||||
expect_is(l, "data.frame")
|
||||
expect_identical(nrow(l), 6L)
|
||||
|
||||
l0 <- list_blobs(cont, recursive=FALSE)
|
||||
expect_is(l0, "data.frame")
|
||||
expect_identical(nrow(l0), 1L)
|
||||
expect_true(all(is.na(l0$size[l0$isdir])))
|
||||
expect_false(any(is.na(l0$size[!l0$isdir])))
|
||||
|
||||
l0n <- list_blobs(cont, recursive=FALSE, info="name")
|
||||
expect_is(l0n, "character")
|
||||
expect_identical(l0n, "dir1/")
|
||||
|
||||
l1 <- list_blobs(cont, "dir1/", recursive=FALSE)
|
||||
expect_identical(nrow(l1), 2L)
|
||||
expect_identical(l1$name, c("dir1/dir2/", "dir1/file1"))
|
||||
expect_true(all(is.na(l1$size[l1$isdir])))
|
||||
expect_false(any(is.na(l1$size[!l1$isdir])))
|
||||
|
||||
l1n <- list_blobs(cont, "dir1/", recursive=FALSE, info="name")
|
||||
expect_identical(l1n, c("dir1/dir2/", "dir1/file1"))
|
||||
|
||||
l1rec <- list_blobs(cont, "dir1/", recursive=TRUE)
|
||||
expect_identical(nrow(l1rec), 5L)
|
||||
|
||||
l1noslash <- list_blobs(cont, "dir1", recursive=FALSE)
|
||||
expect_identical(nrow(l1noslash), 2L)
|
||||
expect_identical(l1noslash$name, c("dir1/dir2/", "dir1/file1"))
|
||||
})
|
||||
|
||||
|
||||
teardown(
|
||||
{
|
||||
unlink(file.path(tempdir(), "dir1"), recursive=TRUE)
|
||||
bl <- stor$get_blob_endpoint()
|
||||
conts <- list_blob_containers(bl)
|
||||
lapply(conts, delete_blob_container, confirm=FALSE)
|
||||
})
|
Загрузка…
Ссылка в новой задаче