Fix: Upgrading R Batch SDK to 2018-12-01.8.0 (#354)

* Added resource files

* Added resource files

* Removed comments

* Fixed resource files documentation

* Added check on job state

* Fixed jobState
Brian Hoang 2019-06-18 21:04:30 -07:00, committed by GitHub
Parent: 93f3fbc6fd
Commit: 96bfc22662
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
7 changed files: 57 additions and 51 deletions

View file

@@ -19,7 +19,7 @@ BatchUtilities <- R6::R6Class(
       accountName <- storageClient$authentication$name
-      resourceFiles <- NULL
+      resourceFiles <- args$resourceFiles
       if (!is.null(argsList)) {
         envFile <- paste0(taskId, ".rds")
         saveRDS(argsList, file = envFile)
@@ -37,8 +37,18 @@ BatchUtilities <- R6::R6Class(
           envFile,
           readToken,
           config$endpointSuffix)
-      resourceFiles <-
-        list(rAzureBatch::createResourceFile(url = envFileUrl, fileName = envFile))
+      environmentResourceFile <-
+        rAzureBatch::createResourceFile(filePath = envFile, httpUrl = envFileUrl)
+
+      if (is.null(resourceFiles))
+      {
+        resourceFiles <-
+          list(environmentResourceFile)
+      }
+      else {
+        resourceFiles <- append(resourceFiles, environmentResourceFile)
+      }
+
     }

     # Only use the download command if cloudCombine is enabled
@@ -52,17 +62,6 @@ BatchUtilities <- R6::R6Class(
     if (!is.null(cloudCombine)) {
       assign("cloudCombine", cloudCombine, .doAzureBatchGlobals)
-      containerSettings$imageName <- "brianlovedocker/doazureparallel-merge-dockerfile:0.12.1"
-
-      copyCommand <- sprintf(
-        "%s %s %s --download --saskey $BLOBXFER_SASKEY --remoteresource . --include results/*.rds --endpoint %s",
-        accountName,
-        jobId,
-        "$AZ_BATCH_TASK_WORKING_DIR",
-        config$endpointSuffix
-      )
-
-      commands <- c(paste("blobxfer", copyCommand))
     }

     exitConditions <- NULL
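
Note: the change above is the core of the SDK upgrade. createResourceFile() now takes filePath/httpUrl instead of fileName/url, and caller-supplied resource files are merged with the generated environment file instead of being overwritten. A minimal standalone sketch of the new pattern (file name and URL are placeholders, not values from this repo):

    # Placeholder inputs for illustration only.
    envFile <- "task-args.rds"
    envFileUrl <- "https://<accountname>.blob.core.windows.net/<container>/task-args.rds"

    # Old signature (pre-upgrade): createResourceFile(url = ..., fileName = ...)
    environmentResourceFile <-
      rAzureBatch::createResourceFile(filePath = envFile, httpUrl = envFileUrl)

    # Merge with any caller-supplied files; wrapping in list() keeps the
    # resource file as a single element when appending.
    callerFiles <- list()  # stands in for args$resourceFiles
    if (length(callerFiles) == 0) {
      resourceFiles <- list(environmentResourceFile)
    } else {
      resourceFiles <- append(callerFiles, list(environmentResourceFile))
    }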

View file

@@ -123,7 +123,7 @@ makeCluster <-
     # install docker
     containerConfiguration <- list(
-      type = "docker"
+      type = "dockerCompatible"
     )

     dockerImage <- "rocker/tidyverse:latest"
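
Note: "dockerCompatible" is the containerType value the 2018-12-01.8.0 Batch API expects in place of the old "docker". A sketch of the pool's container configuration under the new API; the containerImageNames prefetch field is an assumption drawn from the Batch REST containerConfiguration schema and is not part of this diff:

    # Pool container configuration under Batch API 2018-12-01.8.0.
    containerConfiguration <- list(
      type = "dockerCompatible",
      # Assumed optional field (Batch REST schema): images to prefetch on each node.
      containerImageNames = list("rocker/tidyverse:latest")
    )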

View file

@@ -474,12 +474,12 @@ setHttpTraffic <- function(value = FALSE) {
       storageEndpointSuffix = config$endpointSuffix)

   requiredJobResourceFiles <- list(
-    rAzureBatch::createResourceFile(url = workerScriptUrl, fileName = "worker.R"),
-    rAzureBatch::createResourceFile(url = mergerScriptUrl, fileName = "merger.R"),
-    rAzureBatch::createResourceFile(url = installGithubScriptUrl, fileName = "install_github.R"),
-    rAzureBatch::createResourceFile(url = installCranScriptUrl, fileName = "install_cran.R"),
-    rAzureBatch::createResourceFile(url = installBioConductorScriptUrl, fileName = "install_bioconductor.R"),
-    rAzureBatch::createResourceFile(url = jobCommonFileUrl, fileName = jobFileName)
+    rAzureBatch::createResourceFile(filePath = "worker.R", httpUrl = workerScriptUrl),
+    rAzureBatch::createResourceFile(filePath = "merger.R", httpUrl = mergerScriptUrl),
+    rAzureBatch::createResourceFile(filePath = "install_github.R", httpUrl = installGithubScriptUrl),
+    rAzureBatch::createResourceFile(filePath = "install_cran.R", httpUrl = installCranScriptUrl),
+    rAzureBatch::createResourceFile(filePath = "install_bioconductor.R", httpUrl = installBioConductorScriptUrl),
+    rAzureBatch::createResourceFile(filePath = jobFileName, httpUrl = jobCommonFileUrl)
   )

   resourceFiles <-
@@ -669,6 +669,21 @@ setHttpTraffic <- function(value = FALSE) {
         )
       )

+      mergeReadSasToken <- storageClient$generateSasToken("rl", "c", id)
+      mergeResourceFileUrl <-
+        rAzureBatch::createBlobUrl(
+          storageAccount = storageClient$authentication$name,
+          containerName = id,
+          sasToken = mergeReadSasToken,
+          storageEndpointSuffix = config$endpointSuffix
+        )
+
+      mergeResources <-
+        list(
+          rAzureBatch::createResourceFile(
+            storageContainerUrl = mergeResourceFileUrl,
+            blobPrefix = "results"))
+
       BatchUtilitiesOperations$addTask(
         jobId = id,
         taskId = "merge",
@@ -684,7 +699,8 @@ setHttpTraffic <- function(value = FALSE) {
         dependsOn = taskDependencies,
         cloudCombine = cloudCombine,
         outputFiles = append(obj$options$azure$outputFiles, mergeOutput),
-        containerImage = data$containerImage
+        containerImage = data$containerImage,
+        resourceFiles = mergeResources
       )

       cat(". . .")
@@ -726,7 +742,7 @@ setHttpTraffic <- function(value = FALSE) {
     }

     if (!identical(function(a, ...) c(a, list(...)),
-                   obj$combineInfo$fun, ignore.environment = TRUE)){
+                   obj$combineInfo$fun, ignore.environment = TRUE)) {
       tryCatch({
         accumulator <- foreach::makeAccum(it)
         accumulator(results, as.numeric(names(results)))
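
Note: the merge-task changes above use a capability new to this API version: a resource file may point at a whole storage container and filter blobs by prefix, which is what allows the hard-coded merge Docker image and the blobxfer copy command to be deleted. A standalone sketch, reusing the storageClient object from the diff above; account and job names are placeholders:

    # "rl" = read + list SAS on the job's container (as in the diff above).
    containerSas <- storageClient$generateSasToken("rl", "c", "job-example")

    containerUrl <- rAzureBatch::createBlobUrl(
      storageAccount = "<accountname>",   # placeholder
      containerName = "job-example",      # placeholder job id
      sasToken = containerSas,
      storageEndpointSuffix = "core.windows.net"
    )

    # One resource file that downloads every blob under results/ into the
    # merge task's working directory -- no blobxfer command needed.
    mergeResources <- list(
      rAzureBatch::createResourceFile(
        storageContainerUrl = containerUrl,
        blobPrefix = "results"
      )
    )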

View file

@@ -472,19 +472,14 @@ waitForTasksToComplete <-
       flush.console()

-      validationFlag <-
-        (taskCounts$validationStatus == "Validated" &&
-           totalTasks <= 200000) ||
-        totalTasks > 200000
-
       if (taskCounts$failed > 0 &&
-          errorHandling == "stop" &&
-          validationFlag) {
+          errorHandling == "stop") {
         cat("\n")

         select <- "id, executionInfo"
+        filter <- "executionInfo/result eq 'failure'"

         failedTasks <-
-          batchClient$taskOperations$list(jobId, select = select)
+          batchClient$taskOperations$list(jobId, select = select, filter = filter)

         tasksFailureWarningLabel <-
           sprintf(
@@ -498,14 +493,9 @@ waitForTasksToComplete <-
           )

         for (i in 1:length(failedTasks$value)) {
-          if (!is.null(failedTasks$value[[i]]$executionInfo$result) &&
-              grepl(failedTasks$value[[i]]$executionInfo$result,
-                    "failure",
-                    ignore.case = TRUE)) {
-            tasksFailureWarningLabel <-
-              paste0(tasksFailureWarningLabel,
-                     sprintf("%s\n", failedTasks$value[[i]]$id))
-          }
+          tasksFailureWarningLabel <-
+            paste0(tasksFailureWarningLabel,
+                   sprintf("%s\n", failedTasks$value[[i]]$id))
         }

         warning(sprintf(tasksFailureWarningLabel,
@@ -533,9 +523,10 @@ waitForTasksToComplete <-
                 jobId)
       }

-      if (taskCounts$completed >= totalTasks &&
-          (taskCounts$validationStatus == "Validated" ||
-             totalTasks >= 200000)) {
+      jobInfo <- getJob(jobId, verbose = FALSE)
+      if (taskCounts$completed >= totalTasks ||
+          jobInfo$jobState == "completed" ||
+          jobInfo$jobState == "terminating") {
         cat("\n")
         break
       }
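
Note: two behavior changes land in this file. Failed tasks are now filtered server-side with an OData filter instead of a client-side grepl(), and the exit check also consults the job state, so a job terminated outside the loop ends the wait; the old 200,000-task validationStatus special-casing is gone. A condensed sketch of the new loop shape, assuming jobId, totalTasks, taskCounts, batchClient, and getJob from the surrounding package code:

    repeat {
      # (the real function refreshes taskCounts from the service each iteration)

      # Server-side filter: only tasks whose executionInfo result is 'failure'.
      select <- "id, executionInfo"
      filter <- "executionInfo/result eq 'failure'"
      failedTasks <-
        batchClient$taskOperations$list(jobId, select = select, filter = filter)

      # Stop when every task has completed OR the job itself has finished
      # (e.g. terminated by the user or by job auto-termination).
      jobInfo <- getJob(jobId, verbose = FALSE)
      if (taskCounts$completed >= totalTasks ||
          jobInfo$jobState %in% c("completed", "terminating")) {
        break
      }
      Sys.sleep(10)  # polling interval is an assumption, not the package's exact pacing
    }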

View file

@@ -39,12 +39,12 @@ Here's an example that uses data stored in a public location on Azure Blob Storage
 # define where to download data from
 resource_files = list(
   rAzureBatch::createResourceFile(
-    url = "https://<accountname>.blob.core.windows.net/<container>/2010.csv",
-    fileName = "2010.csv"
+    httpUrl = "https://<accountname>.blob.core.windows.net/<container>/2010.csv",
+    filePath = "2010.csv"
   ),
   rAzureBatch::createResourceFile(
-    url = "https://<accountname>.blob.core.windows.net/<container>/2011.csv",
-    fileName = "2011.csv"
+    httpUrl = "https://<accountname>.blob.core.windows.net/<container>/2011.csv",
+    filePath = "2011.csv"
   )
 )
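
Note for anyone updating older scripts against this doc: the rename is mechanical and the behavior is unchanged. A before/after sketch using the same placeholder URL:

    fileUrl <- "https://<accountname>.blob.core.windows.net/<container>/2010.csv"

    # Before the upgrade:
    #   rAzureBatch::createResourceFile(url = fileUrl, fileName = "2010.csv")
    # After the upgrade:
    rAzureBatch::createResourceFile(httpUrl = fileUrl, filePath = "2010.csv")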

View file

@@ -34,12 +34,12 @@ doAzureParallel::setCredentials("credentials.json")
 # Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
 azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset"
 resource_files <- list(
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), fileName = "yellow_tripdata_2016-1.csv"),
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), fileName = "yellow_tripdata_2016-2.csv"),
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), fileName = "yellow_tripdata_2016-3.csv"),
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), fileName = "yellow_tripdata_2016-4.csv"),
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), fileName = "yellow_tripdata_2016-5.csv"),
-  rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), fileName = "yellow_tripdata_2016-6.csv")
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"),
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"),
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"),
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"),
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"),
+  rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv")
 )

 # add the parameter 'resourceFiles' to download files to nodes
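
Note: to show how these files are consumed (not part of this diff), a sketch of a loop body reading them on the nodes. The AZ_BATCH_NODE_STARTUP_DIR/wd path is the conventional download location for cluster-level resource files; treat it as an assumption here:

    library(doAzureParallel)  # attaches foreach

    # Assumes a cluster created with resourceFiles = resource_files and registered.
    rowCounts <- foreach(i = 1:6, .combine = c) %dopar% {
      # Cluster resource files land in the start-up task's working directory.
      dataDir <- file.path(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "wd")
      taxi <- read.csv(file.path(dataDir, sprintf("yellow_tripdata_2016-%d.csv", i)))
      nrow(taxi)
    }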

View file

@@ -56,8 +56,8 @@ csvFileUrl2 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName,
 # Create a list of files to download to the cluster using read-only permissions
 # Place the files in a directory called 'data'
 resource_files = list(
-  rAzureBatch::createResourceFile(url = csvFileUrl1, fileName = "data/1989.csv"),
-  rAzureBatch::createResourceFile(url = csvFileUrl2, fileName = "data/1990.csv")
+  rAzureBatch::createResourceFile(httpUrl = csvFileUrl1, filePath = "data/1989.csv"),
+  rAzureBatch::createResourceFile(httpUrl = csvFileUrl2, filePath = "data/1990.csv")
 )

 # Create the cluster
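
Note: the step that follows this snippet passes the list into makeCluster so every node downloads data/1989.csv and data/1990.csv. A sketch; the resourceFiles parameter is referenced in the docs above, and "cluster.json" is the package's usual cluster config file:

    cluster <- doAzureParallel::makeCluster("cluster.json",
                                            resourceFiles = resource_files)
    doAzureParallel::registerDoAzureParallel(cluster)
    # ... run foreach(...) %dopar% jobs that read from the data/ directory ...
    doAzureParallel::stopCluster(cluster)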