Parallel (#7)

closes #4
2019-06-16 06:57:06 +10:00 · 2019-06-16 06:57:06 +10:00 · 53065c6aa8
--- a/3
+++ b/3
@ -15,7 +15,8 @@ Depends:
 Imports: 
    R6,
    AzureRMR (>= 2.1.2),
-    jsonlite
+    jsonlite,
+    parallel
 Suggests:
    knitr,
    testthat,
--- a/2
+++ b/2
@ -15,7 +15,9 @@ export(build_template_parameters)
 export(datadisk_config)
 export(debian_9_backports)
 export(debian_9_backports_ss)
+export(delete_pool)
 export(image_config)
+export(init_pool)
 export(ip_config)
 export(is_vm)
 export(is_vm_resource)
--- a/NEWS.md
+++ b/NEWS.md
@ -8,6 +8,7 @@
  * Clear distinction between a VM deployment template and a resource. `get_vm` and `get_vm_scaleset` will always attempt to retrieve the template; to get the resource, use `get_vm_resource` and `get_vm_scaleset_resource`.
  * New VM resource methods: `get_public_ip_address`, `get_private_ip_address`.
  * New cluster/scaleset resource methods: `get_public_ip_address` (technically the address for the load balancer, if present), `get_vm_public_ip_addresses`, `get_vm_private_ip_addresses`, `list_instances`, `get_instance`.
+  * Use a pool of background processes to talk to scalesets in parallel when carrying out instance operations. The pool size can be controlled with the global options `azure_vm_minpoolsize` and `azure_vm_maxpoolsize`.
  * See the README and/or the vignette for more information.

 # AzureVM 1.0.1
--- a/R/AzureVM.R
+++ b/R/AzureVM.R
@ -7,5 +7,23 @@ AzureRMR::build_template_definition
 #' @export
 AzureRMR::build_template_parameters

-globalVariables("self", "AzureVM")
+globalVariables(c("self", "pool"), "AzureVM")

+.AzureVM <- new.env()
+
+
+# adding methods to classes in external package must go in .onLoad; default pool size options are also set here
+.onLoad <- function(libname, pkgname)
+{
+    add_sub_methods()
+    add_rg_methods()
+    add_defunct_methods()
+    if(is.null(getOption("azure_vm_minpoolsize"))) options(azure_vm_minpoolsize=2)  # default only: keep a value the user already set
+    if(is.null(getOption("azure_vm_maxpoolsize"))) options(azure_vm_maxpoolsize=10)  # default only: keep a value the user already set
+}
+
+.onUnload <- function(libpath)  # unload hook: shut down the background pool if one was created
+{
+    if(exists("pool", envir=.AzureVM))
+        try(parallel::stopCluster(.AzureVM$pool), silent=TRUE)  # best effort: a failure to stop the cluster is not reported
+}
--- a/R/add_methods.R
+++ b/R/add_methods.R
@ -306,15 +306,6 @@ NULL
 NULL


-# adding methods to classes in external package must go in .onLoad
-.onLoad <- function(libname, pkgname)
-{
-    add_sub_methods()
-    add_rg_methods()
-    add_defunct_methods()
-}
-
-
 # extend subscription methods
 add_sub_methods <- function()
 {
--- a/R/az_vmss_resource.R
+++ b/R/az_vmss_resource.R
@ -23,8 +23,6 @@
 #' - `do_vmss_operation(...)` Carry out an arbitrary operation on the scaleset resource (as opposed to the instances in the scaleset).
 #'
 #' @details
-#' With the exception of `get_public_ip_address`, the scaleset operations listed above are actually provided by the [az_vmss_resource] class, and propagated to the template as active bindings.
-#'
 #' A single virtual machine scaleset in Azure is actually a collection of resources, including any and all of the following.
 #' - Network security group (Azure resource type `Microsoft.Network/networkSecurityGroups`)
 #' - Virtual network (Azure resource type `Microsoft.Network/virtualNetworks`)
@ -35,8 +33,15 @@
 #'
 #' By wrapping the deployment template used to create these resources, the `az_vmss_template` class allows managing them all as a single entity.
 #'
+#' @section Instance operations:
+#' With the exception of `get_public_ip_address`, the scaleset methods listed above are actually provided by the [az_vmss_resource] class, and propagated to the template as active bindings.
+#'
+#' AzureVM has the ability to parallelise scaleset instance operations using a pool of background processes. This can lead to significant speedups when working with scalesets with high instance counts. The pool is created automatically the first time that it is required, and remains persistent for the session. For more information, see [init_pool].
+#'
+#' The `id` argument lets you specify a subset of instances on which to carry out an operation. This can be a character vector of instance IDs; a list of instance objects such as returned by `list_instances`; or a single instance object. The default (NULL) is to carry out the operation on all instances.
+#'
 #' @seealso
-#' [AzureRMR::az_resource], [get_vm_scaleset_resource], [az_vmss_template]
+#' [AzureRMR::az_resource], [get_vm_scaleset_resource], [az_vmss_template], [init_pool]
 #'
 #' [VM scaleset API reference](https://docs.microsoft.com/en-us/rest/api/compute/virtualmachinescalesets)
 #' @format An R6 object of class `az_vmss_resource`, inheriting from `AzureRMR::az_resource`.
@ -243,9 +248,18 @@ private=list(

    vm_map=function(id, f)
    {
-        vms <- self$list_instances()
-        if(!is.null(id))
-            vms <- vms[as.character(id)]
-        lapply(vms, f)
+        vms <- if(is.null(id))  # NULL id: operate on every instance
+            self$list_instances()
+        else if(is.list(id) && all(sapply(id, is_vm_resource)))  # list of instance objects: use it as given
+            id
+        else if(is_vm_resource(id))  # single instance object: wrap in a list named by the basename of its id field
+            structure(list(id), names=basename(id$id))
+        else self$list_instances()[as.character(id)]  # anything else: coerce to character and select instances by name
+
+        if(length(vms) < 2 || getOption("azure_vm_maxpoolsize") == 0)  # run serially for fewer than 2 instances, or when the max pool size is 0
+            return(lapply(vms, f))
+
+        init_pool(length(vms))  # make sure a pool exists before dispatching
+        parallel::parLapply(.AzureVM$pool, vms, f)
    }
 ))
--- a/R/pool.R
+++ b/R/pool.R
@ -0,0 +1,60 @@
+#' Parallelise operations on VM scaleset instances
+#'
+#' @param connections The number of concurrent connections to support, which translates into the number of background R processes to create. Each connection requires a separate R process, so limit this if you are low on memory.
+#' @param restart For `init_pool`, whether to terminate an already running pool first.
+#' @param ... Other arguments passed on to `parallel::makeCluster`.
+#'
+#' @details
+#' AzureVM can parallelise operations on scaleset instances by utilizing a pool of R processes in the background. This can lead to significant speedups when working with scalesets with high instance counts. The pool is created automatically the first time that it is required, or it can be (re)created by calling `init_pool` manually. It remains persistent for the session or until terminated by `delete_pool`.
+#'
+#' If `init_pool` is called and the current pool is smaller than `connections`, it is resized. The size of the pool can be controlled by the global options `azure_vm_minpoolsize` and `azure_vm_maxpoolsize`, which have default values of 2 and 10 respectively. To disable parallel operations, set `options(azure_vm_maxpoolsize=0)`.
+#'
+#' Note that the pool size is unrelated to the _scaleset_ size; it only controls how many instances can communicate simultaneously with AzureVM.
+#'
+#' @seealso
+#' [az_vmss_template], [parallel::makeCluster]
+#' @rdname pool
+#' @aliases azure_vm_minpoolsize azure_vm_maxpoolsize
+#' @export
+init_pool <- function(connections, restart=FALSE, ...)
+{
+    if(restart)
+        delete_pool()
+
+    minsize <- getOption("azure_vm_minpoolsize")
+    maxsize <- getOption("azure_vm_maxpoolsize")
+    size <- min(max(connections, minsize), maxsize)  # raise to minsize, then cap at maxsize (the cap wins)
+    if(size < 1)
+        stop("Invalid pool size ", size, call.=FALSE)
+
+    if(!exists("pool", envir=.AzureVM) || length(.AzureVM$pool) < size)
+    {
+        delete_pool()
+        message("Creating background pool")
+        .AzureVM$pool <- parallel::makeCluster(size, ...)  # forward extra arguments, as documented for `...`
+    }
+    else
+    {
+        # restore original state, set working directory to master working directory
+        parallel::clusterCall(.AzureVM$pool, function(wd)
+        {
+            setwd(wd)
+            rm(list=ls(envir=.GlobalEnv, all.names=TRUE), envir=.GlobalEnv)  # list the global env explicitly; a bare ls() would list this function's own frame
+        }, wd=getwd())
+    }
+
+    invisible(NULL)
+}
+
+
+#' @rdname pool
+#' @export
+delete_pool <- function()
+{
+    if(!exists("pool", envir=.AzureVM))
+        return(invisible(NULL))  # nothing to delete; stay invisible, like the rm() result on the normal path
+
+    message("Deleting background pool")
+    parallel::stopCluster(.AzureVM$pool)
+    rm(pool, envir=.AzureVM)
+}
--- a/README.md
+++ b/README.md
@ -138,12 +138,21 @@ sub$create_vm_scaleset("mydsvmss", user_config("myname", "~/.ssh/id_rsa.pub"), i

 # Large Debian scaleset (multiple placement groups), using low-priority VMs
 # need to set the instance size to something that supports low-pri
-sub$create_vm_scaleset("mydebss", user_config("myname", "~/.ssh/id_rsa.pub"), instances=10,
+sub$create_vm_scaleset("mylargess", user_config("myname", "~/.ssh/id_rsa.pub"), instances=10,
                       size="Standard_DS3_v2", config="debian_9_backports_ss",
                       options=scaleset_options(low_priority=TRUE, large_scaleset=TRUE),
                       location="australiaeast")
 ```

+Working with scaleset instances can be tedious if you have a large scaleset, since R can only connect to one instance at a time. To solve this problem, AzureVM creates a pool of background processes that connect in parallel with the scaleset, leading to significant speedups. The pool is created automatically the first time it is needed, and is deleted at the end of the session.
+
+```r
+# this will create a pool of up to 10 processes that talk to the scaleset
+mylargess$run_script("echo hello world! > /tmp/hello.txt")
+```
+
+You can control the size of the pool with the global `azure_vm_minpoolsize` and `azure_vm_maxpoolsize` options, which have default values 2 and 10 respectively. To turn off parallel connections, set `options(azure_vm_maxpoolsize=0)`. Note that the pool size is unrelated to the _scaleset_ size; it only controls how many instances can communicate with AzureVM simultaneously.
+
 ## Sharing resources

 You can also include an existing Azure resource in a deployment, by supplying an AzureRMR `az_resource` object as an argument in the `create_vm` or `create_vm_scaleset` call. For example, here we create a VM and a scaleset that share a single virtual network/subnet.
--- a/man/az_vmss_resource.Rd
+++ b/man/az_vmss_resource.Rd
@ -12,8 +12,6 @@ az_vmss_resource
 Class representing a virtual machine scaleset resource. In general, the methods in this class should not be called directly, nor should objects be directly instantiated from it. Use the \code{az_vmss_template} class for interacting with scalesets instead.
 }
 \details{
-With the exception of \code{get_public_ip_address}, the scaleset operations listed above are actually provided by the \link{az_vmss_resource} class, and propagated to the template as active bindings.
-
 A single virtual machine scaleset in Azure is actually a collection of resources, including any and all of the following.
 \itemize{
 \item Network security group (Azure resource type \code{Microsoft.Network/networkSecurityGroups})
@ -49,8 +47,17 @@ The following methods are available, in addition to those provided by the \link[
 }
 }

+\section{Instance operations}{
+
+With the exception of \code{get_public_ip_address}, the scaleset methods listed above are actually provided by the \link{az_vmss_resource} class, and propagated to the template as active bindings.
+
+AzureVM has the ability to parallelise scaleset instance operations using a pool of background processes. This can lead to significant speedups when working with scalesets with high instance counts. The pool is created automatically the first time that it is required, and remains persistent for the session. For more information, see \link{init_pool}.
+
+The \code{id} argument lets you specify a subset of instances on which to carry out an operation. This can be a character vector of instance IDs; a list of instance objects such as returned by \code{list_instances}; or a single instance object. The default (NULL) is to carry out the operation on all instances.
+}
+
 \seealso{
-\link[AzureRMR:az_resource]{AzureRMR::az_resource}, \link{get_vm_scaleset_resource}, \link{az_vmss_template}
+\link[AzureRMR:az_resource]{AzureRMR::az_resource}, \link{get_vm_scaleset_resource}, \link{az_vmss_template}, \link{init_pool}

 \href{https://docs.microsoft.com/en-us/rest/api/compute/virtualmachinescalesets}{VM scaleset API reference}
 }
--- a/man/pool.Rd
+++ b/man/pool.Rd
@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pool.R
+\name{init_pool}
+\alias{init_pool}
+\alias{azure_vm_minpoolsize}
+\alias{azure_vm_maxpoolsize}
+\alias{delete_pool}
+\title{Parallelise operations on VM scaleset instances}
+\usage{
+init_pool(connections, restart = FALSE, ...)
+
+delete_pool()
+}
+\arguments{
+\item{connections}{The number of concurrent connections to support, which translates into the number of background R processes to create. Each connection requires a separate R process, so limit this if you are low on memory.}
+
+\item{restart}{For \code{init_pool}, whether to terminate an already running pool first.}
+
+\item{...}{Other arguments passed on to \code{parallel::makeCluster}.}
+}
+\description{
+Parallelise operations on VM scaleset instances
+}
+\details{
+AzureVM can parallelise operations on scaleset instances by utilizing a pool of R processes in the background. This can lead to significant speedups when working with scalesets with high instance counts. The pool is created automatically the first time that it is required, or it can be (re)created by calling \code{init_pool} manually. It remains persistent for the session or until terminated by \code{delete_pool}.
+
+If \code{init_pool} is called and the current pool is smaller than \code{connections}, it is resized. The size of the pool can be controlled by the global options \code{azure_vm_minpoolsize} and \code{azure_vm_maxpoolsize}, which have default values of 2 and 10 respectively. To disable parallel operations, set \code{options(azure_vm_maxpoolsize=0)}.
+
+Note that the pool size is unrelated to the \emph{scaleset} size; it only controls how many instances can communicate simultaneously with AzureVM.
+}
+\seealso{
+\link{az_vmss_template}, \link[parallel:makeCluster]{parallel::makeCluster}
+}
--- a/tests/testthat/test11_vmss.R
+++ b/tests/testthat/test11_vmss.R
@ -11,6 +11,9 @@ if(tenant == "" || app == "" || password == "" || subscription == "")
 vmss_name <- paste0("vmss", paste0(sample(letters, 10, TRUE), collapse=""))
 location <- "australiaeast"

+# turn off parallelisation
+maxpoolsize <- options(azure_vm_maxpoolsize=0)
+
 rg <- AzureRMR::az_rm$
    new(tenant=tenant, app=app, password=password)$
    get_subscription(subscription)$
@ -18,7 +21,7 @@ rg <- AzureRMR::az_rm$

 test_that("Scaleset creation works",
 {
-    vm <- rg$create_vm_scaleset(vmss_name, user_config("username", "../resources/testkey.pub"), instances=2)
+    vm <- rg$create_vm_scaleset(vmss_name, user_config("username", "../resources/testkey.pub"), instances=3)
    expect_is(vm, "az_vmss_template")
 })

@ -27,11 +30,18 @@ test_that("Scaleset interaction works",
    vm <- rg$get_vm_scaleset(vmss_name)
    expect_is(vm, "az_vmss_template")

+    expect_is(vm$get_public_ip_address(), "character")
+
+    inst <- vm$list_instances()
+    expect_is(inst, "list")
+
    expect_silent(vm$run_script("ls /tmp"))

    expect_is(vm$get_vm_private_ip_addresses(), "character")
    expect_is(vm$get_vm_public_ip_addresses(), "character")
-    expect_is(vm$get_public_ip_address(), "character")
+
+    expect_is(vm$get_vm_private_ip_addresses(names(inst)[1:2]), "character")
+    expect_is(vm$get_vm_public_ip_addresses(names(inst)[1:2]), "character")
 })

 test_that("Scaleset deletion works",
@ -44,3 +54,4 @@ test_that("Scaleset deletion works",
 })

 rg$delete(confirm=FALSE)
+options(maxpoolsize)
--- a/tests/testthat/test12_custom.R
+++ b/tests/testthat/test12_custom.R
@ -32,6 +32,16 @@ test_that("Resource sharing works",
    expect_is(rg$create_vm(vmname2, user, size, vnet=vnet, nsg=NULL), "az_vm_template")

    expect_is(rg$create_vm_scaleset(ssname, user, instances=3, size=size, vnet=vnet, nsg=NULL), "az_vmss_template")
+
+    expect_error(rg$get_resource(type="Microsoft.Network/virtualNetworks", name=paste0(vmname2, "-vnet")))
+    expect_error(rg$get_resource(type="Microsoft.Network/networkSecurityGroups", name=paste0(vmname2, "-nsg")))  # NSG lookup must use the NSG resource type
+
+    rg$delete_vm_scaleset(ssname, confirm=FALSE)
+    rg$delete_vm(vmname2, confirm=FALSE)
+    rg$delete_vm(vmname1, confirm=FALSE)
+
+    Sys.sleep(10)
+    expect_true(is_empty(rg$list_resources()))
 })

 test_that("Custom resource works",
@ -48,6 +58,13 @@ test_that("Custom resource works",
        kind="Storage"
    )
    expect_is(rg$create_vm(vmname, user, size, other_resources=list(stor)), "az_vm_template")
+
+    expect_is(rg$get_resource(type="Microsoft.Storage/storageAccounts", name=paste0(vmname, "stor")), "az_resource")
+
+    rg$delete_vm(vmname, confirm=FALSE)
+
+    Sys.sleep(10)
+    expect_true(is_empty(rg$list_resources()))
 })

 test_that("Scaleset options work",
--- a/tests/testthat/test13_par.R
+++ b/tests/testthat/test13_par.R
@ -0,0 +1,48 @@
+context("VM scaleset connection pool")
+
+tenant <- Sys.getenv("AZ_TEST_TENANT_ID")
+app <- Sys.getenv("AZ_TEST_APP_ID")
+password <- Sys.getenv("AZ_TEST_PASSWORD")
+subscription <- Sys.getenv("AZ_TEST_SUBSCRIPTION")
+
+if(tenant == "" || app == "" || password == "" || subscription == "")
+    skip("Tests skipped: ARM credentials not set")
+
+vmss_name <- paste0("vmss", paste0(sample(letters, 10, TRUE), collapse=""))
+location <- "australiaeast"
+
+maxpoolsize <- options(azure_vm_maxpoolsize=10)
+
+rg <- AzureRMR::az_rm$
+    new(tenant=tenant, app=app, password=password)$
+    get_subscription(subscription)$
+    create_resource_group(vmss_name, location)
+
+test_that("Scaleset connection pool works",
+{
+    vm <- rg$create_vm_scaleset(vmss_name, user_config("username", "../resources/testkey.pub"), instances=5,
+        autoscaler=NULL, load_balancer=NULL)
+    expect_is(vm, "az_vmss_template")
+
+    inst <- vm$list_instances()
+    expect_is(inst, "list")
+
+    expect_message(vm$run_script("ls /tmp", id=names(inst)[1:2]), "Creating background pool")  # no pool exists yet, so the first 2-instance call creates one
+    expect_true(exists("pool", AzureVM:::.AzureVM) && length(AzureVM:::.AzureVM$pool) == 2)
+
+    expect_silent(vm$get_vm_private_ip_addresses(names(inst[1:2])))  # existing pool of 2 is large enough: reused without a message
+    expect_silent(vm$get_vm_private_ip_addresses(inst[1:2]))  # same subset, passed as a list of instance objects
+
+    expect_message(vm$get_vm_private_ip_addresses(), "Creating background pool")  # all instances: pool of 2 is too small, so it is recreated
+    expect_true(exists("pool", AzureVM:::.AzureVM) && length(AzureVM:::.AzureVM$pool) == 5)
+
+    expect_silent(vm$get_vm_private_ip_addresses(inst))
+    expect_silent(vm$get_vm_private_ip_addresses(inst[[1]]))  # single instance object: handled serially, pool not involved
+
+    delete_pool()
+    expect_false(exists("pool", AzureVM:::.AzureVM))
+})
+
+rg$delete(confirm=FALSE)
+options(maxpoolsize)
+
--- a/vignettes/intro.rmd
+++ b/vignettes/intro.rmd
@ -142,12 +142,21 @@ sub$create_vm_scaleset("mydsvmss", user_config("myname", "~/.ssh/id_rsa.pub"), i

 # Large Debian scaleset (multiple placement groups), using low-priority VMs
 # need to set the instance size to something that supports low-pri
-sub$create_vm_scaleset("mydebss", user_config("myname", "~/.ssh/id_rsa.pub"), instances=10,
+sub$create_vm_scaleset("mylargess", user_config("myname", "~/.ssh/id_rsa.pub"), instances=10,
                       size="Standard_DS3_v2", config="debian_9_backports_ss",
                       options=scaleset_options(low_priority=TRUE, large_scaleset=TRUE),
                       location="australiaeast")
 ```

+Working with scaleset instances can be tedious if you have a large scaleset, since R can only connect to one instance at a time. To solve this problem, AzureVM creates a pool of background processes that connect in parallel with the scaleset, leading to significant speedups. The pool is created automatically the first time it is needed, and is deleted at the end of the session.
+
+```r
+# this will create a pool of up to 10 processes that talk to the scaleset
+mylargess$run_script("echo hello world! > /tmp/hello.txt")
+```
+
+You can control the size of the pool with the global `azure_vm_minpoolsize` and `azure_vm_maxpoolsize` options, which have default values 2 and 10 respectively. To turn off parallel connections, set `options(azure_vm_maxpoolsize=0)`. Note that the pool size is unrelated to the _scaleset_ size; it only controls how many instances can communicate with AzureVM simultaneously.
+
 ## Sharing resources

 You can also include an existing Azure resource in a deployment, by supplying an AzureRMR `az_resource` object as an argument in the `create_vm` or `create_vm_scaleset` call. For example, here we create a VM and a scaleset that share a single virtual network/subnet.