Merge branch 'master' of https://github.com/Azure/AzureDSVM

2017-03-22 17:10:31 +08:00 · 2017-03-22 17:10:31 +08:00 · 84be4f4bae
--- a/.gitignore
+++ b/.gitignore
@ -27,5 +27,3 @@ inst/doc
 .Rproj.user
 *_credentials.R
 *~
-man/
-NAMESPACE
--- a/4
+++ b/4
@ -47,8 +47,8 @@ vkmeans: vignettes

 .PHONY: resources deploy delete ping

-resources:
-	(cd test; Rscript resources.R)
+list:
+	(cd test; Rscript listRG.R)

 deploy: 
 	(cd test; Rscript deployDSVM.R)
--- a/38
+++ b/38
@ -0,0 +1,38 @@
+# Generated by roxygen2: do not edit by hand
+
+export(createComputeInterface)
+export(dataConsumption)
+export(deployDSVM)
+export(deployDSVMCluster)
+export(dumpInterface)
+export(executeScript)
+export(existsRG)
+export(expenseCalculator)
+export(fileTransfer)
+export(getVMSizes)
+export(operateDSVM)
+export(pricingRates)
+export(setConfig)
+export(updateScript)
+import(dplyr)
+import(magrittr)
+importFrom(XML,htmlParse)
+importFrom(XML,xmlValue)
+importFrom(XML,xpathApply)
+importFrom(XML,xpathSApply)
+importFrom(httr,GET)
+importFrom(httr,POST)
+importFrom(httr,PUT)
+importFrom(httr,add_headers)
+importFrom(httr,authenticate)
+importFrom(httr,content)
+importFrom(httr,headers)
+importFrom(httr,http_status)
+importFrom(httr,status_code)
+importFrom(jsonlite,fromJSON)
+importFrom(lubridate,hour)
+importFrom(lubridate,minute)
+importFrom(lubridate,second)
+importFrom(stringr,str_c)
+importFrom(stringr,str_detect)
+importFrom(utils,URLencode)
--- a/git.mk
+++ b/git.mk
@ -10,9 +10,6 @@ ifneq ($(APP),)
 endif
 	@echo "-------------------------------------------------------"

-info:
-	git info
-
 pull:
 	@echo "-------------------------------------------------------"
 	git pull
--- a/man/AzureDSVM-package.Rd
+++ b/man/AzureDSVM-package.Rd
@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/azureDSVM-package.R
+\docType{package}
+\name{AzureDSVM-package}
+\alias{AzureDSVM}
+\alias{AzureDSVM-package}
+\title{AzureDSVM}
+\description{
+Support data science analytics with Azure resources.
+}
+\keyword{package}
+
--- a/man/dataConsumption.Rd
+++ b/man/dataConsumption.Rd
@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expenseCalculator.R
+\name{dataConsumption}
+\alias{dataConsumption}
+\title{Get data consumption of an Azure subscription for a time period. Aggregation method can be either daily based or hourly based.}
+\usage{
+dataConsumption(context, instance, timeStart, timeEnd, granularity = "Hourly")
+}
+\arguments{
+\item{context}{AzureSMR context object.}
+
+\item{instance}{Name of the Azure instance whose expense is to be checked. For example, it can be the name of a virtual machine.}
+
+\item{timeStart}{Start time.}
+
+\item{timeEnd}{End time.}
+
+\item{granularity}{Aggregation granularity. Can be either "Daily" or "Hourly".}
+}
+\note{
+The start and end time points follow the ISO 8601 standard. For example, to calculate data consumption between Feb 21, 2017 and Feb 25, 2017, the inputs should be "2017-02-21 00:00:00" and "2017-02-25 00:00:00" for the start and end time points, respectively. For hourly based calculation, note that the minutes and seconds must be zero.
+}
+
--- a/man/deployDSVM.Rd
+++ b/man/deployDSVM.Rd
@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deployDSVM.R
+\name{deployDSVM}
+\alias{deployDSVM}
+\title{Deploy a new Data Science Virtual Machine (DSVM).}
+\usage{
+deployDSVM(context, resource.group, location, hostname, username,
+  size = "Standard_DS2_v2", os = "Linux", authen = ifelse(os == "Linux",
+  "Key", "Password"), pubkey = "", password = "", dns.label = hostname,
+  mode = "Sync")
+}
+\arguments{
+\item{context}{Authentication context of AzureSMR encapsulating the
+TID, CID, and key obtained from Azure Active Directory.}
+
+\item{resource.group}{The Azure resource group where the DSVM is
+created.}
+
+\item{location}{Location of the data centre to host the DSVM.}
+
+\item{hostname}{Name of the DSVM.  Lowercase characters or numbers
+only. Special characters are not permitted.}
+
+\item{username}{User name of the DSVM. It should be different from
+`hostname`.}
+
+\item{size}{Size of the DSVM. The default is "Standard_DS2_v2". All
+available sizes can be obtained by function `getVMSizes`.}
+
+\item{os}{Operating system of DSVM. Permitted values are "Linux"
+and "Windows". The default is to deploy a Linux Data Science
+Virtual Machine.}
+
+\item{authen}{Either "Key" for public-key based authentication
+(with Linux) or "Password" for a password based authentication
+(Linux or Windows). Default is to use public key authentication
+for Linux and password based authentication for Windows.}
+
+\item{pubkey}{Public key for the DSVM. Only applicable for
+public-key based authentication of Linux based DSVM.}
+
+\item{password}{Password for the DSVM.}
+
+\item{dns.label}{DNS label for the VM address. The URL for
+accessing the deployed DSVM will be
+"<dns.label>.<location>.cloudapp.azure.com".}
+
+\item{mode}{Mode of virtual machine deployment. Default is "Sync".}
+}
+\details{
+If the deployment fails visit the Azure portal
+https://ms.portal.azure.com and browse to the resource group and
+click on the failed deployment link to view the failure
+message. Typical errors include DnsRecordInUse or
+StorageAccountAlreadyTaken. If so then choose a different hostname.
+}
+
--- a/man/deployDSVMCluster.Rd
+++ b/man/deployDSVMCluster.Rd
@ -0,0 +1,74 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deployDSVMCluster.R
+\name{deployDSVMCluster}
+\alias{deployDSVMCluster}
+\title{Deploy a cluster of Linux Data Science Virtual Machines on Azure.}
+\usage{
+deployDSVMCluster(context, resource.group, location, hostnames, usernames,
+  pubkeys, count, size = "Standard_D1_v2", dns.labels = hostnames)
+}
+\arguments{
+\item{context}{AzureSMR active context.}
+
+\item{resource.group}{The Azure resource group where the DSVM is
+allocated.}
+
+\item{location}{Location of the data centre to host the DSVM.}
+
+\item{hostnames}{Hostnames for the DSVMs. Lowercase characters or
+numbers only. If a single hostname is supplied and count > 1 then
+the hostname will be used as a prefix for a sequential count of
+hostnames.}
+
+\item{usernames}{Usernames for the admin account created on the
+DSVM. If a single username is supplied then that username is used
+as the admin user on each host. Otherwise a username is provided
+for each of the DSVMs.}
+
+\item{pubkeys}{Public keys for the DSVM. This is only applicable
+for public-key based authentication of Linux based DSVM. One or a
+vector of public keys can be provided, depending on the count or
+the number of hostnames or usernames.}
+
+\item{count}{If provided this is the number of DSVM instances to be
+created. If not provided the number of DSVMs created will be
+either the number of names provided or the number of usernames
+provided.}
+
+\item{size}{The size of the DSVMs. Each DSVM is the same size.}
+
+\item{dns.labels}{DNS labels for the VM. By default this is the
+hostnames but is not required to be. The fully qualified domain
+name for accessing the deployed DSVM will then be
+"<dns.label>.<location>.cloudapp.azure.com".}
+}
+\description{
+Creates a cluster of Data Science Virtual Machines and enables the
+DSVMs to communicate across the cluster via public key based
+credentials for high performance computation. All DSVMs in the
+cluster are based on Linux OS and use public key cryptography for
+log in.
+}
+\details{
+We identify two specific use cases but recognise there are many
+that are supported by this function.
+
+A cluster is intended as a High Performance Compute engine across
+the deployed DSVMs supporting a parallel computing context as is
+available with Microsoft R Server ScaleR package. A cluster is a
+deployment of multiple identical DSVMs. A single admin username
+and public key will be used across the cluster. The individual
+machine names will be constructed from the provided name with
+sequential numbers. The data scientist will typically connect to
+the cluster from their local desktop/laptop running R locally with
+remote execution for computation. A cluster is typically created by
+the data scientist when needed and the resource group deleted on
+completion of the activity.
+
+A collection is a deployment with different usernames and public
+keys for each of the DSVMs. A vector of usernames must be
+provided. A collection is often used in creating multiple DSVMs for
+a group of data scientists or for training. A collection is often
+longer lasting than a cluster.
+}
+
--- a/man/executeScript.Rd
+++ b/man/executeScript.Rd
@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/executeScript.R
+\name{executeScript}
+\alias{executeScript}
+\title{Remote execution of R script in an R interface new_interface.}
+\usage{
+executeScript(context, resourceGroup, machines, remote, user, script, master,
+  slaves, computeContext)
+}
+\arguments{
+\item{context}{AzureSMR context.}
+
+\item{resourceGroup}{Resource group of Azure resources for computation.}
+
+\item{machines}{Remote DSVMs that will be used for computation.}
+
+\item{remote}{IP address or FQDN for a computation engine. For
+DSVM, it is either the fully qualified domain name (usually in the format of
+<hostname>.<location>.cloudapp.azure.com) or its public IP
+address. Note that if more than one machine is used for execution,
+the remote is used as the master node by default.}
+
+\item{user}{Username for logging into the remote resource.}
+
+\item{script}{R script to be executed on remote resource(s).}
+
+\item{master}{IP address or URL of a DSVM which will be used as the
+master. By default is remote.}
+
+\item{slaves}{IP addresses or URLs of slave DSVMs.}
+
+\item{computeContext}{Computation context of Microsoft R Server
+under which the mechanisms of parallelization (e.g., local
+parallel, cluster based parallel, or Spark) is
+specified. Accepted computing context include "localParallel",
+"clusterParallel", "Hadoop", and "Spark".}
+}
+\value{
+Status of script execution.
+}
+\description{
+Remote execution of R script in an R interface new_interface.
+}
+\details{
+For a localParallel compute context,
+}
+
--- a/man/existsRG.Rd
+++ b/man/existsRG.Rd
@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/existsRG.R
+\name{existsRG}
+\alias{existsRG}
+\title{Check if a resource group exists.}
+\usage{
+existsRG(context, resource.group, location, verbose = FALSE)
+}
+\arguments{
+\item{context}{Authentication context of AzureSMR encapsulating the
+TID, CID, and key obtained from Azure Active Directory.}
+
+\item{resource.group}{The Azure resource group where the DSVM is
+created.}
+
+\item{location}{Location of the data centre to host the DSVM.}
+}
+
--- a/man/expenseCalculator.Rd
+++ b/man/expenseCalculator.Rd
@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expenseCalculator.R
+\name{expenseCalculator}
+\alias{expenseCalculator}
+\title{Calculate cost of using a specific instance of Azure for certain period.}
+\usage{
+expenseCalculator(context, instance, timeStart, timeEnd, granularity, currency,
+  locale, offerId, region)
+}
+\arguments{
+\item{context}{AzureSMR context.}
+
+\item{instance}{Azure instance whose expense is to be checked. Regardless of whether a resource group is given, if an instance is given, the data consumption of that instance is returned.}
+
+\item{timeStart}{Start time.}
+
+\item{timeEnd}{End time.}
+
+\item{granularity}{Aggregation granularity. Can be either "Daily" or "Hourly".}
+
+\item{currency}{Currency in which price rating is measured.}
+
+\item{locale}{Locality information of subscription.}
+
+\item{offerId}{Offer ID of the subscription. Detailed information can be found at https://azure.microsoft.com/en-us/support/legal/offer-details/}
+
+\item{region}{region information about the subscription.}
+}
+\value{
+Total cost measured in the given currency of the specified Azure instance in the period.
+}
+
--- a/man/fileTransfer.Rd
+++ b/man/fileTransfer.Rd
@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/executeScript.R
+\name{fileTransfer}
+\alias{fileTransfer}
+\title{Upload or download files.}
+\usage{
+fileTransfer(from = ".", to = ".", user, file)
+}
+\arguments{
+\item{from}{Source location (path) of file.}
+
+\item{to}{Target location (path) of file.}
+
+\item{file}{File name - a character string.}
+}
+\note{
+File transfer is implemented by `scp` with public key based authentication.
+}
+
--- a/man/getVMSizes.Rd
+++ b/man/getVMSizes.Rd
@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/getVMSizes.R
+\name{getVMSizes}
+\alias{getVMSizes}
+\title{Get available sizes for data science virtual machines.}
+\usage{
+getVMSizes(context, location)
+}
+\arguments{
+\item{context}{An AzureSMR context.}
+
+\item{location}{Location of the Azure resources.}
+}
+\value{
+A data frame that contains basic information
+about the available DSVM sizes.
+}
+\description{
+Get available sizes for data science virtual machines.
+}
+
--- a/man/pricingRates.Rd
+++ b/man/pricingRates.Rd
@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expenseCalculator.R
+\name{pricingRates}
+\alias{pricingRates}
+\title{Get pricing details of resources under a subscription.}
+\usage{
+pricingRates(context, currency, locale, offerId, region)
+}
+\arguments{
+\item{context}{- Azure Context Object.}
+
+\item{currency}{Currency in which price rating is measured.}
+
+\item{locale}{Locality information of subscription.}
+
+\item{offerId}{Offer ID of the subscription. Detailed information can be found at https://azure.microsoft.com/en-us/support/legal/offer-details/}
+
+\item{region}{region information about the subscription.}
+}
+
--- a/r.mk
+++ b/r.mk
@ -11,7 +11,7 @@ R_FILES   := $(wildcard R/*.R)
 SRC_FILES := $(wildcard src/*) $(addprefix src/, $(COPY_SRC))
 PKG_FILES := DESCRIPTION NAMESPACE $(R_FILES) $(SRC_FILES)

-.PHONY: list doc vignettes tarball check install build clean realclean
+.PHONY: info doc vignettes tarball check install build clean realclean

 %.R: %.Rmd
 	${RSCRIPT} ${RSCRIPT_OPTS} -e 'library(knitr);purl("$<", out="$@")'	
@ -23,11 +23,11 @@ PKG_FILES := DESCRIPTION NAMESPACE $(R_FILES) $(SRC_FILES)
 $(PKG_NAME)_$(PKG_VERSION).tar.gz: $(PKG_FILES)
 	R CMD build .

-list:
-	@echo -e "PKG_NAME:    $(PKG_NAME)"
-	@echo -e "PKG_VERSION: $(PKG_VERSION)"
-	@echo -e "R_FILES:     $(R_FILES)"
-	@echo -e "SRC_FILES:   $(SRC_FILES)"
+info:
+	@echo -e "PKG_NAME    \t$(PKG_NAME)"
+	@echo -e "PKG_VERSION \t$(PKG_VERSION)"
+	@echo -e "R_FILES     \t"$(shell echo $(R_FILES) | sed 's| |\\\\n\\\\t\\\\t|g')
+	@echo -e "SRC_FILES   \t$(SRC_FILES)"

 doc:
 	R -e 'devtools::document()'
--- a/test/resources.R
+++ b/test/resources.R
--- a/vignettes/10Deploy.Rmd
+++ b/vignettes/10Deploy.Rmd
@ -1,28 +1,33 @@
 ---
 title = "Using Azure Data Science Virtual Machine: Deployment of a single DSVM"
-author= "Graham Williams"
+author= "Graham Williams and Le Zhang"
 ---

 # Use Case

-In this tutorial, both a Linux and a Windows Data Science Virtual Machine (DSVM) are deployed. 
-Code is included but
-not run to then delete the resource group if the resources are no
-longer required. Once deleted consumption will cease.
+In this tutorial, a Linux Data Science Virtual Machine (DSVM) is
+deployed and sample code to deploy a Windows DSVM is provided.  Code is
+included but not run to then delete the resource group if the resource
+group was created within this vignette. Once deleted consumption
+(cost) will cease.

 This script is best run interactively to review its operation and to
-ensure that the interaction with Azure completes. As a standalone
-script it can be run to setup a new resource group and single Linux
-DSVM.
+ensure that the interaction with Azure completes.
+
+As a standalone script we can run this script to set up a new resource
+group and a single Linux DSVM.

 # Preparation

 We assume the user already has an Azure subscription and we have
-obtained the credentials required. See instructions in the [previous vignette](https://github.com/Azure/AzureDSVM/blob/master/vignettes/00Introduction.Rmd) 
+obtained the credentials required. See the
+[Introduction](https://github.com/Azure/AzureDSVM/blob/master/vignettes/00Introduction.Rmd)
 for details. We will then ensure a resource group exists and within
-that resource group create DSVMs. A public ssh key is used to
-access the server in this script although a username and password is
-also an option. We create a Linux DSVM and a Windows DSVM.
+that resource group deploy a Linux DSVM. A secure shell (ssh) public
+key matching the current user's private key is used to access the
+server in this script although a username and password is also an
+option. We deploy a Linux DSVM and include sample code to deploy a
+Windows DSVM.

 # Setup

@ -31,7 +36,9 @@ user's ssh public key. Public keys on Linux are typically created on
 the users desktop/laptop machine and will be found within
 ~/.ssh/id_rsa.pub. It will be convenient to create a credentials file
 to contain this information. The content's of the credentials file
-will be something like:
+will be something like the following, and we assume the user creates
+such a file in the current working directory, naming the file
+<USER>_credentials.R, replacing <USER> with the user's username.

 ```{r credentials, eval=FALSE}
 # Credentials come from app creation in Active Directory within Azure.
@ -45,9 +52,7 @@ PASSWORD <- "Public%4aR3@kn"               # For Windows DSVM

 ```

-Save such information into a file with the name <USER>_credentials.R
-where <USER> is replaced with your username. Then we simply source
-that file in R.
+We can simply source that file in R.

 ```{r setup}
 # Load the required subscription resources: TID, CID, and KEY.
@ -58,13 +63,19 @@ USER <- Sys.info()[['user']]
 source(paste0(USER, "_credentials.R"))
 ```

+If the required packages are not yet installed the following will do
+so. You may need to install them into your own local library rather
+than the system library if you are not a system user.
+
 ```{r, eval=FALSE}
 # Install the packages if required.

 devtools::install_github("Microsoft/AzureSMR")
-devtools::install_github("Azure/AzureDSVM", auth_token=GIT_TOKEN) # auth_token is only required when repo is private.
+devtools::install_github("Azure/AzureDSVM", auth_token=GIT_TOKEN)
 ```

+We can then load the required packages from the libraries.
+
 ```{r packages}
 # Load the required packages.

@ -83,29 +94,31 @@ library(dplyr)
 # Create a random resource group to reduce likelihood of conflict with
 # other users.

-BASE <- 
-  runif(4, 1, 26) %>%
+runif(4, 1, 26) %>%
  round() %>%
  letters[.] %>%
  paste(collapse="") %T>%
-  {sprintf("Base name:\t\t%s", .) %>% cat("\n")}
+  {sprintf("Base name:\t\t%s", .) %>% cat("\n")} ->
+BASE

-RG <-
-  paste0("my_dsvm_", BASE,"_rg_sea") %T>%
-  {sprintf("Resource group:\t\t%s", .) %>% cat("\n")}
+BASE %>%
+  paste0("my_dsvm_", .,"_rg_sea") %T>%
+  {sprintf("Resource group:\t\t%s", .) %>% cat("\n")} ->
+RG

 # Choose a data centre location.

-LOC <-
-  "southeastasia"  %T>%
-  {sprintf("Data centre location:\t%s", .) %>% cat("\n")}
+"southeastasia"  %T>%
+  {sprintf("Data centre location:\t%s", .) %>% cat("\n")} ->
+LOC

 # Include the random BASE in the hostname to reducely likelihood of
 # conflict.

-HOST <-
-  paste0("my", BASE) %T>%
-  {sprintf("Hostname:\t\t%s", .) %>% cat("\n")}
+BASE %>%
+  paste0("my", .) %T>%
+  {sprintf("Hostname:\t\t%s", .) %>% cat("\n")} ->
+HOST

 cat("\n")
 ```
@ -121,6 +134,7 @@ context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)

 rg_pre_exists <- existsRG(context, RG, LOC)
 ```
+
 # Create a Resource Group

 Create the resource group within which all resources we create will be
@ -150,10 +164,24 @@ cat("Resource group", RG, "at", LOC,
 Create the actual Linux DSVM with public-key based authentication
 method. Name, username, and size can also be configured.

-Note one can check available VM sizes within the region by using `getVMSizes()`. Different sizes will cost differently, and the detailed information can be checked on [Azure website](https://docs.microsoft.com/en-us/azure/virtual-machines/virtual-machines-windows-sizes). The default VM size for deployment is "Standard_DS2_v2", which is featured by enhanced computation performance.
+Note one can check available VM sizes within the region by using
+`getVMSizes()`. Different sizes will cost differently, and the
+detailed information can be checked on
+[Azure website](https://docs.microsoft.com/en-us/azure/virtual-machines/virtual-machines-windows-sizes). The
+default VM size for deployment is chosen for enhanced computation
+performance. See the documentation for `deployDSVM()` for the actual
+default.

 ```{r}
-getVMSizes(context, "southeastasia")
+getVMSizes(context, "southeastasia") %>%
+  rename(Name=name,
+         Cores=numberOfCores,
+         Disk=resourceDiskSizeInMB,
+         RAM=memoryInMB,
+         Disks=maxDataDiskCount) %>%
+  select(Name, Cores, Disk, RAM, Disks) %>%
+  mutate(Disk=scales::comma(Disk/1024),
+         RAM=scales::comma(round(RAM/1024)))
 ```

 The following code deploys a Linux DSVM, and it will take approximately 4 minutes.
@ -194,7 +222,10 @@ system(cmd, intern=TRUE)
 # Deploy a Windows Data Science Virtual Machine

 `deployDSVM` also supports deployment of Windows DSVM, which can be
-achieved by setting the argument of `os` to "Windows". The deployment will take approximately 10 minutes. One can use Remote Desktop to verify the success of deployment and use the virtual machine in a remote desktop environment.
+achieved by setting the argument of `os` to "Windows". The deployment
+will take approximately 10 minutes. One can use Remote Desktop to
+verify the success of deployment and use the virtual machine in a
+remote desktop environment.

 ```{r, eval=FALSE}
 wdsvm <- deployDSVM(context,
@ -218,7 +249,7 @@ operateDSVM(context, RG, HOST, operation="Stop")

 # Optional Cleanup

-```{r optionally delete resource group}
+```{r optionally_delete_resource_group, eval=FALSE}
 # Delete the resource group now that we have proved existence. There
 # is probably no need to wait. Only delete if it did not pre-exist
 # this script. Deletion seems to take 10 minutes or more.