Use Graham's updates on deployDSVM

This commit is contained in:
yueguoguo 2017-02-16 12:48:32 +08:00
Родитель 123d1cd5ce ae3581703e
Коммит 368a6f4f43
9 изменённых файлов: 546 добавлений и 17 удалений

Просмотреть файл

@ -1,2 +1,5 @@
^.*\.Rproj$
^\.Rproj\.user$
^.*_credentials.R$
^Makefile$
^vignettes/.*.R$

10
Makefile Normal file
Просмотреть файл

@ -0,0 +1,10 @@
# Package version, read once from the Version field of DESCRIPTION.
# The pattern is anchored so that only a line starting with "Version:"
# can match.
VER := $(shell grep '^Version:' DESCRIPTION | cut -d" " -f2)

# Package name, taken from the name of the directory make is running
# in. CURDIR is maintained by make itself, so it is also correct when
# make is invoked with -C.
PKG := $(shell basename '$(CURDIR)')

# R Specific
include r.mk

# GIT Specific
include git.mk

Просмотреть файл

@ -1,6 +1,6 @@
#' AzureDSR
#'
#' The AzureDSR functions boost efficiency of data science analytics with Azure resources.
#' Support data science analytics with Azure resources.
#'
#' @name AzureDSR-package
#' @aliases AzureDSR

Просмотреть файл

@ -7,6 +7,7 @@
#' @param location Location of the data centre to host the DSVM.
#' @param name Name of the DSVM. Lowercase characters or numbers
#' only. Special characters are not permitted.
<<<<<<< HEAD
#' @param username User name of the DSVM. It should be different
#' from `name`.
#' @param size Size of the DSVM. The default is
@ -18,18 +19,45 @@
#' @param pubkey Public key for the DSVM. Only applicable for
#' public-key based authentication of Linux based DSVM.
#' @param password Pass word for the DSVM.
=======
#' @param username User name of the DSVM. It should be different from
#' `name`.
#' @param size Size of the DSVM. The default is "Standard_D1_v2". All
#' available sizes can be obtained by function `getVMSizes`.
#' @param os Operating system of DSVM. Permitted values are "Linux"
#' and "Windows" for Linux based and Windows based operating
#' systems, respectively.
#' @param authen Either "Key" or "Password", meaning public-key based
#' or password based authentication, respectively. Note Windows DSVM
#' by default uses password based authentication and this argument
#' can be left unset.
#' @param pubkey Public key for the DSVM. Only applicable for
#' public-key based authentication of Linux based DSVM.
#' @param password Pass word for the DSVM.
#' @param dns DNS label for the VM address. The URL for accessing the
#' deployed DSVM will be "<dns_label>.<location>.cloudapp.azure.com".
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
#' @param mode Mode of virtual machine deployment. Default is "Sync".
#'
#' @export
deployDSVM <- function(context,
resource.group,
location,
name,
username,
<<<<<<< HEAD
size="Standard_D3_v2",
=======
size="Standard_D1_v2",
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
os,
authen="",
pubkey="",
password="",
<<<<<<< HEAD
=======
dns=name,
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
mode="Sync")
{
# check if required arguments are present.
@ -85,12 +113,23 @@ deployDSVM <- function(context,
if(!rg_exist)
stop("The specified resource group does not exist in the current region.")
<<<<<<< HEAD
# check if vm is in the available set.
=======
# Check if location is available.
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
vm_available <- getVMSizes(context, location)
<<<<<<< HEAD
if(!(size %in% unlist(select(vm_available, name))))
stop("Unknown size - see getVMSizes() for allowed options.")
=======
# Check if vm size exists.
# if(!(size %in% getVMSizes()$Sizes))
# stop("Unknown size - see getVMSizes() for allowed options.")
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
# Incorrect naming of a vm may lead to an unsuccessful deployment of
# the DSVM - normally it returns a 400 error from REST call. Check
@ -142,7 +181,11 @@ deployDSVM <- function(context,
templ <-
readLines(temp_path) %>%
<<<<<<< HEAD
gsub("<DNS_LABEL>", name, .) %>%
=======
gsub("<DNS_LABEL>", dns, .) %>%
>>>>>>> ae3581703ebcdee12a4b154010d7202d1b4bd6f9
paste0(collapse="")
dname <- paste0(name, "_dpl")

207
R/deployDSVM.R.orig Normal file
Просмотреть файл

@ -0,0 +1,207 @@
#' @title Deploy a new Data Science Virtual Machine (DSVM).
#'
#' @param context Authentication context of AzureSMR encapsulating the
#'   TID, CID, and key obtained from Azure Active Directory.
#' @param resource.group The Azure resource group where the DSVM is
#'   created.
#' @param location Location of the data centre to host the DSVM.
#' @param name Name of the DSVM. Lowercase characters or numbers
#'   only, at most 15 characters. Special characters are not
#'   permitted.
#' @param username User name of the DSVM. It should be different from
#'   `name`.
#' @param size Size of the DSVM. The default is "Standard_D1_v2". All
#'   available sizes can be obtained by function `getVMSizes`.
#' @param os Operating system of DSVM. Permitted values are "Linux"
#'   and "Windows" for Linux based and Windows based operating
#'   systems, respectively.
#' @param authen Either "Key" or "Password", meaning public-key based
#'   or password based authentication, respectively. Note Windows DSVM
#'   by default uses password based authentication and this argument
#'   can be left unset.
#' @param pubkey Public key for the DSVM. Only applicable for
#'   public-key based authentication of Linux based DSVM.
#' @param password Password for the DSVM.
#' @param dns DNS label for the VM address. Defaults to `name`. The
#'   URL for accessing the deployed DSVM will be
#'   "<dns_label>.<location>.cloudapp.azure.com".
#' @param mode Mode of virtual machine deployment. Default is "Sync".
#'
#' @return The fully qualified domain name of the DSVM as a character
#'   string. When `mode` is "sync" (case-insensitive) the output of
#'   `dig <fqdn> +short` is attached as the attribute "ip".
#'
#' @export
deployDSVM <- function(context,
                       resource.group,
                       location,
                       name,
                       username,
                       size="Standard_D1_v2",
                       os,
                       authen="",
                       pubkey="",
                       password="",
                       dns=name,
                       mode="Sync")
{
  # Check that the required arguments are present.

  if(missing(context))
    stop("Please specify a context (contains TID, CID, KEY).")

  if(missing(resource.group))
    stop("Please specify an Azure resouce group.")

  if(missing(location))
    stop("Please specify a data centre location.")

  if(missing(name))
    stop("Please specify a virtual machine name.")

  if(missing(username))
    stop("Please specify a virtual machine user name.")

  if(missing(os))
    stop("Please specify a virtual machine OS.")

  if(os == "Linux" && missing(authen))
    stop("Please specify an authentication method for Linux DSVM.")

  if(os == "Windows" && missing(password))
    stop("Please specify a password for Windows DSVM.")

  if(authen == "Key" && missing(pubkey))
    stop("Please specify a public key.")

  if(authen == "Password" && missing(password))
    stop("Please specify a password.")

  # Incorrect naming of a vm may lead to an unsuccessful deployment of
  # the DSVM - normally it returns a 400 error from REST call. Check
  # the name here, before any other precondition, as the check is
  # purely local. nchar() counts characters; length() would count
  # vector elements and never exceed 1 for a single name.

  if(nchar(name) > 15)
    stop("Name of virtual machine is too long.")

  if(!grepl("^[a-z0-9]+$", name))
    stop("Name of virtual machine is not valid - only lowercase and digits permitted.")

  # Other preconditions.

  # Check if AzureSMR context is valid.

  if(!is.azureActiveContext(context))
    stop("Please use a valid AzureSMR context.")

  # Check if resource group exists. The comparison is against the
  # resource.group argument, looked up in the `name` column of the
  # resource group listing.

  rg_list <- azureListRG(context)

  if(!(resource.group %in% rg_list[["name"]]))
    stop("The specified resource group does not exist in the current region.")

  # Check if the requested vm size is available at the location.
  # NOTE(review): assumes getVMSizes() returns a data frame with a
  # `name` column of size names - confirm against getVMSizes().

  vm_available <- getVMSizes(context, location)

  if(!(size %in% unlist(vm_available[["name"]])))
    stop("Unknown size - see getVMSizes() for allowed options.")

  # check if password is valid.

  # if(!grepl("^(?=.*[[A-Za-z]])(?=.*\\d)(?=.*[[$@$!%*#?&]])[[A-Za-z\\d$@$!%*#?&]]{8,}$", password))
  #   stop("Password not valid - minimum 8 characters with at least one digit and one special character.")

  # Load template and parameter JSON files for deployment

  if(os == "Windows") {
    temp_path <- system.file("etc", "template_windows.json", package="AzureDSR")
    para_path <- system.file("etc", "parameter_windows.json", package="AzureDSR")
  } else if(os == "Linux") {
    if(authen == "Key") {
      temp_path <- system.file("etc", "template_linux_key.json", package="AzureDSR")
      para_path <- system.file("etc", "parameter_linux_key.json", package="AzureDSR")
    } else if(authen == "Password") {
      temp_path <- system.file("etc", "template_linux.json", package="AzureDSR")
      para_path <- system.file("etc", "parameter_linux.json", package="AzureDSR")
    } else {
      stop("Please specific a valid authentication method, i.e., either 'Key' for public key based or 'Password' for password based, for Linux OS based DSVM")
    }
  } else {
    stop("Please specify a valid OS type, i.e., either 'Windows' or 'Linux'.")
  }

  # Update the parameter JSON with the virtual machine name.

  param <-
    readLines(para_path) %>%
    gsub("<LOCATION>", location, .) %>%
    gsub("<DEFAULT>", name, .) %>%
    gsub("<USER>", username, .) %>%
    gsub("<VMSIZE>", size, .) %>%
    gsub("<PWD>", password, .) %>%
    gsub("<PUBKEY>", pubkey, .) %>%
    paste0(collapse="")

  # Update the template JSON with the appropriate parameters.

  templ <-
    readLines(temp_path) %>%
    gsub("<DNS_LABEL>", dns, .) %>%
    paste0(collapse="")

  dname <- paste0(name, "_dpl")

  AzureSMR::azureDeployTemplate(context,
                                deplname=dname,
                                templateJSON=templ,
                                paramJSON=param,
                                resourceGroup=resource.group,
                                mode=mode)

  # The public address is built from the DNS label written into the
  # template, which only coincides with `name` when `dns` is left at
  # its default.

  fqdn <- paste0(dns, ".", location, ".cloudapp.azure.com")

  # The host is shell-quoted because `dns` and `location` are caller
  # supplied and are not otherwise validated.

  if (tolower(mode) == "sync")
    attr(fqdn, "ip") <-
      system(paste("dig", shQuote(fqdn), "+short"), intern=TRUE) # Get from the VM meta data?

  return(fqdn)
}

66
git.mk Normal file
Просмотреть файл

@ -0,0 +1,66 @@
########################################################################
# Version Control - git
#
# Convenience targets wrapping common git commands. When APP is
# non-empty, most targets run the same git command a second time
# inside the $(APP) directory.

# None of these targets produce a file of the same name.
.PHONY: status info pull push diff difftool log fulllog

status:
	@echo "-------------------------------------------------------"
	git status --untracked-files=no
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git status --untracked-files=no)
endif
	@echo "-------------------------------------------------------"

# NOTE(review): "info" is not a built-in git subcommand; presumably
# this relies on an alias or extension being installed - confirm.
info:
	git info

pull:
	@echo "-------------------------------------------------------"
	git pull
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git pull)
endif
	@echo "-------------------------------------------------------"

push:
	@echo "-------------------------------------------------------"
	git push
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git push)
endif
	@echo "-------------------------------------------------------"

# The nested directory is $(APP), matching every other target, rather
# than a fixed directory name.
diff:
	@echo "-------------------------------------------------------"
	git --no-pager diff --color
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git --no-pager diff --color)
endif
	@echo "-------------------------------------------------------"

difftool:
	git difftool
ifneq ($(APP),)
	(cd $(APP) && git difftool)
endif

log:
	@echo "-------------------------------------------------------"
	git --no-pager log --stat --max-count=10
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git --no-pager log --stat --max-count=10)
endif
	@echo "-------------------------------------------------------"

fulllog:
	@echo "-------------------------------------------------------"
	git --no-pager log
ifneq ($(APP),)
	@echo "-------------------------------------------------------"
	(cd $(APP) && git --no-pager log)
endif
	@echo "-------------------------------------------------------"

19
r.mk Normal file
Просмотреть файл

@ -0,0 +1,19 @@
# R specific rules: extract and run code from R Markdown files, and
# check, build and install the package.

RSCRIPT = Rscript
RSCRIPT_OPTS = --vanilla

# These targets do not produce files of the same name.
.PHONY: check build install

# Extract the R code of an R Markdown file into an R script.
%.R: %.Rmd
	${RSCRIPT} ${RSCRIPT_OPTS} -e 'library(knitr);purl("$<", output="$@")'

# Run an R script from within its own directory. The cd and the
# Rscript call share one recipe line because make runs every recipe
# line in a separate shell.
%.run: %.R
	cd "$(<D)" && ${RSCRIPT} ${RSCRIPT_OPTS} -e 'source("$(<F)")'

check:
	R CMD check --check-subdirs=yes .

build:
	R CMD build .

# PKG and VER are expected to be defined by the including Makefile.
install: build
	R CMD INSTALL $(PKG)_$(VER).tar.gz

Просмотреть файл

@ -3,21 +3,15 @@ title = "Using Azure Data Science Resources: Compute on Linux DSVM Quick Start"
author= "Graham Williams"
---
# TODO
This is currently just a copy of LinuxDSVM. Intend to submit a script
file build.R across to the server to build randomForest and then
rxDForest on weatherAUS
# Use Case
Our use case here is for a Data Scientist creating their R programs to
analyse a dataset on their local compute platform (e.g., a laptop with
6GB RAM running Ubuntu with R installed). Development is done with a
dataset size (a random sample of the full dataset perhaps) that will
not tax the available memory and will return results quickly. When the
experimental setup is complete the script can be sent across to an
considerably more capable compute engine on Azure.
A common use case is for a Data Scientist to create their R programs
to analyse a dataset on their local compute platform (e.g., a laptop
with 6GB RAM running Ubuntu with R installed). Development is
performed with a subset of the full dataset (a random sample) that
will not exceed the available memory and will return results
quickly. When the experimental setup is complete the script can be
sent across to a considerably more capable compute engine on Azure.
A Linux Data Science Virtual Machine (DSVM) is deployed, the analysis
completed, results collected, and the compute resources deleted. Azure
@ -144,7 +138,8 @@ ldsvm <- deployDSVM(context,
ldsvm
```
`deployDSVM` also supports deployment of Windows DSVM, which can be achieved by setting the argument of `vmos` to "Windows".
`deployDSVM` also supports deployment of Windows DSVM, which can be
achieved by setting the argument `os` to "Windows".
```{r}
wdsvm <- deployDSVM(context,
@ -165,7 +160,7 @@ Prove that the server exists.
```{r prove exists}
# Send a simple system() command across to the new server to test its
# existence. Expect a single line wiht an indication of how long the
# existence. Expect a single line with an indication of how long the
# server has been up and running.
cmd <- paste("ssh -q",
@ -183,7 +178,8 @@ system(cmd)
# is probably no need to wait. Only delete if it did not pre-exist
# this script. Deletion seems to take 10 minutes or more.
if (! rg_pre_exists) azureDeleteResourceGroup(context, RG)
if (! rg_pre_exists)
azureDeleteResourceGroup(context, RG)
```
Once deleted we are consuming no more.

Просмотреть файл

@ -0,0 +1,185 @@
---
title: "Using Azure Data Science Resources: Connect to Linux DSVM Quick Start"
author: "Graham Williams"
---
# Use Case
A Linux Data Science Virtual Machine (DSVM) is deployed, a remote
command is executed to demonstrate that it exists, and then the resource
group is deleted (unless the resource group pre-existed this script).
This script is best run interactively to review its operation and to
ensure that the interaction with Azure completes.
# Preparation
We assume the user already has an Azure subscription and we have
obtained the credentials required. See
[AzureSMR's Authentication Guide](https://github.com/Microsoft/AzureSMR/blob/master/vignettes/Authentication.Rmd)
for details. We will then ensure a resource group exists and within
that resource group create a Linux DSVM. A public ssh key is used to
access the server in this script, although a username and password are
also an option. We create a Linux DSVM and a Windows DSVM.
# Setup
To get started we need to load our Azure credentials as well as the
user's ssh public key. Public keys on Linux are typically created on
the user's desktop/laptop machine and will be found within
`~/.ssh/id_rsa.pub`. It will be convenient to create a credentials file
to contain this information. The contents of the credentials file
will be something like:
```{r credentials, eval=FALSE}
# Credentials come from app creation in Active Directory within Azure.
TID <- "72f9....db47" # Tenant ID
CID <- "9c52....074a" # Client ID
KEY <- "9Efb....4nwV....ASa8=" # User key
PUBKEY <- readLines("~/.ssh/id_rsa.pub") # For Linux DSVM
PASSWORD <- "AmSj&%4aR3@kn" # For Windows DSVM
```
Save this information into a file named `<USER>_credentials.R`,
where `<USER>` is replaced with your username. Then we simply source
that file in R.
```{r setup}
# Load the required subscription resources: TID, CID, and KEY.
# Also includes the ssh PUBKEY for the user.
USER <- Sys.getenv("USER")
source(paste0(USER, "_credentials.R"))
# Install the packages if required.
## devtools::install_github("Microsoft/AzureSMR")
## devtools::install_github("Azure/AzureDSR", auth_token=GIT_TOKEN)
```
```{r packages}
# Load the required packages.
library(AzureSMR) # Support for managing Azure resources.
library(AzureDSR) # Further support for the Data Scientist.
library(magrittr)
library(dplyr)
library(rattle) # Use weatherAUS as a "large" dataset.
```
```{r tuning}
# Parameters for this script: the name of the resource group and its
# location in the Azure cloud. The resource group is created for the
# purposes of this script if it does not already exist.
RG <- "my_dsvm_rg_sea" # Created if it does not already exist; deleted at the end only if this script created it.
LOC <- "southeastasia" # Where the resource group (resources) will be hosted.
```
```{r connect}
# Connect to the Azure subscription and use this as the context for
# our activities.
context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)
# Check if the resource group already exists. Take note this script
# will not remove the resource group if it pre-existed.
context %>%
azureListRG() %>%
filter(name == RG) %>%
select(name, location) %T>%
print() %>%
nrow() %>%
equals(0) %>%
not() %T>%
print() ->
rg_pre_exists
```
# Creation
Create the resource group within which all resources we create will be
grouped.
```{r create resource group}
if (! rg_pre_exists)
{
# Create a new resource group into which we create the VMs and
# related resources. Resource group name is RG.
# Note that to create a new resource group one needs to add access
# control of Active Directory application at subscription level.
azureCreateResourceGroup(context, RG, LOC)
}
```
Create the actual Linux DSVM with public-key based authentication
method. Name, username, and size can also be configured.
```{r deploy}
# Create the required Linux DSVM - generally 4 minutes.
ldsvm <- deployDSVM(context,
resource.group=RG,
location=LOC,
name="mydsvm001",
username=USER,
size="Standard_DS1_v2",
os="Linux",
authen="Key",
pubkey=PUBKEY)
ldsvm
```
`deployDSVM` also supports deployment of Windows DSVM, which can be
achieved by setting the argument `os` to "Windows".
```{r}
wdsvm <- deployDSVM(context,
resource.group=RG,
location=LOC,
name="mydsvm002",
username=USER,
size="Standard_DS1_v2",
os="Windows",
password=PASSWORD)
wdsvm
```
Prove that the server exists.
```{r prove exists}
# Send a simple system() command across to the new server to test its
# existence. Expect a single line with an indication of how long the
# server has been up and running.
cmd <- paste("ssh -q",
"-o StrictHostKeyChecking=no",
"-o UserKnownHostsFile=/dev/null",
ldsvm, "uptime")
cmd
system(cmd)
```
# Cleanup
```{r optionally delete resource group}
# Delete the resource group now that we have proved existence. There
# is probably no need to wait. Only delete if it did not pre-exist
# this script. Deletion seems to take 10 minutes or more.
if (! rg_pre_exists)
azureDeleteResourceGroup(context, RG)
```
Once deleted we are consuming no more.