зеркало из https://github.com/Azure/AzureDSVM.git
Merge branch 'master' of https://github.com/Azure/AzureDSVM
This commit is contained in:
Коммит
3aee03acee
|
@ -205,5 +205,5 @@ deployDSVMCluster <- function(context,
|
|||
dns.label=dns.label)
|
||||
}
|
||||
|
||||
invisible(TRUE)
|
||||
return(TRUE)
|
||||
}
|
||||
|
|
|
@ -105,8 +105,8 @@ library(dplyr)
|
|||
# name the resource group that we will create transiently for the
|
||||
# purposes of this script.
|
||||
|
||||
# Create a random resource group to reduce likelihood of conflict with
|
||||
# other users.
|
||||
# Create a random name which will be used for the hostname and
|
||||
# resource group to reduce likelihood of conflict with other users.
|
||||
|
||||
runif(4, 1, 26) %>%
|
||||
round() %>%
|
||||
|
@ -115,17 +115,23 @@ runif(4, 1, 26) %>%
|
|||
{sprintf("Base name:\t\t%s", .) %>% cat("\n")} ->
|
||||
BASE
|
||||
|
||||
BASE %>%
|
||||
paste0("my_dsvm_", .,"_rg_sea") %T>%
|
||||
{sprintf("Resource group:\t\t%s", .) %>% cat("\n")} ->
|
||||
RG
|
||||
|
||||
# Choose a data centre location.
|
||||
# Choose a data centre location. The abbreviation is used for the
|
||||
# resource group name.
|
||||
|
||||
"southeastasia" %T>%
|
||||
{sprintf("Data centre location:\t%s", .) %>% cat("\n")} ->
|
||||
LOC
|
||||
|
||||
ABR <- "sea"
|
||||
|
||||
# Create a random resource group to reduce likelihood of conflict with
|
||||
# other users.
|
||||
|
||||
BASE %>%
|
||||
paste0("my_dsvm_", .,"_rg_", ABR) %T>%
|
||||
{sprintf("Resource group:\t\t%s", .) %>% cat("\n")} ->
|
||||
RG
|
||||
|
||||
# Include the random BASE in the hostname to reduce the likelihood of
|
||||
# conflict.
|
||||
|
||||
|
@ -192,13 +198,18 @@ default.
|
|||
```{r}
|
||||
# List the available VM sizes. May differ with location of the data centre.
|
||||
|
||||
getVMSizes(context, "southeastasia") %>%
|
||||
getVMSizes(context, LOC) %>%
|
||||
set_names(c("Size", "Cores", "DiskGB", "RAM GB", "Disks"))
|
||||
|
||||
# The default size.
|
||||
|
||||
formals(deployDSVM)$size
|
||||
|
||||
# Choose a size to suit
|
||||
|
||||
SIZE <- "Standard_D1_v2" # 1 Core, 3.5 GB RAM, 50 GB SSD, $80
|
||||
SIZE <- "Standard_D3_v2" # 4 Cores, 14 GB RAM, 200 GB SSD, $318
|
||||
|
||||
# The default operating system.
|
||||
|
||||
formals(deployDSVM)$os
|
||||
|
@ -214,6 +225,7 @@ ldsvm <- deployDSVM(context,
|
|||
location = LOC,
|
||||
hostname = HOST,
|
||||
username = USER,
|
||||
size = SIZE,
|
||||
pubkey = PUBKEY)
|
||||
ldsvm
|
||||
|
||||
|
@ -265,9 +277,10 @@ system(paste(ssh, "sudo apt-get -y install wajig"))
|
|||
system(paste(ssh, "wajig install -y lsb htop"))
|
||||
system(paste(ssh, "lsb_release -idrc"))
|
||||
system(paste(ssh, "wajig update"))
|
||||
# Manually ssh to the server and then ...
|
||||
# wajig distupgrade
|
||||
# sudo reboot
|
||||
system(paste(ssh, "wajig distupgrade -y"))
|
||||
system(paste(ssh, "sudo reboot"))
|
||||
Sys.sleep(20)
|
||||
system(paste(ssh, "uptime"))
|
||||
```
|
||||
|
||||
# Deploy a Windows Data Science Virtual Machine - Optional
|
||||
|
|
|
@ -10,18 +10,19 @@ vignette: >
|
|||
|
||||
# Use Case
|
||||
|
||||
A common use case is for a Data Scientist to create their R programs
|
||||
A common use case for a Data Scientist is to create their R programs
|
||||
to analyse a dataset on their local compute platform (e.g., a laptop
|
||||
with 6GB RAM running Ubuntu with R installed). Development is
|
||||
performed with a subset of the full dataset (a random sample) that
|
||||
will not exceed the available memory and will return results
|
||||
quickly. When the experimental setup is complete the script can be
|
||||
sent across to a considerably more capable compute engine on Azure.
|
||||
sent across to a considerably more capable compute engine on Azure for
|
||||
modelling the whole population.
|
||||
|
||||
In this vignette a Linux Data Science Virtual Machine (DSVM) cluster
|
||||
is deployed, a distributed/parallel analysis is completed, results
|
||||
collected, and the compute resources deleted. Azure consumption occurs
|
||||
just for the duration.
|
||||
just for the duration.
|
||||
|
||||
# Setup
|
||||
|
||||
|
@ -32,18 +33,14 @@ just for the duration.
|
|||
USER <- Sys.info()[['user']]
|
||||
|
||||
source(paste0(USER, "_credentials.R"))
|
||||
```
|
||||
|
||||
```{r packages}
|
||||
# Load the required packages.
|
||||
|
||||
library(AzureSMR) # Support for managing Azure resources.
|
||||
library(AzureDSVM) # Further support for the Data Scientist.
|
||||
library(magrittr)
|
||||
library(dplyr)
|
||||
```
|
||||
|
||||
```{r tuning}
|
||||
# Parameters for this script: the name for the new resource group and
|
||||
# its location across the Azure cloud. The resource name is used to
|
||||
# name the resource group that we will create transiently for the
|
||||
|
@ -77,14 +74,7 @@ HOST <-
|
|||
{sprintf("Hostname:\t\t%s", .) %>% cat("\n")}
|
||||
|
||||
cat("\n")
|
||||
```
|
||||
|
||||
To begin with, let's check the status of the DSVM and start it if it
|
||||
is deallocated. This is achieved with AzureSMR, and again
|
||||
credentials for authenticating the app in Active Directory should be
|
||||
provided.
|
||||
|
||||
```{r connect}
|
||||
# Connect to the Azure subscription and use this as the context for
|
||||
# all of our activities.
|
||||
|
||||
|
@ -106,23 +96,19 @@ if (! rg_pre_exists)
|
|||
azureCreateResourceGroup(context, RG, LOC)
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
# Deploy the VM Cluster
|
||||
|
||||
```{r deploy a cluster of DSVMs}
|
||||
# Deploy a cluster of 3 DSVMs.
|
||||
|
||||
COUNT <- 3
|
||||
|
||||
deployDSVMCluster(context,
|
||||
resource.group=RG,
|
||||
location=LOC,
|
||||
hostname=BASE,
|
||||
username=USER,
|
||||
authen="Key",
|
||||
pubkey=PUBKEY,
|
||||
count=COUNT)
|
||||
resource.group = RG,
|
||||
location = LOC,
|
||||
hostname = BASE,
|
||||
username = USER,
|
||||
authen = "Key",
|
||||
pubkey = PUBKEY,
|
||||
count = COUNT)
|
||||
|
||||
cluster <- azureListVM(context, RG, LOC)
|
||||
|
||||
|
@ -152,17 +138,27 @@ for (i in 1:COUNT)
|
|||
Next step is to use the DSVM for data analytics.
|
||||
|
||||
There are many ways of interacting with a DSVM. For both Linux and
|
||||
Windows based DSVMs, it is convenient to remote login onto the
|
||||
hostname with GUI (more detailed information can be found
|
||||
[here](https://docs.microsoft.com/en-us/azure/machine-learning/machine-learning-data-science-provision-vm)). A
|
||||
lot of times remote execution within R session is preferred by data
|
||||
scientist as it can be efficiently automated by R scripts. The
|
||||
following chunks of codes demonstrate how to use an R interface for
|
||||
remote execution of R scripts under a desired computing context.
|
||||
Windows based DSVMs it is convenient to remote login onto the hostname
|
||||
with GUI (more detailed information can be found
|
||||
[here](https://docs.microsoft.com/en-us/azure/machine-learning/machine-learning-data-science-provision-vm)). Often
|
||||
remote execution within an R session is preferred by data scientists
|
||||
as it can be efficiently automated through R scripts. The following
|
||||
chunks of code demonstrate how to use R for remote execution of R
|
||||
scripts under a desired computing context.
|
||||
|
||||
A very simple experiment on random number generation. The function `executeScript` handles the remote execution (Note the current version only supports remote execution of script on a Linux DSVM, and the remote execution is achieved via ssh channel). Computing context can be specified for the execution. In the case of "clusterParallel", a cluster of DSVMs are used.
|
||||
We begin with a very simple experiment with random number
|
||||
generation. The function `executeScript()` handles the remote
|
||||
execution. (Note that the current version only supports remote
|
||||
execution of a script on a Linux DSVM and the remote execution is
|
||||
achieved via an ssh channel.) The computing context can be specified
|
||||
for the execution. In the case of "clusterParallel", a cluster of
|
||||
DSVMs is used.
|
||||
|
||||
Updates - **Microsoft R Server (>= 9.0) allows remote execution on a DSVM which is properly configured. One can follow the [steps](https://msdn.microsoft.com/en-us/microsoft-r/operationalize/remote-execution) to configure the deployed DSVMs for remote interaction with Microsoft R Server.**
|
||||
**Note that Microsoft R Server (>= 9.0) allows remote execution on a
|
||||
properly configured DSVM. One can follow the [steps
|
||||
here](https://msdn.microsoft.com/en-us/microsoft-r/operationalize/remote-execution)
|
||||
to configure the deployed DSVMs for remote interaction with Microsoft
|
||||
R Server.**
|
||||
|
||||
```{r set R interface}
|
||||
|
||||
|
@ -178,43 +174,43 @@ tmpf1 <- tempfile(paste0("AzureDSVM_experiment_01_"))
|
|||
file.create(tmpf1)
|
||||
writeLines(code, tmpf1)
|
||||
|
||||
# local parallelism on node cores.
|
||||
# Local parallelism on node cores.
|
||||
|
||||
t1 <- Sys.time()
|
||||
|
||||
executeScript(context,
|
||||
resource.group=RG,
|
||||
hostname=cluster$name[1],
|
||||
remote=paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username=unique(cluster$admin),
|
||||
script=tmpf1,
|
||||
compute.context="localParallel")
|
||||
resource.group = RG,
|
||||
hostname = cluster$name[1],
|
||||
remote = paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username = unique(cluster$admin),
|
||||
script = tmpf1,
|
||||
compute.context = "localParallel")
|
||||
|
||||
t2 <- Sys.time()
|
||||
|
||||
# cluster parallelism across nodes.
|
||||
|
||||
executeScript(context,
|
||||
resource.group=RG,
|
||||
hostname=cluster$name[1],
|
||||
remote=paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
master=paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
slaves=paste(cluster$name[-1],
|
||||
cluster$location[-1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username=unique(cluster$admin),
|
||||
script=tmpf1,
|
||||
compute.context="clusterParallel")
|
||||
resource.group = RG,
|
||||
hostname = cluster$name[1],
|
||||
remote = paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
master = paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
slaves = paste(cluster$name[-1],
|
||||
cluster$location[-1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username = unique(cluster$admin),
|
||||
script = tmpf1,
|
||||
compute.context = "clusterParallel")
|
||||
|
||||
t3 <- Sys.time()
|
||||
|
||||
|
@ -226,11 +222,12 @@ performance2
|
|||
|
||||
```
|
||||
|
||||
Yet another example with parallel execution by using `rxExec` function from Microsoft RevoScaleR package.
|
||||
Yet another example with parallel execution by using the `rxExec` function
|
||||
from the Microsoft RevoScaleR package.
|
||||
|
||||
```{r}
|
||||
|
||||
# parallelizing k-means clustering on iris data.
|
||||
# Parallelizing k-means clustering on the iris dataset.
|
||||
|
||||
codes <- paste("library(scales)",
|
||||
"df <- scale(iris[, -5])",
|
||||
|
@ -245,23 +242,23 @@ writeLines(codes, tmpf2)
|
|||
t4 <- Sys.time()
|
||||
|
||||
executeScript(context,
|
||||
resource.group=RG,
|
||||
hostname=cluster$name[1],
|
||||
remote=paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
master=paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
slaves=paste(cluster$name[-1],
|
||||
cluster$location[-1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username=unique(cluster$admin),
|
||||
script=tmpf2,
|
||||
compute.context="clusterParallel")
|
||||
resource.group = RG,
|
||||
hostname = cluster$name[1],
|
||||
remote = paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
master = paste(cluster$name[1],
|
||||
cluster$location[1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
slaves = paste(cluster$name[-1],
|
||||
cluster$location[-1],
|
||||
"cloudapp.azure.com",
|
||||
sep="."),
|
||||
username = unique(cluster$admin),
|
||||
script = tmpf2,
|
||||
compute.context = "clusterParallel")
|
||||
|
||||
t5 <- Sys.time()
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче