Added DSVM demo with air delay prediction

This commit is contained in:
yueguoguo 2017-05-12 17:32:51 +08:00
Parent 7b9aecb04c
Commit e307a591e7
6 changed files with 935 additions and 0 deletions


@@ -0,0 +1,207 @@
# -----------------------------------------------------------------------------
# Air delay prediction demo: model training, deployment, and consumption.
# -----------------------------------------------------------------------------
library(AzureSMR)
library(jsonlite)
library(dplyr)
library(magrittr)
# authentication with credentials.
source("demo/codes/credentials.R")
context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)
# get remote blob of the data processed previously in Spark.
js_processed <- AzureSMR::azureGetBlob(context,
blob="processed_data.json",
type="text",
storageAccount=SA_ACCOUNT,
storageKey=SA_KEY,
container=SA_CONTAINER,
resourceGroup="dldemo")
# take a look at the processed data.
df_processed <-
fromJSON(js_processed) %>%
mutate(DayOfWeek=as.factor(DayOfWeek),
Origin=as.factor(Origin),
Dest=as.factor(Dest)) %T>%
{head(.) %>% print()}
# -----------------------------------------------------------------------------
# Model training with the MicrosoftML neural network algorithm, with GPU acceleration
# -----------------------------------------------------------------------------
library(mrsdeploy)
library(MicrosoftML)
# split data into training and testing sets.
index <- sample(1:nrow(df_processed), round(0.7 * nrow(df_processed)))
df_train <- df_processed[index, ]
df_test <- df_processed[-index, ]
# train a neural network model.
var_names <- rxGetVarNames(df_train)
dep_name <- "ArrDel15"
ind_names <- var_names[which(var_names != dep_name)]
formu <- paste(dep_name, "~", paste(ind_names, collapse=" + "))
# without GPU acceleration.
model_nn1 <- rxNeuralNet(formu,
data=df_train,
type="binary",
acceleration="sse",
optimizer=adaDeltaSgd(),
numIterations=30)
# with GPU acceleration.
model_nn2 <- rxNeuralNet(formu,
data=df_train,
type="binary",
acceleration="gpu",
optimizer=adaDeltaSgd(),
numIterations=30,
miniBatchSize=256)
# random forest and logistic regression models for comparison.
model_rf <- rxFastForest(formu,
data=df_train,
type="binary")
model_lr <- rxLogit(formu,
data=df_train)
# score the models and plot ROC curves.
scores <- rxPredict(model_nn1,
data=df_test,
suffix="WithoutGPU",
extraVarsToWrite=names(df_test))
scores <- rxPredict(model_nn2,
data=scores,
suffix="WithGPU",
extraVarsToWrite=names(scores))
scores <- rxPredict(model_rf,
data=scores,
suffix="RandomForest",
extraVarsToWrite=names(scores))
roc <- rxRoc(actualVarName="ArrDel15",
predVarNames=grep("Probability",
names(scores),
value=TRUE),
data=scores)
plot(roc)
auc <- rxAuc(roc)
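# Inspect the per-model AUC values returned by rxAuc(); assuming they follow the
# order of the probability columns passed to rxRoc() (WithoutGPU, WithGPU, RandomForest).
print(auc)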
# save the best model into an RData object.
# NOTE: the list order must match the order of the probability columns used for the
# ROC (WithoutGPU, WithGPU, RandomForest), so that the maximum AUC picks the right model.
models <- list(model_nn1, model_nn2, model_rf)
model_optimal <- models[[which.max(auc)]]
save(model_optimal, file="demo/data/model.RData")
# -----------------------------------------------------------------------------
# Deploy the model as a real-time web service.
# -----------------------------------------------------------------------------
end_point <- "appdemo.southeastasia.cloudapp.azure.com"
# authenticate with the remote Microsoft R Server (MRS).
remoteLogin(deployr_endpoint=paste0("http://", end_point, ":12800"),
session=TRUE,
username="admin",
password=PWD)
# pause() in the remote session to switch back to the local R session.
save.image(file="demo/data/image.RData")
putLocalFile("demo/data/image.RData")
# resume() switches to the remote session; load the saved image, which contains the credentials.
# load("image.RData")
# authenticate again from the remote session, this time without creating a remote session (session=FALSE).
mrsdeploy::remoteLogin(deployr_endpoint=paste0("http://", end_point, ":12800"),
session=FALSE,
username="admin",
password=PWD)
# pause() to switch back to the local R session, then publish a real-time service with the trained model on the remote app server.
# wrap the model into a function for publishing.
delayPrediction <- function(DayOfMonth,
DayOfWeek,
Origin,
Dest,
DepTime) {
# NOTE: an ArrDel15 column is included because the MicrosoftML model expects it at scoring time.
# NOTE: the "DayofMOnth" spelling matches the column name in the processed data.
newdata <- data.frame(DayofMOnth=DayOfMonth,
DayOfWeek=as.factor(DayOfWeek),
Origin=as.factor(Origin),
Dest=as.factor(Dest),
DepTime=DepTime,
ArrDel15=as.integer(0))
rxPredict(model_optimal, newdata)$PredictedLabel
}
# publish the model as a real time web service.
publishService(name="DelayPrediction",
model=model_optimal,
code=delayPrediction,
inputs=list(DayOfMonth="integer",
DayOfWeek="character",
Origin="character",
Dest="character",
DepTime="numeric"),
outputs=list(ArrDel15="integer"),
v="0.0.1",
alias="DPModel")
# consume the web service.
listServices()
delay_pred_api <- getService(name="DelayPrediction", v="0.0.1")
# test with a row sampled from the testing data.
df_test_1 <- df_test[sample(nrow(df_test), 1), ]
# use the web service for prediction.
air_delay_prediction <- delay_pred_api$DPModel(df_test_1$DayofMOnth,
as.character(df_test_1$DayOfWeek),
as.character(df_test_1$Origin),
as.character(df_test_1$Dest),
df_test_1$DepTime)
# predicted label.
print(air_delay_prediction$outputParameters$ArrDel15)


@@ -0,0 +1,103 @@
# -----------------------------------------------------------------------------
# Deep learning demo: train the air delay prediction model with GPU acceleration.
# -----------------------------------------------------------------------------
library(AzureSMR)
library(jsonlite)
library(dplyr)
library(magrittr)
context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)
# get remote blob of the data processed previously in Spark.
js_processed <- AzureSMR::azureGetBlob(context,
blob="processed_data.json",
type="text",
storageAccount=SA_ACCOUNT,
storageKey=SA_KEY,
container=SA_CONTAINER,
resourceGroup=RG)
# take a look at the processed data.
df_processed <-
fromJSON(js_processed) %>%
mutate(DayOfWeek=as.factor(DayOfWeek),
Origin=as.factor(Origin),
Dest=as.factor(Dest),
ArrDel15=ifelse(ArrDel15 == TRUE, 1, 0)) %T>%
{head(.) %>% print()}
# -----------------------------------------------------------------------------
# Model training with the MicrosoftML neural network algorithm, with GPU acceleration
# -----------------------------------------------------------------------------
library(mrsdeploy)
library(MicrosoftML)
# split data into training and testing sets.
index <- sample(1:nrow(df_processed), round(0.7 * nrow(df_processed)))
df_train <- df_processed[index, ]
df_test <- df_processed[-index, ]
# train a neural network model.
var_names <- rxGetVarNames(df_train)
dep_name <- "ArrDel15"
ind_names <- var_names[which(var_names != dep_name)]
formu <- paste(dep_name, "~", paste(ind_names, collapse=" + "))
# without GPU acceleration.
model_nn1 <- rxNeuralNet(formu,
data=df_train,
type="binary",
acceleration="sse",
optimizer=adaDeltaSgd(),
numIterations=30)
# with GPU acceleration.
model_nn2 <- rxNeuralNet(formu,
data=df_train,
type="binary",
acceleration="gpu",
optimizer=adaDeltaSgd(),
numIterations=30,
miniBatchSize=256)
# score the models and plot ROC curves.
scores <- rxPredict(model_nn1,
data=df_test,
suffix="WithoutGPU",
extraVarsToWrite=names(df_test))
scores <- rxPredict(model_nn2,
data=scores,
suffix="WithGPU",
extraVarsToWrite=names(scores))
roc <- rxRoc(actualVarName="ArrDel15",
predVarNames=grep("Probability",
names(scores),
value=TRUE),
data=scores)
plot(roc)
auc <- rxAuc(roc)
# save the better model into an RData object.
# NOTE: the list order must match the order of the probability columns used for the
# ROC (WithoutGPU, WithGPU), so that the maximum AUC picks the right model.
models <- list(model_nn1, model_nn2)
model_optimal <- models[[which.max(auc)]]
save(model_optimal, file="model.RData")


@@ -0,0 +1,484 @@
---
title: "Elastic use of Azure DSVM for data science"
output: html_notebook
---
## Intro
It is common for a cloud-based data science project to be partitioned into stages, each with a specific data science job to be done. Such segmentation of the workflow calls for resource allocation and management that maximizes working efficiency while minimizing cost.
## Data Science Virtual Machine
### Introduction
The [Data Science Virtual Machine](http://aka.ms/dsvm) (DSVM) is a virtualized environment on the Azure cloud platform with commonly used data science software and tools pre-installed. With minimal effort, data scientists can work directly on one or many DSVMs on their data science projects or machine learning tasks.
### Data science and machine learning on a DSVM
With the pre-installed tools, many tasks can be carried out on a single DSVM or across multiple DSVMs.
#### Local mode Spark for big data analytics
Standalone-mode Spark is useful for testing Spark programs on a single machine before they are scaled out to a cluster. The DSVM provides a PySpark kernel as well as pre-installed R packages such as `RevoScaleR`, `SparkR`, and `sparklyr`.
More details can be found [here](http://aka.ms/linuxdsvmdoc).
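For illustration only (the full Spark pre-processing script is executed remotely later in this notebook), a minimal sketch of a standalone SparkR session on a DSVM is shown below; the toy aggregation on a built-in data set merely confirms that the session works, and the `master`/`appName` settings are assumptions.
```{r}
# A minimal sketch: start a local (standalone) SparkR session on the DSVM and run
# a toy aggregation on a built-in data set.
library(SparkR)

sc <- sparkR.session(master="local[*]", appName="dsvm-standalone-check")
sdf <- as.DataFrame(faithful)
head(summarize(groupBy(sdf, sdf$waiting), count=n(sdf$waiting)))
sparkR.session.stop()
```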
#### GPU-accelerated deep learning neural network
Microsoft Azure NC-series VMs are equipped with NVIDIA GPUs that support the CUDA toolkit, which allows GPU acceleration when training a deep neural network.
More details can be found [here](http://aka.ms/linuxdsvmdoc).
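As a quick illustration (not part of the demo scripts), the sketch below trains a tiny binary classifier with `rxNeuralNet` and `acceleration="gpu"` on synthetic data; it assumes `MicrosoftML` is available and that the CUDA/cuDNN setup described in the Setup section has been completed.
```{r}
# A minimal sketch: GPU-accelerated training of a tiny neural network on synthetic data.
library(MicrosoftML)

set.seed(123)
df_toy <- data.frame(y=factor(rbinom(500, 1, 0.5)),
                     x1=rnorm(500),
                     x2=rnorm(500))

model_toy <- rxNeuralNet(y ~ x1 + x2,
                         data=df_toy,
                         type="binary",
                         acceleration="gpu",   # requires CUDA Toolkit and cuDNN
                         miniBatchSize=64,
                         numIterations=10)
head(rxPredict(model_toy, data=df_toy))
```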
## Demo
### Preliminaries
The tutorial in this notebook demonstrates how to make use of a heterogeneous set of DSVMs for different tasks in a data science project: experimentation with standalone-mode Spark, GPU-accelerated deep neural network training, and model deployment via a web service. The benefit of doing this is that each provisioned DSVM suits its specific sub-task and stays alive only when it is needed.
The demonstration is completely implemented in R, with the help of R packages like
* Microsoft R Server, including `RevoScaleR`, `mrsdeploy`, and `MicrosoftML`.
* `AzureSMR` and `AzureDSVM`.
* `dplyr`, `magrittr`, etc.
A simple binary classification problem is demonstrated: building a predictive model of flight delay on the [Air Delay data set](https://packages.revolutionanalytics.com/datasets/). The trained classifier is then published as a web service.
To achieve the heterogeneity, three DSVMs with different configurations are fired up.
|DSVM name|DSVM Size|OS|Description|
|---------------|--------------------|-------------|-----------------------|
|spark|Standard D4 v2 - 8 cores and 28 GB memory|Linux|Standalone mode Spark for data preprocessing and feature engineering.|
|deeplearning|Standard NC6 - 6 cores, 56 GB memory, and Tesla K80 GPU|Windows|Trains a deep neural network model with GPU acceleration.|
|webserver|Standard D4 v2 - 8 cores and 28 GB memory|Linux|Serves as the web server on which the MRS web service is published and run.|
### Deployment
Deploying multiple DSVMs can be done by using `AzureSMR` and `AzureDSVM`.
Load libraries to use for the demo.
```{r}
library(AzureDSVM)
library(AzureSMR)
library(dplyr)
library(magrittr)
```
Import credentials.
```{r}
source("credentials.R")
```
A few global parameters to use.
```{r}
runif(2, 1, 26) %>%
round() %>%
letters[.] %>%
paste(collapse="") %T>%
{sprintf("Base name:\t\t%s", .) %>% cat("\n")} ->
BASE
BASE %>%
paste0("my_dsvm_", .,"_rg_sea") %T>%
{sprintf("Resource group:\t\t%s", .) %>% cat("\n")} ->
RG
# Choose a data centre location.
# NOTE: NC-series DSVMs are currently available only in a few data centers. Availability can be checked with the AzureDSVM::getVMSize() function. East US is used for this demo.
"eastus" %T>%
{sprintf("Data centre location:\t%s", .) %>% cat("\n")} ->
LOC
BASE %>%
paste0("spark", .) %T>%
{sprintf("Hostname (Spark):\t%s", .) %>% cat("\n")} ->
HOST1
BASE %>%
paste0("dl", .) %T>%
{sprintf("Hostname (GPU):\t\t%s", .) %>% cat("\n")} ->
HOST2
BASE %>%
paste0("server", .) %T>%
{sprintf("Hostname (server):\t%s", .) %>% cat("\n")} ->
HOST3
cat("\n")
```
Create the resource group.
```{r}
# Connect to the Azure subscription and use this as the context for
# our activities.
context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)
# Check if the resource group already exists. Take note this script
# will not remove the resource group if it pre-existed.
rg_pre_exists <- existsRG(context, RG, LOC)
# Report whether the resource group already exists.
cat("Resource group", RG, "at", LOC,
ifelse(!existsRG(context, RG, LOC), "does not exist.\n", "exists.\n"), "\n")
if (! rg_pre_exists)
{
azureCreateResourceGroup(context, RG, LOC) %>% cat("\n\n")
}
# Check that it now exists.
cat("Resource group", RG, "at", LOC,
ifelse(!existsRG(context, RG, LOC), "does not exist.\n", "exists.\n"), "\n")
```
Fire up the three DSVMs.
```{r}
# Linux based DSVM for standalone mode Spark.
deployDSVM(context,
resource.group=RG,
size="Standard_D4_v2",
location=LOC,
hostname=HOST1,
username=USER,
authen="Password",
password=PWD,
mode="Async")
# Windows based DSVM for deep learning neural network model training.
deployDSVM(context,
resource.group=RG,
os="DeepLearning",
size="Standard_NC6",
location=LOC,
hostname=HOST2,
username=USER,
password=PWD,
mode="Async")
# Linux based DSVM for MRS web server.
deployDSVM(context,
resource.group=RG,
size="Standard_D4_v2",
location=LOC,
hostname=HOST3,
username=USER,
authen="Password",
password=PWD)
```
Check status of the machines.
```{r}
azureListVM(context, RG)
operateDSVM(context, RG, HOST1, operation="Check")
operateDSVM(context, RG, HOST2, operation="Check")
operateDSVM(context, RG, HOST3, operation="Check")
```
### Setup
After the deployment, there are several setups needed for the three provisioned DSVMs before experiments can be conducted.
1. **GPU toolkit configuration** - A Windows based DSVM does not come with the CUDA Toolkit and cuDNN library installed, so both need to be installed and configured manually. Guidelines for doing this can be found in the [documentation of the `rxNeuralNet` function, under the `acceleration` argument](https://msdn.microsoft.com/en-us/microsoft-r/microsoftml/packagehelp/neuralnet).
2. **One-box configuration** - [One-box configuration](https://msdn.microsoft.com/en-us/microsoft-r/operationalize/configuration-initial) enables remote execution and web service API calls on a DSVM that is used as an R server (a minimal verification sketch is shown after this list).
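As a sanity check after the one-box configuration, a remote login against the operationalization endpoint (port 12800 by default) can be attempted, as sketched below; it assumes `PWD` comes from `credentials.R`, that the default `admin` account is used, and that the target DSVM (here the Spark one) is running.
```{r}
# A minimal sanity check (sketch): verify that the one-box configuration accepts
# remote logins on the default operationalization port 12800.
library(mrsdeploy)

check_end_point <- paste(HOST1, LOC, "cloudapp.azure.com", sep=".")
remoteLogin(paste0("http://", check_end_point, ":12800"),
            session=FALSE,
            username="admin",
            password=PWD)
remoteLogout()
```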
### Experiment
Once the preliminary setup is finished, the demo scripts can be executed in remote sessions on the DSVMs.
First, specify the end points of the remote DSVMs.
```{r}
end_point_1 <- paste(HOST1, LOC, "cloudapp.azure.com", sep=".")
end_point_2 <- paste(HOST2, LOC, "cloudapp.azure.com", sep=".")
end_point_3 <- paste(HOST3, LOC, "cloudapp.azure.com", sep=".")
```
Assuming all three DSVMs were previously deallocated, start the one used for data pre-processing on Spark and run the analytics on it.
```{r}
# check the available DSVMs in the resource group.
azureListVM(context, RG, LOC)
```
The data used in this demo is small, so it is kept in an Azure storage account as a blob. The storage account needs at least one container, which can be created in the Azure portal or with `AzureSMR`.
```{r}
SA_ACCOUNT <- paste0(HOST3, "sa")
SA_CONTAINER <- "demodata"
SA_KEY <- AzureSMR::azureSAGetKey(context,
storageAccount=SA_ACCOUNT,
resourceGroup=RG)
# create a container.
AzureSMR::azureCreateStorageContainer(context,
container=SA_CONTAINER,
storageAccount=SA_ACCOUNT,
storageKey=SA_KEY)
```
Save the current image for reference in the remote session.
```{r}
save.image(file="../data/image.RData")
```
#### Task 1 - Data processing and feature engineering.
Data pre-processing is done on the DSVM with standalone mode Spark, so start the DSVM.
```{r}
operateDSVM(context, RG, HOST1, operation="Start")
```
Log in to the DSVM remotely.
```{r}
remoteLogin(paste0("http://", end_point_1, ":12800"),
session=TRUE,
diff=FALSE,
username="admin",
password=PWD)
```
Pause the remote session, and upload the image from the local session.
```{r}
# REMOTE> pause()
putLocalFile(filename="../data/image.RData")
# resume the remote session and load the image.
resume()
```
In the remote session, load the image, which contains the needed objects.
```{r}
# REMOTE> load("image.RData")
```
Switch back to the local R session and execute the demo script remotely.
```{r}
# REMOTE> pause()
# remote execution of a script located on the local machine.
results <- remoteScript("../codes/sparkDemo/demo.R")
results
remoteLogout()
```
After the results are returned with no error, stop the DSVM to avoid unnecessary cost.
```{r}
operateDSVM(context, RG, HOST1, operation="Stop")
```
#### Task 2 - Deep neural network model training.
Start the DSVM for deep neural network model training.
```{r}
operateDSVM(context, RG, HOST2, operation="Start")
# authenticate with the remote DSVM.
remoteLogin(paste0("http://", end_point_2, ":12800"),
session=TRUE,
diff=FALSE,
username="admin",
password=PWD)
# REMOTE> pause()
putLocalFile(filename="../data/image.RData")
# resume the remote session and load the image.
resume()
# REMOTE> load("image.RData")
# REMOTE> pause()
# remote execution of a script located on the local machine.
results <- remoteScript("../codes/deepLearningDemo/demo.R",
displayPlots=TRUE,
writePlots=TRUE)
```
Get the result from the remote session.
```{r}
# copy model_optimal from the remote session into the local workspace.
getRemoteObject("model_optimal")
```
Log out and stop the machine.
```{r}
remoteLogout()
operateDSVM(context, RG, HOST2, operation="Stop")
```
#### Task 3 - Publish the trained model as a web service.
```{r}
operateDSVM(context, RG, HOST3, operation="Start")
# NOTE: a remote session is enabled here so that the image can be uploaded and loaded; we then authenticate again in order to publish the service.
remoteLogin(paste0("http://", end_point_3, ":12800"),
session=TRUE,
diff=FALSE,
username="admin",
password=PWD)
# REMOTE> pause()
putLocalFile(filename="../data/image.RData")
# resume the remote session and load the image.
resume()
# REMOTE> load("image.RData")
# authenticate again from the remote session, this time with session=FALSE.
# REMOTE> mrsdeploy::remoteLogin(paste0("http://", end_point_3, ":12800"),
# session=FALSE,
# username="admin",
# password=PWD)
# REMOTE> pause()
```
Wrap the model into a function which is then published as a service.
```{r}
delayPrediction <- function(DayOfMonth,
DayOfWeek,
Origin,
Dest,
DepTime) {
# NOTE: an ArrDel15 column is included because the MicrosoftML model expects it at scoring time.
# NOTE: the "DayofMOnth" spelling matches the column name in the processed data.
newdata <- data.frame(DayofMOnth=DayOfMonth,
DayOfWeek=as.factor(DayOfWeek),
Origin=as.factor(Origin),
Dest=as.factor(Dest),
DepTime=DepTime,
ArrDel15=as.integer(0))
rxPredict(model_optimal, newdata)$PredictedLabel
}
```
Publish the model as a real-time web service.
```{r}
publishService(name="DelayPrediction",
model=model_optimal,
code=delayPrediction,
inputs=list(DayOfMonth="integer",
DayOfWeek="character",
Origin="character",
Dest="character",
DepTime="numeric"),
outputs=list(ArrDel15="integer"),
v="0.0.1",
alias="DPModel")
remoteLogout()
```
Note that the server DSVM should be kept running, as it will be visited later for consuming the published service.
#### Task 4 - Consume the published web service.
The published service can be consumed from anywhere; here it is consumed from the local machine.
We need to log in again to consume the service.
```{r}
remoteLogin(paste0("http://", end_point_3, ":12800"),
session=FALSE,
username="admin",
password=PWD)
```
List the available web services.
```{r}
listServices()
```
```{r}
delay_pred_api <- getService(name="DelayPrediction", v="0.0.1")
# test with a randomly generated data point.
df_test <- data.frame(
DayOfMonth=14L,
DayOfWeek="Wed",
Origin="COS",
Dest="SLC",
DepTime=14.1500,
stringsAsFactors=FALSE
)
# use the web service for prediction.
air_delay_prediction <- delay_pred_api$DPModel(df_test$DayOfMonth,
df_test$DayOfWeek,
df_test$Origin,
df_test$Dest,
df_test$DepTime)
# predicted label.
print(air_delay_prediction$outputParameters$ArrDel15)
```
Log out and stop the DSVM.
```{r}
remoteLogout()
operateDSVM(context, RG, HOST3, operation="Stop")
```
The call to the service API returns the prediction results using the pre-trained model.
The service also supports Swagger, which generates a JSON-format description for REST API calls.
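For reference, while still logged in (i.e., before the `remoteLogout()` above), the Swagger document of the published service can be retrieved through the client returned by `getService()`; the sketch below assumes the `delay_pred_api` object from the consumption step and that the service client exposes a `swagger()` method as in current `mrsdeploy` releases.
```{r}
# A sketch: fetch the Swagger (JSON) description of the published service and save
# it to a file, e.g. for generating clients in other languages.
swagger_json <- delay_pred_api$swagger()
cat(swagger_json, file="delay_prediction_swagger.json")
```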
#### Clean-up
Once the project is finished, delete the resource group to avoid any additional cost.
```{r}
if (! rg_pre_exists)
azureDeleteResourceGroup(context, RG)
```
## Cost
Cost efficiency is one of the greatest advantages of elastic computing on the cloud. The following code retrieves the cost information of the three DSVMs.
```{r}
dsvms <- c(HOST1, HOST2, HOST3)
cost_dsvms <- sapply(dsvms,
AzureDSVM::expenseCalculator,
context,
time.start="<starting_time_stamp>",
time.end="<end_time_stamp>",
granularity="Daily",
currency="USD",
locale="en-US",
offerId="<offer_id_of_the_subscription>",
region=LOC)
```
NOTE: it usually takes a while (30 minutes to 1 hour, depending largely on the data center location) for consumption to be recorded in the billing system, so the function may not return results immediately after the DSVMs have been used.


@@ -0,0 +1,124 @@
# ----------------------------------------------------------------------------
# Data wrangling in Spark standalone mode.
# ----------------------------------------------------------------------------
# libraries to use.
library(SparkR)
library(RevoScaleR)
library(dplyr)
library(magrittr)
library(ggplot2)
library(readr)
# data sources.
data_url <- "https://zhledata.blob.core.windows.net/mldata/air_1per.xdf"
# data preparation.
# data_dir <- file.path(getwd(), "demo/data")
data_dir <- getwd()
# system("hadoop fs -ls /home/zhle/demo/data") # necessary for standalone mode?
data_xdf_path <- file.path(data_dir, "air_data.xdf")
data_csv_path <- file.path(data_dir, "air_data.csv")
data_parquet_path <- file.path(data_dir, "air_data_parquet")
download.file(data_url, destfile=data_xdf_path)
rxDataStep(inData=data_xdf_path,
outFile=data_csv_path,
overwrite=TRUE)
# initialize spark session.
sc <- sparkR.session(
sparkPackages="com.databricks:spark-csv_2.10:1.3.0"
)
setLogLevel("OFF")
# data preparation - convert csv to parquet.
sdf_air <- read.df(data_csv_path,
source="com.databricks.spark.csv",
header="true",
inferSchema="true")
SparkR::cache(sdf_air) # cache into memory.
SparkR::count(sdf_air) # count number of rows.
write.df(df=sdf_air,
path=data_parquet_path,
"parquet",
"overwrite")
sdf_air <- read.df(data_parquet_path, source="parquet") # read the data back from parquet format.
printSchema(sdf_air)
SparkR::cache(sdf_air)
SparkR::count(sdf_air)
head(sdf_air)
# create a SQL context for manipulating the Spark data frames.
createOrReplaceTempView(sdf_air, "air")
stable_air <- SparkR::sql("SELECT a.DayofMOnth, a.DayOfWeek, a.Origin, a.Dest, a.DepTime, a.ArrDel15 FROM air a")
createOrReplaceTempView(stable_air, "stable_air")
SparkR::cache(stable_air)
SparkR::count(stable_air)
head(SparkR::sql("show tables"))
# sample 30% data from all.
sdf_air_sampled <- SparkR::sample(stable_air,
withReplacement=FALSE,
fraction=0.3,
seed=123)
# convert the data into local R data frame for visualization.
df_air_sampled <- SparkR::as.data.frame(sdf_air_sampled)
glimpse(df_air_sampled)
df_air_sampled %<>%
filter(as.character(ArrDel15) != "NA") %>%
mutate(DepTime=as.numeric(DepTime), ArrDel15=as.factor(ArrDel15)) %>%
na.omit()
ggplot(data=df_air_sampled, aes(x=DayOfWeek, y=DepTime)) +
geom_boxplot()
# ----------------------------------------------------------------------------
# Put the processed data onto storage account.
# ----------------------------------------------------------------------------
# load AzureSMR library.
library(AzureSMR)
library(jsonlite)
context <- createAzureContext(tenantID=TID, clientID=CID, authKey=KEY)
# convert data frame to json.
js_air_sampled <- toJSON(df_air_sampled)
write(js_air_sampled, file=file.path(data_dir, "processed_data.json"))
# create a blob to preserve data.
AzureSMR::azurePutBlob(context,
blob="processed_data.json",
contents=js_air_sampled,
storageAccount=SA_ACCOUNT,
storageKey=SA_KEY,
container=SA_CONTAINER,
resourceGroup=RG)


@@ -0,0 +1,12 @@
# List of data sets
| Data Set Name | Link to the Full Data Set | Full Data Set Size (MB) | Link to Report |
| ---:| ---: | ---: | ---: |
| Data Set 1 | [link](link/to/feature/set1) | 2,000 | [Data Set 1 Report](link/to/report1)|
| Data Set 2 | [link](link/to/feature/set2) | 300 | [Data Set 2 Report](link/to/report2)|
*If the link to the full dataset does not apply, provide some information on how to access the full dataset.*
# Description of data sets
* Data Set 1 *Description of data set 1.*
* Data Set 2 *Description of data set 2.*


@@ -0,0 +1,5 @@
# Documents
*This folder contains documents such as blogs, installation instructions, etc. It is also the default directory where the reports generated from R Markdown are placed.*
*Give an introduction to the documents placed in this directory.*