Brandon Rohrer 2016-03-15 16:33:24 -04:00
Parent 226842a54a 5e84bf09e4
Commit 25b2a34bf0
14 changed files with 559 additions and 481 deletions

View file

@@ -1,27 +1,30 @@
# Regression: Demand estimation with Microsoft R Server
# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example demonstrates the feature engineering process for building a regression model to predict
# bike rental demand.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday, etc.
#
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour. The lag features we add to the data set are the number of bikes that
# were rented in each of the previous 12 hours, which captures the very recent demand for the bikes.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than one minute.
#
#--------------------------------------------------------------------------------------------------------------
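# For intuition, here is a minimal base-R sketch of the same lag construction on
# a small in-memory data frame (toy values, illustrative only); the RevoScaleR
# code below does this chunk-wise so it scales to data that does not fit in memory.
sketch_df <- data.frame(cnt = c(16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84))
for (k in 1:12) {
  sketch_df[[paste0("cnt_", k, "hour")]] <- c(rep(sketch_df$cnt[1], k),
                                              head(sketch_df$cnt, -k))
}
head(sketch_df)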
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@@ -35,8 +38,8 @@ if (!require("RevoScaleR")) {
quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
# Initialize some variables.
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
# Create a temporary directory to store the intermediate .xdf files.
@@ -44,115 +47,89 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
#---------------------------Step 1: Import the Bike Data---------------------------
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
# Remove timestamps and all columns that are
# part of the label.
# Remove timestamps and all columns that are part of the label.
varsToDrop = c("instant", "dteday", "casual", "registered"),
# Define year, weather conditions and season columns
# as categorical.
# Define year, weather conditions and season columns as categorical.
colInfo = list(yr = list(type = "factor"),
weathersit = list(type = "factor"),
season = list(type = "factor")))
# Step 2: Feature Engineering
# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
# lag feature names as lagN
varLagNameVector <- paste("lag", nLagsVector, sep="")
#---------------------------Step 2: Feature Engineering---------------------------
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) { # function for computing lag features.
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
numLags <- length(nLagsVector) # total number of lags that need to be added
varLagNameVector <- paste("cnt_", nLagsVector, "hour", sep="") # lag feature names as lagN
if (!.rxIsTestChunk)
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData", lagData)
}
if (!.rxIsTestChunk)
{
for (iL in 1:numLags)
{
for (iL in 1:numLags) {
# Number of rows in the current chunk.
numRowsInChunk <- length(dataList[[varName]])
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep = "")
# Retrieve lag data from the previous chunk.
lagData <- .rxGet("storeLagData")
# Concatenate lagData and the "cnt" feature.
allData <- c(lagData[[varLagName]], dataList[[varName]])
# Take the first N rows of allData, where N is the total
# number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save last nlag rows as the new lagData to be used
# to process in the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
nlags <- nLagsVector[iL]
varLagName <- paste("cnt_", nlags, "hour", sep = "")
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
# Take the first N rows of allData, where N is the number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save the last nlags rows as the new lagData, used to process the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
return(dataList)
}
return(dataList)
}
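# To make the chunk handoff concrete: with lag 1, if one chunk ends with
# cnt = (..., 10) and the next chunk begins with cnt = (12, ...), the first
# lag-1 value in the next chunk is 10, carried across chunks via the
# "storeLagData" object kept in the transform environment.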
# Apply the "computeLagFeatures" function to the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt",
nLagsVector = seq(12)),
transformVars = "cnt", overwrite=TRUE)
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
transformVars = "cnt", overwrite = TRUE)
# Step 3: Prepare Training and Test Datasets
# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
splitByFactor = "yr", overwrite = TRUE,
reportProgress = 0, verbose = 0)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr",
        overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
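# For reference, the equivalent split on an in-memory data frame would be a
# simple subset (a sketch; `df` is a hypothetical stand-in for the lag data,
# with yr coded 0 for 2011 and 1 for 2012):
#   train_df <- df[df$yr == "0", ]
#   test_df  <- df[df$yr == "1", ]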
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
seed = 123)
dForest <- rxDForest(modelFormula, data = train,
method = "anova", maxDepth = 10, nTree = 50,
importance = TRUE, seed = 123)
# Plot a dotchart of the variable importance as measured by the decision forest.
rxVarImpPlot(dForest, main = "Variable Importance of Decision Forest")
# Step 5: Predict over new data and review the model performance
# Plot the out-of-bag error rate against the number of trees built in the decision forest model.
plot(dForest, main = "OOB Error Rate vs Number of Trees")
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
# Predict the bike rental demand on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
computeResiduals = TRUE)
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict, summaryStats = "Mean",
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
cnt_Resid_2 = cnt_Resid^2,
cnt_rel = abs(cnt_Resid)/cnt)
)$sDataFrame
# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2],
RMSE = sqrt(sum[2, 2]),
RAE = sum[3, 2])
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
# Review the measures.
measures
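# The same measures computed from plain vectors (an illustrative sketch with
# toy values, mirroring the transforms above):
actual <- c(100, 250, 40) # hypothetical observed counts
resid <- c(-12, 30, 5)    # hypothetical residuals (actual - predicted)
mean(abs(resid))          # MAE
sqrt(mean(resid^2))       # RMSE
mean(abs(resid) / actual) # RAE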

View file

@@ -1,67 +1,64 @@
# Regression: Demand estimation with Microsoft R Server
# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example is a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.
# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday, etc.
#
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour.
#
# We built four models using the same algorithm, but with four different training datasets. The four training
# datasets that we constructed were all based on the same raw input data, but we added different additional
# features to each training set.
#
# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days
# at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
#
# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.
# The four training datasets were built by combining the feature set
# as follows:
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
#
# The four training datasets were built by combining the feature set as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than two minutes.
#
#--------------------------------------------------------------------------------------------------------------
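# For intuition: the day and week lags below generalize the hourly lag by
# stepping back in larger row intervals, 24 rows for a day and 168 for a week
# (a runnable base-R sketch on an illustrative toy series):
sketch_cnt <- rep(1:24, times = 14) # two weeks of toy hourly counts
cnt_1day <- c(rep(sketch_cnt[1], 24), head(sketch_cnt, -24))
cnt_1week <- c(rep(sketch_cnt[1], 168), head(sketch_cnt, -168))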
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
# Create a temporary directory to store the intermediate .xdf files.
@@ -70,36 +67,25 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")
# Step 1: Import Data
#---------------------------Step 1: Import Data---------------------------
# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
varsToDrop = c("instant",
"dteday",
"casual",
"registered"))
varsToDrop = c("instant", "dteday", "casual", "registered"))
# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"),
overwrite = TRUE)
# Step 2: Feature Engineering
# Edit Metadata: Define year, weather conditions and season columns as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
#---------------------------Step 2: Feature Engineering---------------------------
# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function (dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
computeLagFeatures <- function(dataList) {
numLags <- length(nLagsVector) # total number of lags that need to be added
for (iL in 1:numLags) {
nlag <- nLagsVector[iL]
varLagName <- paste("demand.", nlag, unit, sep = "")
varLagName <- paste("cnt_", nlag, unit, sep = "")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
@@ -107,89 +93,67 @@ computeLagFeatures <- function (dataList) {
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
}
# Determine the current row to start processing the data
# between chunks.
startRow <- .rxStartRow - numRowsToRead
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow,
numRows = numRowsToRead,
returnDataFrame = FALSE)
varsToKeep = baseVar,
startRow = startRow, numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = 1,
numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
previousRowsDataList[[baseVar]],
dataList[[baseVar]])[1:numRowsInChunk]
varsToKeep = baseVar,
startRow = 1, numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
                            previousRowsDataList[[baseVar]],
                            dataList[[baseVar]])[1:numRowsInChunk]
}
return(dataList)
}
# Create a function to add lag features a set of columns at a time.
addLag <- function(inputData, outputFileBase) {
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
sep="")
outputFileHour <- paste(outputFileBase, "_hour", ".xdf", sep = "")
outputFileHourDay <- paste(outputFileBase, "_hour_day", ".xdf", sep = "")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf", sep = "")
# Initialize some fixed values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Add number of bikes that were rented in each of the previous 12 hours
# (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",
nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days
# at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "day",
nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"),
overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,
outFile = outputFileHourDayWeek,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "week",
nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour", nLagsVector = seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite = TRUE)
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "day", nLagsVector = seq(12),
interval = dayInterval),
transformVars = c("cnt"), overwrite = TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay, outFile = outputFileHourDayWeek, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "week", nLagsVector = seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite = TRUE)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
# Set A = weather + holiday + weekday + weekend features for the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs,
outputFileBase = outFileLag)
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
# Step 3: Prepare Training and Test Datasets
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@@ -202,62 +166,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs,
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
newDayFeatures <- paste("cnt_", seq(12), "day", sep = "")
newWeekFeatures <- paste("cnt_", seq(12), "week", sep = "")
## Set A:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set B:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr",
newDayFeatures,
newWeekFeatures))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, as well as the lag features for Sets C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set C:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, as well as the lag features for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set D:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
# Plot four dotcharts of variable importance as measured by the four decision forest models.
par(mfrow = c(2, 2))
rxVarImpPlot(dForestA_mrs, main = "Variable Importance of Set A")
rxVarImpPlot(dForestB_mrs, main = "Variable Importance of Set B")
rxVarImpPlot(dForestC_mrs, main = "Variable Importance of Set C")
rxVarImpPlot(dForestD_mrs, main = "Variable Importance of Set D")
# Step 5: Predict over new data
# Plot the out-of-bag error rate against the number of trees built in each decision forest model.
plot(dForestA_mrs, main = "OOB Error Rate vs Number of Trees: Set A")
plot(dForestB_mrs, main = "OOB Error Rate vs Number of Trees: Set B")
plot(dForestC_mrs, main = "OOB Error Rate vs Number of Trees: Set C")
plot(dForestD_mrs, main = "OOB Error Rate vs Number of Trees: Set D")
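# A small assumed addition (not in the original script): reset the plotting
# layout so any later plot is not drawn into the 2 x 2 grid configured above.
par(mfrow = c(1, 1))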
#---------------------------Step 5: Predict over new data---------------------------
## Set A:
# Predict the bike rental demand on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@@ -286,22 +250,17 @@ rxPredict(dForestD_mrs, data = test_mrs,
residVarNames = "cnt_Resid_D",
overwrite = TRUE, computeResiduals = TRUE)
# Prepare outputs
#---------------------------Prepare outputs---------------------------
## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
data = testA_mrs, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
cnt_Resid_A_2 = cnt_Resid_A^2,
cnt_rel_A = abs(cnt_Resid_A)/cnt)
)$sDataFrame
## Set B, C & D:
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
sum <- rxSummary(~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
                   cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
                   cnt_Resid_D_abs + cnt_Resid_D_2 + cnt_rel_D,
data = test_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
cnt_Resid_B_2 = cnt_Resid_B^2,
@@ -312,7 +271,7 @@ sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_D_abs = abs(cnt_Resid_D),
cnt_Resid_D_2 = cnt_Resid_D^2,
cnt_rel_D = abs(cnt_Resid_D)/cnt)
)$sDataFrame
)$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@@ -323,10 +282,7 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
RMSE = c(sqrt(sumA[2, 2]),
sqrt(sum[2, 2]),
sqrt(sum[5, 2]),
sqrt(sum[8, 2])),
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
# Review the measures.

View file

@@ -7,7 +7,7 @@
library("RCurl", quietly = TRUE)
# A URL contains the raw data.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
# Download data from the URL.

View file

@@ -17,7 +17,7 @@ if (!require("RevoScaleR")) {
}
# A URL contains the raw data.
github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
# Create a temporary directory to store the intermediate .xdf files.

View file

@@ -16,6 +16,9 @@
# if the flight was on time.
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than one minute.
#
#------------------------------------------------------------------------------------------------------------------------------------
#---------------------------Step 0: Get Started-------------------------------
@@ -36,7 +39,7 @@ if (!require("RevoScaleR")) {
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -23,7 +23,7 @@
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
# This execution might require more than one minute.
#
#------------------------------------------------------------------------------------------------------------------------------------
@@ -46,7 +46,7 @@ if (!require("RevoScaleR")) {
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -23,14 +23,14 @@
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
#
# The following scripts include five basic steps of building this example using open source R.
#
# This execution might require more than one minute.
#
#-------------------------------------------------------------------------------------------------------------------------
#---------------------------Step 0: Get Started---------------------------
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -4,8 +4,8 @@
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# NOTE: In order to run this script you'll need to have an Azure ML
# workspace, with its workspace ID and key.
# NOTE: In order to run the last part of the script you'll need to have
# an Azure ML workspace, with its workspace ID and key.
# For details about Azure ML, go to http://studio.azureml.net/
# ----------------------------------------------------------------------------
# Enter your Azure ML Studio workspace info here before continuing.
@@ -23,8 +23,13 @@ library("MASS") # to use the Boston dataset
# ----------------------------------------------------------------------------
# fit a model and check model performance
# ----------------------------------------------------------------------------
# check the data
head(Boston)
ggplot(Boston, aes(x = medv)) +
geom_histogram(binwidth = 5) +
ggtitle("Histogram of Response Variable")
# fit a model using all variables except medv as predictors
# fit a model using medv as response and others as predictors
lm1 <- lm(medv ~ ., data = Boston)
# check model performance
@@ -50,52 +55,63 @@ print(paste("Relative Squared Error: ",
# ----------------------------------------------------------------------------
# publish and consume a web service
# ----------------------------------------------------------------------------
# workspace information
ws <- workspace(
id = ws_id,
auth = auth_token)
# the following works only with valid AzureML workspace information
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
}
# get workspace information
AML <- 0
tryCatch(
{
ws <- workspace(id = ws_id, auth = auth_token)
AML <- 1
},
error = function(cond){
message("Azure ML workspace information was not valid.")
}
)
# test the prediction function
newdata <- Boston[1, 1:13]
print(mypredict(newdata))
# Publish the service
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
# consume web service - 2nd approach
# retrieve web service information for later use
service_id <- ep$WebServiceId
# define endpoint
ep_price_pred <- endpoints(ws, service_id)
# consume
consume(ep_price_pred, newdata)
# ----------------------------------------------------------------------------
# update a web service and then consume
# ----------------------------------------------------------------------------
# define function for testing purpose
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)
if (AML) {
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
}
# test the prediction function
newdata <- Boston[1, 1:13]
print(mypredict(newdata))
# Publish the service
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
# consume web service - 2nd approach
# retrieve web service information for later use
service_id <- ep$WebServiceId
# define endpoint
ep_price_pred <- endpoints(ws, service_id)
# consume
consume(ep_price_pred, newdata)
# define function for testing purpose
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)
} else {
message("Azure ML webservice is not deployed because ",
"the workspace information is invalid")
}
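# Optional cleanup (an assumption, not in the original script: the AzureML
# package provides deleteWebService()); remove the demo service when finished.
# if (AML) deleteWebService(ws, "HousePricePrediction")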

View file

@@ -0,0 +1,60 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the commonalities and differences among functions
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load libraries
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm vs rxGlm
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# check the response variable
ggplot(mtcars, aes(x = vs)) +
geom_histogram(binwidth = 0.3) +
ggtitle("Distribution of Response Values")
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm() if RevoScaleR is installed
if (RRE){
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
} else {
print("rxGlm was not run becauase the RevoScaleR package is not available")
}
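# A quick sanity check on the conventional fit (base R): predicted
# probabilities from the logistic model for the first rows of mtcars.
head(predict(logistic1, type = "response"))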

View file

@@ -10,9 +10,11 @@
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -31,27 +33,10 @@ library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm (R, MRO, MRS) vs rxGlm (MRS)
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# check the data
head(mtcars)
# fit the same model with rxGlm(), this can be run on MRS only
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
# ----------------------------------------------------------------------------
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
# simulate data, it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(112)
set.seed(121)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -83,42 +68,18 @@ ggplot(group_all, aes(x = V1, y = V2)) +
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# kmeans
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# cluster analysis with rxKmeans(), it works on MRS only
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
# ----------------------------------------------------------------------------
# the code below should be run on MRS due to the use of "rx" commands
# save a dataset in XDF format
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
kmeans.cluster = factor(fit.kmeans$cluster))
# get cluster means
clustermeans.kmeans <- fit.kmeans$centers
clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
@@ -129,11 +90,47 @@ ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans() if RevoScaleR is installed
if (RRE){
# save a dataset in XDF format
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
message("\nComparing cluster assignments between kmeans and rxKmeans:")
print(with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster)))
# get cluster means
clustermeans.rxKmeans <- clust$centers # use the rxKmeans fit, not the kmeans fit
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
} else {
print("rxKmeans was not run because the RevoScaleR package is not available")
}

View file

@@ -10,9 +10,11 @@
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -23,12 +25,12 @@ if (!require("RevoScaleR"))
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
@@ -50,6 +52,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
# modify the value for nsamples to test out the capacity limit for kmeans()
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
# but rxKmeans() worked
message("If the sample size is large, it will take some time to finish. \n",
"You can use a smaller value for nsamples, say 1000, \n",
"to test the program.")
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
@@ -57,16 +62,26 @@ group_all <- rbind(group_a, group_b)
nclusters <- 2
# plot sample data
plot_data <- group_all[sample(nrow(group_all), 1000),]
ggplot(plot_data, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# save data
mydata = group_all[, 1:2]
dataCSV = tempfile(fileext = ".csv")
dataXDF = tempfile(fileext = ".xdf")
write.csv(group_all, dataCSV, row.names = FALSE)
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(); it can fail when the data is large enough
# ----------------------------------------------------------------------------
# this can be run on R, MRO, or MRS
system_time_R <-
system.time(
{
@@ -76,9 +91,12 @@ system_time_R <-
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# cluster analysis with rxKmeans() on MRS, it works even if kmeans() does not
# ----------------------------------------------------------------------------
system_time_MRS <-
# cluster analysis with rxKmeans() if RevoScaleR is installed
if (RRE){
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
@@ -88,3 +106,6 @@ system_time_MRS <-
outColName = "cluster",
overwrite = TRUE)
})
} else {
print("rxKmeans was not run becauase the RevoScaleR package is not available")
}

View file

@@ -0,0 +1,53 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
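# crossprod(A) computes t(A) %*% A in a single optimized BLAS call; this is
# exactly the kind of operation where the multithreaded MKL shipped with
# MRO/MRS speeds things up over stock R.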
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))

View file

@@ -0,0 +1,44 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed similarities for kmeans across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# load libraries
# ----------------------------------------------------------------------------
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# run the following analysis that does not involve matrix manipulation
# to observe that the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
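# Report the elapsed (wall-clock) time: element 3 of the proc_time object
# returned by system.time() holds elapsed seconds.
print(paste("kmeans() elapsed seconds:", round(system_time_r[[3]], 2)))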

View file

@@ -1,6 +1,6 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# purpose: to compare the speed of kmeans() with that of rxKmeans()
# on Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
@@ -8,11 +8,13 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -31,55 +33,16 @@ library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# compare the speed of kmeans with that of rxKmeans on MRS
# for different data sizes
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix operations to show that
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -91,33 +54,6 @@ simulCluster <- function(nsamples, mean, dimension, group)
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# ----------------------------------------------------------------------------
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
for (nsamples in nsamples_list)
{
# simulate data and append
@@ -125,25 +61,29 @@ for (nsamples in nsamples_list)
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
mydata = group_all[, 1:2]
nclusters <- 2
# kmeans with R
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
iter.max = 1000,
algorithm = "Lloyd"))
# kmeans with MRS
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
if (RRE) {
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
                                                  numClusters = nclusters,
                                                  algorithm = "lloyd"))
} else {
system_time_rre <- c(NA, NA, NA)
}
# combine
newrow <- data.frame(nsamples = nsamples,
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
myresult <- rbind(myresult, newrow)
}
myresult$nsamples <- 2 * myresult$nsamples
@@ -152,7 +92,8 @@ mydata$nsamples_log <- log10(mydata$nsamples)
mydata
# generate plot
ggplot(data = mydata, aes(x = nsamples_log)) +
if (RRE){
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
@@ -161,4 +102,14 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
} else {
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("Time for kmeans. To add time for rxKmean, use the RRE engine")
}