Merge branch 'master' of github.com:brohrer-ms/RTVS-docs

Brandon Rohrer 2016-03-15 10:53:11 -04:00
Parent 241d6f422a 05b5a4b4ba
Commit ac4919044e
9 changed files with 156 additions and 603 deletions

View file

@@ -7,64 +7,32 @@
# load packages
# ----------------------------------------------------------------------------
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # use the mvrnorm function
(if (!require("caret")) install.packages("caret"))
library("caret") # use the train function for selecting hyper parameters
library("MASS") # to use the Boston dataset
(if (!require("gbm")) install.packages("gbm"))
library("gbm") # Gradient Boosting Machine package
# ----------------------------------------------------------------------------
# select hyperparameters
# fit the model and draw some plots
# ----------------------------------------------------------------------------
# ensure results are repeatable
set.seed(123)
# prepare training scheme
control <- trainControl(method = "cv", number = 5)
# design the parameter tuning grid
grid <- expand.grid(n.trees = c(5000, 10000, 15000),
interaction.depth = c(2, 4, 8),
n.minobsinnode = c(1, 2, 4),
shrinkage = c(0.001, 0.01, 0.1))
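# a quick check on the size of the tuning grid defined above:
# 3 x 3 x 3 x 3 = 81 candidate combinations, each refit in every fold of the
# 5-fold cross-validation, so the full grid is expensive to run
print(nrow(grid))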
# design the parameter tuning grid - a smaller grid for testing purposes
# grid <- expand.grid(n.trees = c(5000, 10000),
# interaction.depth = c(2, 4),
# n.minobsinnode = c(1, 2),
# shrinkage = c(0.001, 0.01))
# tune the parameters
gbm1 <- train(medv ~ ., data = Boston, method = "gbm",
distribution = "gaussian", trControl = control,
verbose = FALSE, tuneGrid = grid, metric = "RMSE")
# summarize the model
print(gbm1)
# plot cross-validation results
plot(gbm1)
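# a minimal sketch of reading the selected combination off the fitted caret
# object; bestTune is the slot where train() records the winning grid row
print(gbm1$bestTune)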
# ----------------------------------------------------------------------------
# fit the model with the estimated hyperparameters and draw some plots
# ----------------------------------------------------------------------------
gbm2 <- gbm(medv ~ .,
distribution = "gaussian",
n.trees = 5000,
interaction.depth = 4,
n.minobsinnode = 1,
shrinkage = 0.01,
cv.folds = 5,
data = Boston)
# the four hyperparameters used here - n.trees, interaction.depth,
# n.minobsinnode, and shrinkage - were selected by cross-validation
fit_gbm <- gbm(medv ~ .,
distribution = "gaussian",
n.trees = 5000,
interaction.depth = 4,
n.minobsinnode = 1,
shrinkage = 0.01,
cv.folds = 5,
data = Boston)
# print the model
print(gbm2)
print(fit_gbm)
# check performance using 5-fold cross-validation
best.iter <- gbm.perf(gbm2, method = "cv")
best.iter <- gbm.perf(fit_gbm, method = "cv")
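# a minimal sketch of scoring the refit model at the CV-selected tree count
# (pred_gbm and rmse_gbm are illustrative names); predict.gbm takes the number
# of trees through n.trees
pred_gbm <- predict(fit_gbm, newdata = Boston, n.trees = best.iter)
rmse_gbm <- sqrt(mean((pred_gbm - Boston$medv) ^ 2))
print(rmse_gbm)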
# check variable importance
f_imp <- summary(gbm2, n.trees = best.iter, plot = FALSE)
f_imp <- summary(fit_gbm, n.trees = best.iter, plot = FALSE)
# use a custom plot to show variable importance
barplot(f_imp$rel.inf, names.arg = f_imp$var, xlab = "Feature",

View file

@@ -12,7 +12,7 @@
(if (!require("glmnet")) install.packages("glmnet"))
library("glmnet") # use this package to fit a glmnet model
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # use the Boston dataset
library("MASS") # to use the Boston dataset
# ----------------------------------------------------------------------------
# identify the optimal value of lambda
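# a minimal sketch of this step (train_X, train_y, and model1 are defined in
# the surrounding script; the call assumes the lasso/gaussian settings used
# elsewhere in this example): cv.glmnet() cross-validates a path of lambda
# values and stores the minimizer in lambda.min
model1 <- cv.glmnet(x = train_X, y = train_y, alpha = 1, family = "gaussian")
plot(model1) # cross-validation curve over log(lambda)
print(model1$lambda.min)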
@@ -45,8 +45,8 @@ print(coef(model1, s = "lambda.min"))
model2 <- glmnet(x = train_X, y = train_y, alpha = 1, family = "gaussian")
# identify variable names
vid <- as.character(seq(1,13))
vn = colnames(train_X)
vid <- as.character(seq(1,length(vn)))
# check and exclude the variables with coefficient value 0
vnat = coef(model2)
@@ -65,15 +65,13 @@ mycl <- seq(1,nvars)
plot(model2, xvar = "lambda", label = TRUE, col = mycl, xlim = c(-5.5, 2))
legend(-0.5,-2, legend_desc, lty = mylty, col = mycl, cex = 0.8)
# check coefficients from using glmnet() to compare with
# those from cv.glmnet(): the same
coef(model2, s = model1$lambda.min)
# ----------------------------------------------------------------------------
# make predictions
# ----------------------------------------------------------------------------
# make predictions with model 1
# data
x_new <- data.matrix(train_X[1:2, - response_column])
# make predictions with model 1
predictions_train <- predict(model1, newx = x_new, s = "lambda.min")
print(predictions_train)

View file

@@ -34,9 +34,9 @@ pred <- predict(lm1)
mae <- mean(abs(pred - Boston$medv))
rmse <- sqrt(mean((pred - Boston$medv) ^ 2))
rae <- mean(abs(pred - Boston$medv)) / mean(abs(Boston$medv -
mean(Boston$medv)))
mean(Boston$medv)))
rse <- mean((pred - Boston$medv) ^ 2) / mean((Boston$medv -
mean(Boston$medv)) ^ 2)
mean(Boston$medv)) ^ 2)
print(paste("Mean Absolute Error: ",
as.character(round(mae, digit = 6)), sep = ""))
@@ -47,16 +47,18 @@ print(paste("Relative Absolute Error: ",
print(paste("Relative Squared Error: ",
as.character(round(rse, digit = 6)), sep = ""))
# ----------------------------------------------------------------------------
# publish and consume a web service
# ----------------------------------------------------------------------------
# workspace information
ws <- workspace(
id = ws_id,
auth = auth_token)
id = ws_id,
auth = auth_token)
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
res <- predict(lm1, newdata)
res
}
# test the prediction function
@@ -67,9 +69,6 @@ print(mypredict(newdata))
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# ----------------------------------------------------------------------------
# consume the web service
# ----------------------------------------------------------------------------
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
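# consume web service - 2nd approach (a minimal sketch; the exact lookup used
# in the full script is not shown in this hunk): find the published service by
# name with the AzureML functions services() and endpoints(), then consume the
# returned endpoint
svc <- services(ws, name = "HousePricePrediction")
svc <- tail(svc, 1) # keep the most recent entry if several share the name
ep_price_pred <- endpoints(ws, svc)
consume(ep_price_pred, newdata)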
@@ -87,16 +86,16 @@ consume(ep_price_pred, newdata)
# ----------------------------------------------------------------------------
# define a function for testing purposes
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)

View file

@@ -1,150 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the commonalities and differences among functions
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# fit a model with glm(), this can be run on R, MRO, or MRS
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm(), this can be run on MRS only
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
# simulate cluster data for analysis, on R, MRO, or MRS
# make sure the results can be replicated
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data with 2 clusters
nsamples <- 1000
group_all <- rbind(
simulCluster(nsamples, -1, 2, "a"),
simulCluster(nsamples, 1, 2, "b"))
nclusters <- 2
# plot data
ggplot(group_all, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans() and rxKmeans(): the same
# the code below should be run on MRS due to the use of "rx" commands
# ----------------------------------------------------------------------------
# save a dataset in XDF format
dataXDF = "testData.xdf"
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
algorithm = "lloyd", outFile = dataXDF,
outColName = "cluster", overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
# get cluster means
clustermeans.kmeans <- fit.kmeans$centers
clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")

View file

@@ -8,51 +8,50 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# fit a model with glm(), this can be run on R, MRO, or MRS
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm (R, MRO, MRS) vs rxGlm (MRS)
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm(), this can be run on MRS only
# check the data
head(mtcars)
# fit the same model with rxGlm(), this can be run on MRS only
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
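# a minimal sketch of putting the two sets of estimates side by side; this
# assumes rxGlm stores its estimates in the coefficients component, as other
# RevoScaleR model objects do
print(cbind(glm = coef(logistic1), rxGlm = logistic2$coefficients))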
# simulate cluster data for analysis, on R, MRO, or MRS
# ----------------------------------------------------------------------------
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(0)
set.seed(112)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -73,51 +72,47 @@ group_all <- rbind(
nclusters <- 2
# plot data
ggplot(group_all, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans() and rxKmeans(): the same
# the code below should be run on MRS due to the use of "rx" commands
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
# ----------------------------------------------------------------------------
# the code below should be run on MRS due to the use of "rx" commands
# save a dataset in XDF format
dataXDF = "testData.xdf"
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
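# a quick look at what was written to the XDF file; rxGetInfo() and its
# getVarInfo/numRows arguments come from the RevoScaleR package loaded above
rxGetInfo(dataXDF, getVarInfo = TRUE, numRows = 3)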
# rxKmeans
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
algorithm = "lloyd", outFile = dataXDF,
outColName = "cluster", overwrite = TRUE)
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
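# the integer labels from the two fits could in principle be permuted, so
# agreement shows up as one dominant cell per row in the table above; a
# label-invariant check is the adjusted Rand index (the mclust package below
# is used only for this illustration)
(if (!require("mclust")) install.packages("mclust"))
library("mclust")
adjustedRandIndex(fit.kmeans$cluster, rxKmeans.cluster$cluster)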
@@ -127,20 +122,18 @@ clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")

View file

@@ -1,90 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate that MRS's rxKmeans() function works
# successfully even when kmeans() does not for large datasets
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data and append
# modify the value for nsamples to test out the capacity limit for kmeans()
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
# worked on a computer with 7 GB RAM
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
# save data
mydata = group_all[, 1:2]
write.csv(group_all, "simData.csv", row.names = FALSE)
dataCSV = "simData.csv"
dataXDF = "simData.xdf"
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it doesn't work when data is large enough
# ----------------------------------------------------------------------------
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# ----------------------------------------------------------------------------
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})

View file

@@ -2,37 +2,33 @@
# purpose: to demonstrate that MRS's rxKmeans() function works
# successfully even when kmeans() does not for large datasets
# audience: you are expected to have some prior experience with R
#
# NOTE: On a computer with less than 7GB of RAM available, this
# script may not be able to run to completion.
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
@@ -51,9 +47,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
}
# simulate data and append
# Modify the value for nsamples to test out the capacity limit for kmeans().
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
# worked on a computer with 7 GB RAM
# modify the value for nsamples to test out the capacity limit for kmeans()
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
# but rxKmeans() worked
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
@@ -63,30 +59,32 @@ nclusters <- 2
# save data
mydata = group_all[, 1:2]
write.csv(group_all, "simData.csv", row.names = FALSE)
dataCSV = "simData.csv"
dataXDF = "simData.xdf"
dataCSV = tempfile(fileext = ".csv")
dataXDF = tempfile(fileext = ".xdf")
write.csv(group_all, dataCSV, row.names = FALSE)
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it doesn't work when data is large enough
# ----------------------------------------------------------------------------
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
system_time_R <-
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# ----------------------------------------------------------------------------
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})
system_time_MRS <-
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})
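# a minimal comparison of the two timings captured above (assuming both calls
# ran to completion); the "elapsed" column is wall-clock time in seconds
print(rbind(R = system_time_R, MRS = system_time_MRS))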

View file

@@ -1,165 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# ----------------------------------------------------------------------------
# load libraries
# ----------------------------------------------------------------------------
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix to show that
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# ----------------------------------------------------------------------------
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
for (nsamples in nsamples_list)
{
# simulate data and append
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
mydata = group_all[, 1:2]
nclusters <- 2
# kmeans with R
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# kmeans with MRS
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
# combine
newrow <- data.frame(nsamples = nsamples,
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
myresult <- rbind(myresult, newrow)
}
myresult$nsamples <- 2 * myresult$nsamples
mydata <- myresult
mydata$nsamples_log <- log10(mydata$nsamples)
mydata
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
geom_line(aes(y = time_rre, colour = "rxKmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")

View file

@@ -8,28 +8,27 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load libraries
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
@@ -69,6 +68,7 @@ A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
@@ -77,7 +77,7 @@ system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix operations to show that
# the speed is similar on R, MRO and MRS
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
@@ -110,9 +110,11 @@ system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
@@ -147,9 +149,9 @@ for (nsamples in nsamples_list)
myresult$nsamples <- 2 * myresult$nsamples
mydata <- myresult
mydata$nsamples_log <- log10(mydata$nsamples)
mydata
# generate plot
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
@@ -159,4 +161,4 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")