merged

2016-03-18 11:49:40 -04:00 · 2016-03-18 11:49:40 -04:00 · 0c39f9ea09
--- a/examples/MRS_and_Machine_Learning/Machine
+++ b/examples/MRS_and_Machine_Learning/Machine
@@ -6,9 +6,9 @@
 # ----------------------------------------------------------------------------
 # load packages
 # ----------------------------------------------------------------------------
-(if (!require("MASS")) install.packages("MASS"))
+(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
 library("MASS") # to use the Boston dataset
-(if (!require("gbm")) install.packages("gbm"))
+(if (!require("gbm", quietly = TRUE)) install.packages("gbm"))
 library("gbm") # Gradient Boosting Machine package

 # ----------------------------------------------------------------------------
--- a/examples/MRS_and_Machine_Learning/Machine
+++ b/examples/MRS_and_Machine_Learning/Machine
@@ -9,9 +9,9 @@
 # ----------------------------------------------------------------------------
 # load packages
 # ----------------------------------------------------------------------------
-(if (!require("glmnet")) install.packages("glmnet"))
+(if (!require("glmnet", quietly = TRUE)) install.packages("glmnet"))
 library("glmnet") # use this package to fit a glmnet model
-(if (!require("MASS")) install.packages("MASS"))
+(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
 library("MASS") # to use the Boston dataset

 # ----------------------------------------------------------------------------
--- a/examples/MRS_and_Machine_Learning/Machine
+++ b/examples/MRS_and_Machine_Learning/Machine
@@ -15,10 +15,13 @@ auth_token <- ""
 # ----------------------------------------------------------------------------
 # load packages
 # ----------------------------------------------------------------------------
-(if (!require("AzureML")) install.packages("AzureML"))
+# install packages if they are not already installed
+(if (!require("AzureML", quietly = TRUE)) install.packages("AzureML"))
 library("AzureML") # load the package for deploying Azure ML web service
-(if (!require("MASS")) install.packages("MASS"))
+(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
 library("MASS") # to use the Boston dataset
+if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
+library("ggplot2") # used for plotting

 # ----------------------------------------------------------------------------
 # fit a model and check model performance
@@ -26,7 +29,7 @@ library("MASS") # to use the Boston dataset
 # check the data
 head(Boston)
 ggplot(Boston, aes(x=medv)) + 
-  geom_histogram(binwidth=5) +
+  geom_histogram(binwidth=2) +
  ggtitle("Histogram of Response Variable")

 # fit a model using medv as response and others as predictors 
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_1a_Functions_glm_rxGlm.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_1a_Functions_glm_rxGlm.R
@@ -25,10 +25,7 @@ if (!RRE)
 }

 # install a package if it's not already installed
-if (!require("ggplot2", quietly = TRUE))
-  install.packages("ggplot2")
-
-# load libraries
+if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
 library("ggplot2") # used for plotting

 # ----------------------------------------------------------------------------
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_1b_Functions_kmeans_rxKmeans.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_1b_Functions_kmeans_rxKmeans.R
@@ -25,8 +25,7 @@ if (!RRE)
 }

 # install a package if it's not already installed
-if (!require("ggplot2", quietly = TRUE))
-  install.packages("ggplot2")
+if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")

 # load packages
 library("MASS") # to use the mvrnorm function
@@ -63,7 +62,7 @@ ggplot(group_all, aes(x = V1, y = V2)) +
  xlim(-5, 5) + ylim(-5, 5) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
-  ggtitle("Simulated data in two overlapping groups")
+  ggtitle("Simulated Data in Two Overlapping Groups")

 # assign data 
 mydata <- group_all[, 1:2]
@@ -88,7 +87,7 @@ ggplot(mydata_clusters, aes(x = V1, y = V2)) +
  xlim(-5, 5) + ylim(-5, 5) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
-  ggtitle("Clusters found by kmeans()")
+  ggtitle("Clusters Found by kmeans()")

 # ----------------------------------------------------------------------------
 # cluster analysis with rxKmeans(), it works on MRS only
@@ -128,7 +127,7 @@ if (RRE){
    xlim(-5, 5) + ylim(-5, 5) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 0) +
-    ggtitle("Clusters found by rxKmeans()")
+    ggtitle("Clusters Found by rxKmeans()")
  
 } else{
  print("rxKmeans was not run becauase the RevoScaleR package is not available")
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_2_Capacity.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_2_Capacity.R
@@ -25,8 +25,7 @@ if (RRE)
 }

 # install a package if it's not already installed
-if (!require("ggplot2", quietly = TRUE))
-  install.packages("ggplot2")
+if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")

 # load packages
 library("MASS") # to use the mvrnorm function
@@ -63,14 +62,14 @@ group_all <- rbind(group_a, group_b)
 nclusters <- 2

 # plot sample data
-plot_data <- group_all[sample(nrow(group_all), 1000),] 
+plot_data <- group_all[sample(2 * nsamples, min(1000, 2 * nsamples)),] 
 ggplot(plot_data, aes(x = V1, y = V2)) +
  geom_point(aes(colour = group)) +
  geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
  xlim(-5, 5) + ylim(-5, 5) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
-  ggtitle("Simulated data in two overlapping groups")
+  ggtitle("Simulated Data in Two Overlapping Groups")

 # save data
 mydata = group_all[, 1:2]
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3a_Speed_for_Matrix_Calculations.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3a_Speed_for_Matrix_Calculations.R
@@ -51,3 +51,5 @@ k <- round(m / 2)
 A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
 train <- sample(1:m, k)
 system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
+
+message("Save the time and run the code on R, MRO and MRS to compare speed.")
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3b_Speed_for_kmeans.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3b_Speed_for_kmeans.R
@@ -38,7 +38,12 @@ group_all <- rbind(group_a, group_b)
 nclusters <- 2

 mydata = group_all[, 1:2]
+
+message("It might take a while for this to finish if nsamples is large.")
 # K-Means Cluster Analysis
 system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
                                           iter.max = 1000,
                                           algorithm = "Lloyd"))
+system_time_r
+
+message("Save the time and run the code on R, MRO and MRS to compare speed.")
--- a/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3c_Speed_for_kmeans_rxKmeans.R
+++ b/examples/MRS_and_Machine_Learning/R_MRO_MRS_Comparison/Part_3c_Speed_for_kmeans_rxKmeans.R
@@ -50,6 +50,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
  z
 }

+message("It might take a while for this to finish if any of the elements in ", 
+  "nsamples_list is large.")
+
 for (nsamples in nsamples_list)
 {
  # simulate data and append
@@ -68,7 +71,8 @@ for (nsamples in nsamples_list)
  # kmeans with MRS
  
  if (RRE){
-    system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
+    system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, 
+                                                      data = mydata,
                                                      numClusters = nclusters,
                                                      algorithm = "lloyd"))
  }
@@ -95,10 +99,12 @@ if (RRE){
    geom_point(aes(y = time_rre, colour = "rxKmeans")) +
    geom_line(aes(y = time_rre, colour = "rxKmeans")) +
    scale_x_continuous(breaks = seq(2, 8, by = 1)) +
-    scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
+    scale_colour_manual("Function", 
+                        values = c(kmeans = "red", rxKmeans = "blue")) +
    xlab("log10(number of samples)") +
    ylab("time in seconds") +
-    ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
+    ggtitle(paste("If data fits in memory,", 
+                  "kmeans() and rxKmeans() are equally performant"))
 } else {
  ggplot(data = mydata, aes(x = nsamples_log)) +
    geom_point(aes(y = time_r, colour = "kmeans")) +
@@ -107,5 +113,6 @@ if (RRE){
    scale_colour_manual("Function", values = c(kmeans = "red")) +
    xlab("log10(number of samples)") +
    ylab("time in seconds") +
-    ggtitle("Time for kmeans. To add time for rxKmean, use the RRE engine")
+    ggtitle(paste("Time for kmeans \n", 
+                  "To add time for rxKmeans, use the R Server engine"))
 }