This commit is contained in:
Brandon Rohrer 2016-03-18 11:49:40 -04:00
Родитель eb9a7467ab 87156cdae8
Коммит 0c39f9ea09
9 изменённых файлов: 36 добавлений и 24 удалений

Просмотреть файл

@ -6,9 +6,9 @@
# ----------------------------------------------------------------------------
# load packages
# ----------------------------------------------------------------------------
# Make sure MASS and gbm are installed, then attach them.
# require() returns FALSE instead of erroring when a package cannot be loaded,
# so a FALSE result triggers install.packages(); library() then attaches it.
# NOTE(review): each install check appears twice, without and with
# quietly = TRUE. This looks like the before/after pair of a diff hunk, so
# presumably only the quietly = TRUE variant is meant to remain -- confirm.
(if (!require("MASS")) install.packages("MASS"))
(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
library("MASS") # to use the Boston dataset
(if (!require("gbm")) install.packages("gbm"))
(if (!require("gbm", quietly = TRUE)) install.packages("gbm"))
library("gbm") # Gradient Boosting Machine package
# ----------------------------------------------------------------------------

Просмотреть файл

@ -9,9 +9,9 @@
# ----------------------------------------------------------------------------
# load packages
# ----------------------------------------------------------------------------
# Make sure glmnet and MASS are installed, then attach them.
# A FALSE result from require() means the package could not be loaded and
# leads to install.packages(); library() does the actual attach afterwards.
# NOTE(review): the checks without and with quietly = TRUE look like the
# old/new lines of a diff hunk rather than intentional repetition -- confirm.
(if (!require("glmnet")) install.packages("glmnet"))
(if (!require("glmnet", quietly = TRUE)) install.packages("glmnet"))
library("glmnet") # use this package to fit a glmnet model
(if (!require("MASS")) install.packages("MASS"))
(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
library("MASS") # to use the Boston dataset
# ----------------------------------------------------------------------------

Просмотреть файл

@ -15,10 +15,13 @@ auth_token <- ""
# ----------------------------------------------------------------------------
# load packages
# ----------------------------------------------------------------------------
# Install-if-missing followed by attach for AzureML, MASS and ggplot2.
# NOTE(review): the AzureML and MASS checks each appear twice (without and
# with quietly = TRUE); presumably the first of each pair is the pre-change
# line of a diff hunk -- confirm before running this as a script.
(if (!require("AzureML")) install.packages("AzureML"))
# install packages if they are not already installed
(if (!require("AzureML", quietly = TRUE)) install.packages("AzureML"))
library("AzureML") # load the package for deploying Azure ML web service
(if (!require("MASS")) install.packages("MASS"))
(if (!require("MASS", quietly = TRUE)) install.packages("MASS"))
library("MASS") # to use the Boston dataset
# ggplot2 has only the quietly = TRUE check and is not wrapped in parentheses
if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# fit a model and check model performance
@ -26,7 +29,7 @@ library("MASS") # to use the Boston dataset
# check the data
# Show the first rows of Boston, then plot a histogram of its medv column.
# NOTE(review): two geom_histogram() layers (binwidth=5 and binwidth=2) are
# chained here, so as written both are added to the plot. This looks like the
# old and new line of a diff hunk; presumably only binwidth=2 is intended --
# confirm.
head(Boston)
ggplot(Boston, aes(x=medv)) +
geom_histogram(binwidth=5) +
geom_histogram(binwidth=2) +
ggtitle("Histogram of Response Variable")
# fit a model using medv as response and others as predictors

Просмотреть файл

@ -25,10 +25,7 @@ if (!RRE)
}
# install a package if it's not already installed
# require() returns FALSE when ggplot2 cannot be loaded; only then is
# install.packages() called.
# NOTE(review): the same check appears in a two-line and a one-line form;
# this looks like the before/after of a diff hunk, so presumably only the
# one-line form is meant to remain -- confirm.
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load libraries
if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------

Просмотреть файл

@ -25,8 +25,7 @@ if (!RRE)
}
# install a package if it's not already installed
# Install ggplot2 only when require() reports it cannot be loaded.
# NOTE(review): the check is present in a two-line and a one-line form;
# presumably these are the old/new lines of a diff hunk -- confirm.
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
@ -63,7 +62,7 @@ ggplot(group_all, aes(x = V1, y = V2)) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
ggtitle("Simulated Data in Two Overlapping Groups")
# assign data
mydata <- group_all[, 1:2]
@ -88,7 +87,7 @@ ggplot(mydata_clusters, aes(x = V1, y = V2)) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
ggtitle("Clusters Found by kmeans()")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
@ -128,7 +127,7 @@ if (RRE){
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
ggtitle("Clusters Found by rxKmeans()")
} else{
print("rxKmeans was not run becauase the RevoScaleR package is not available")

Просмотреть файл

@ -25,8 +25,7 @@ if (RRE)
}
# install a package if it's not already installed
# Install ggplot2 only when require() reports it cannot be loaded.
# NOTE(review): the two-line and one-line forms of the check are equivalent;
# they look like the before/after lines of a diff hunk -- confirm which one
# should remain.
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
if (!require("ggplot2", quietly = TRUE)) install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
@ -63,14 +62,14 @@ group_all <- rbind(group_a, group_b)
# Number of clusters used later in the script.
nclusters <- 2
# plot sample data
# NOTE(review): plot_data is assigned twice, so the second assignment wins.
# The first draws a fixed 1000 row indices from nrow(group_all); the second
# draws min(1000, 2 * nsamples) indices from 1..(2 * nsamples), i.e. the
# sample size is capped. Assumes group_all has 2 * nsamples rows -- TODO
# confirm. The pair looks like the old/new lines of a diff hunk.
plot_data <- group_all[sample(nrow(group_all), 1000),]
plot_data <- group_all[sample(2 * nsamples, min(1000, 2 * nsamples)),]
# Scatter plot of V1 vs V2 coloured by group, with two size-5 marker points
# at (-1, -1) and (1, 1), both axes limited to [-5, 5] and reference lines
# through 0.
# NOTE(review): the second ggtitle() call below is not joined with `+`, so as
# written it is a separate expression and not part of the plot; presumably it
# is the replacement title from a diff hunk -- confirm.
ggplot(plot_data, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
ggtitle("Simulated Data in Two Overlapping Groups")
# save data
# keep only the first two columns of group_all
mydata = group_all[, 1:2]

Просмотреть файл

@ -51,3 +51,5 @@ k <- round(m / 2)
# Append a column `fac` holding m labels drawn with replacement from the
# first g upper-case letters.
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
# Draw k distinct indices from 1..m to use as the training subset.
train <- sample(1:m, k)
# Time an lda() fit of fac on all other columns, restricted to the training
# rows, with an equal prior of 1 / g for each of the g classes.
# NOTE(review): lda is presumably MASS::lda -- the library() call is not
# visible in this hunk.
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
message("Save the time and run the code on R, MRO and MRS to compare speed.")

Просмотреть файл

@ -38,7 +38,12 @@ group_all <- rbind(group_a, group_b)
# Cluster the first two columns of group_all into nclusters groups and time
# the fit.
nclusters <- 2
mydata = group_all[, 1:2]
message("It might take a while for this to finish if nsamples is large.")
# K-Means Cluster Analysis
# system.time() wraps the kmeans() call (Lloyd algorithm, at most 1000
# iterations); the fitted model is stored in `fit` as a side effect of the
# timed expression.
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# top-level expression: displays the recorded timing
system_time_r
message("Save the time and run the code on R, MRO and MRS to compare speed.")

Просмотреть файл

@ -50,6 +50,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
z
}
# Warn the user up front; message() joins its two string arguments into one
# line. The loop over nsamples_list follows this statement.
message("It might take a while for this to finish if any of the elements in ",
"nsamples_list is large.")
for (nsamples in nsamples_list)
{
# simulate data and append
@ -68,7 +71,8 @@ for (nsamples in nsamples_list)
# kmeans with MRS
if (RRE){
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2,
data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
}
@ -95,10 +99,12 @@ if (RRE){
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
geom_line(aes(y = time_rre, colour = "rxKmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
scale_colour_manual("Function",
values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
ggtitle(paste("If data fits in memory,",
"kmeans() and rxKmeans() are equally performant"))
} else {
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
@ -107,5 +113,6 @@ if (RRE){
scale_colour_manual("Function", values = c(kmeans = "red")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("Time for kmeans. To add time for rxKmean, use the RRE engine")
ggtitle(paste("Time for kmeans \n",
"To add time for rxKmeans, use the R Server engine"))
}