Merge branch 'master' of github.com:brohrer-ms/RTVS-docs

Brandon Rohrer 2016-03-15 10:53:11 -04:00
Parent 241d6f422a 05b5a4b4ba
Commit ac4919044e
9 changed files with 156 additions and 603 deletions

View file

@@ -7,64 +7,32 @@
# load packages
# ----------------------------------------------------------------------------
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # use the mvrnorm function
(if (!require("caret")) install.packages("caret"))
library("caret") # use the train function for selecting hyper parameters
library("MASS") # to use the Boston dataset
(if (!require("gbm")) install.packages("gbm"))
library("gbm") # Gradient Boosting Machine package
# ----------------------------------------------------------------------------
# select hyperparameters
# fit the model and draw some plots
# ----------------------------------------------------------------------------
# ensure results are repeatable
set.seed(123)
# prepare training scheme
control <- trainControl(method = "cv", number = 5)
# design the parameter tuning grid
grid <- expand.grid(n.trees = c(5000, 10000, 15000),
interaction.depth = c(2, 4, 8),
n.minobsinnode = c(1, 2, 4),
shrinkage = c(0.001, 0.01, 0.1))
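# a quick check on the size of the tuning grid defined above:
# 3 x 3 x 3 x 3 = 81 candidate combinations, each refit in every fold of the
# 5-fold cross-validation, so the full grid is expensive to run
print(nrow(grid))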
# design the parameter tuning grid - a smaller grid for testing purposes
# grid <- expand.grid(n.trees = c(5000, 10000),
# interaction.depth = c(2, 4),
# n.minobsinnode = c(1, 2),
# shrinkage = c(0.001, 0.01))
# tune the parameters
gbm1 <- train(medv ~ ., data = Boston, method = "gbm",
distribution = "gaussian", trControl = control,
verbose = FALSE, tuneGrid = grid, metric = "RMSE")
# summarize the model
print(gbm1)
# plot cross-validation results
plot(gbm1)
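# a minimal sketch of reading the selected combination off the fitted caret
# object; bestTune is the slot where train() records the winning grid row
print(gbm1$bestTune)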
# ----------------------------------------------------------------------------
# fit the model with the estimated hyperparameters and draw some plots
# ----------------------------------------------------------------------------
gbm2 <- gbm(medv ~ .,
distribution = "gaussian",
n.trees = 5000,
interaction.depth = 4,
n.minobsinnode = 1,
shrinkage = 0.01,
cv.folds = 5,
data = Boston)
# the four hyperparameters used here - n.trees, interaction.depth,
# n.minobsinnode, and shrinkage - were selected by cross-validation
fit_gbm <- gbm(medv ~ .,
distribution = "gaussian",
n.trees = 5000,
interaction.depth = 4,
n.minobsinnode = 1,
shrinkage = 0.01,
cv.folds = 5,
data = Boston)
# print the model
print(gbm2)
print(fit_gbm)
# check performance using 5-fold cross-validation
best.iter <- gbm.perf(gbm2, method = "cv")
best.iter <- gbm.perf(fit_gbm, method = "cv")
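# a minimal sketch of scoring the refit model at the CV-selected tree count
# (pred_gbm and rmse_gbm are illustrative names); predict.gbm takes the number
# of trees through n.trees
pred_gbm <- predict(fit_gbm, newdata = Boston, n.trees = best.iter)
rmse_gbm <- sqrt(mean((pred_gbm - Boston$medv) ^ 2))
print(rmse_gbm)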
# check variable importance
f_imp <- summary(gbm2, n.trees = best.iter, plot = FALSE)
f_imp <- summary(fit_gbm, n.trees = best.iter, plot = FALSE)
# use a custom plot to show variable importance
barplot(f_imp$rel.inf, names.arg = f_imp$var, xlab = "Feature",

View file

@@ -12,7 +12,7 @@
(if (!require("glmnet")) install.packages("glmnet"))
library("glmnet") # use this package to fit a glmnet model
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # use the Boston dataset
library("MASS") # to use the Boston dataset
# ----------------------------------------------------------------------------
# identify the optimal value of lambda
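# a minimal sketch of this step (train_X, train_y, and model1 are defined in
# the surrounding script; the call assumes the lasso/gaussian settings used
# elsewhere in this example): cv.glmnet() cross-validates a path of lambda
# values and stores the minimizer in lambda.min
model1 <- cv.glmnet(x = train_X, y = train_y, alpha = 1, family = "gaussian")
plot(model1) # cross-validation curve over log(lambda)
print(model1$lambda.min)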
@@ -45,8 +45,8 @@ print(coef(model1, s = "lambda.min"))
model2 <- glmnet(x = train_X, y = train_y, alpha = 1, family = "gaussian")
# identify variable names
vid <- as.character(seq(1,13))
vn = colnames(train_X)
vid <- as.character(seq(1,length(vn)))
# check and exclude the variables with coefficient value 0
vnat = coef(model2)
@@ -65,15 +65,13 @@ mycl <- seq(1,nvars)
plot(model2, xvar = "lambda", label = TRUE, col = mycl, xlim = c(-5.5, 2))
legend(-0.5,-2, legend_desc, lty = mylty, col = mycl, cex = 0.8)
# check coefficients from using glmnet() to compare with
# those from cv.glmnet(): the same
coef(model2, s = model1$lambda.min)
# ----------------------------------------------------------------------------
# make predictions
# ----------------------------------------------------------------------------
# make predictions with model 1
# data
x_new <- data.matrix(train_X[1:2, - response_column])
# make predictions with model 1
predictions_train <- predict(model1, newx = x_new, s = "lambda.min")
print(predictions_train)

View file

@@ -34,9 +34,9 @@ pred <- predict(lm1)
mae <- mean(abs(pred - Boston$medv))
rmse <- sqrt(mean((pred - Boston$medv) ^ 2))
rae <- mean(abs(pred - Boston$medv)) / mean(abs(Boston$medv -
mean(Boston$medv)))
mean(Boston$medv)))
rse <- mean((pred - Boston$medv) ^ 2) / mean((Boston$medv -
mean(Boston$medv)) ^ 2)
mean(Boston$medv)) ^ 2)
print(paste("Mean Absolute Error: ",
as.character(round(mae, digit = 6)), sep = ""))
@@ -47,16 +47,18 @@ print(paste("Relative Absolute Error: ",
print(paste("Relative Squared Error: ",
as.character(round(rse, digit = 6)), sep = ""))
# ----------------------------------------------------------------------------
# publish and consume a web service
# ----------------------------------------------------------------------------
# workspace information
ws <- workspace(
id = ws_id,
auth = auth_token)
id = ws_id,
auth = auth_token)
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
res <- predict(lm1, newdata)
res
}
# test the prediction function
@@ -67,9 +69,6 @@ print(mypredict(newdata))
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# ----------------------------------------------------------------------------
# consume the web service
# ----------------------------------------------------------------------------
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
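# consume web service - 2nd approach (a minimal sketch; the exact lookup used
# in the full script is not shown in this hunk): find the published service by
# name with the AzureML functions services() and endpoints(), then consume the
# returned endpoint
svc <- services(ws, name = "HousePricePrediction")
svc <- tail(svc, 1) # keep the most recent entry if several share the name
ep_price_pred <- endpoints(ws, svc)
consume(ep_price_pred, newdata)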
@@ -87,16 +86,16 @@ consume(ep_price_pred, newdata)
# ----------------------------------------------------------------------------
# define a function for testing purposes
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)

View file

@@ -1,150 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the commonalities and differences among functions
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# fit a model with glm(), this can be run on R, MRO, or MRS
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm(), this can be run on MRS only
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
# simulate cluster data for analysis, on R, MRO, or MRS
# make sure the results can be replicated
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data with 2 clusters
nsamples <- 1000
group_all <- rbind(
simulCluster(nsamples, -1, 2, "a"),
simulCluster(nsamples, 1, 2, "b"))
nclusters <- 2
# plot data
ggplot(group_all, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans() and rxKmeans(): the same
# the code below should be run on MRS due to the use of "rx" commands
# ----------------------------------------------------------------------------
# save a dataset in XDF format
dataXDF = "testData.xdf"
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
algorithm = "lloyd", outFile = dataXDF,
outColName = "cluster", overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
# get cluster means
clustermeans.kmeans <- fit.kmeans$centers
clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")

View file

@@ -8,51 +8,50 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# fit a model with glm(), this can be run on R, MRO, or MRS
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm (R, MRO, MRS) vs rxGlm (MRS)
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm(), this can be run on MRS only
# check the data
head(mtcars)
# fit the same model with rxGlm(), this can be run on MRS only
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
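# a minimal sketch of putting the two sets of estimates side by side; this
# assumes rxGlm stores its estimates in the coefficients component, as other
# RevoScaleR model objects do
print(cbind(glm = coef(logistic1), rxGlm = logistic2$coefficients))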
# simulate cluster data for analysis, on R, MRO, or MRS
# ----------------------------------------------------------------------------
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(0)
set.seed(112)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -73,51 +72,47 @@ group_all <- rbind(
nclusters <- 2
# plot data
ggplot(group_all, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans() and rxKmeans(): the same
# the code below should be run on MRS due to the use of "rx" commands
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
# ----------------------------------------------------------------------------
# the code below should be run on MRS due to the use of "rx" commands
# save a dataset in XDF format
dataXDF = "testData.xdf"
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
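# a quick look at what was written to the XDF file; rxGetInfo() and its
# getVarInfo/numRows arguments come from the RevoScaleR package loaded above
rxGetInfo(dataXDF, getVarInfo = TRUE, numRows = 3)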
# rxKmeans
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
algorithm = "lloyd", outFile = dataXDF,
outColName = "cluster", overwrite = TRUE)
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
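# the integer labels from the two fits could in principle be permuted, so
# agreement shows up as one dominant cell per row in the table above; a
# label-invariant check is the adjusted Rand index (the mclust package below
# is used only for this illustration)
(if (!require("mclust")) install.packages("mclust"))
library("mclust")
adjustedRandIndex(fit.kmeans$cluster, rxKmeans.cluster$cluster)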
@@ -127,20 +122,18 @@ clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
geom_point(aes(colour = kmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")

View file

@@ -1,90 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate that MRS's rxKmeans() function works
# successfully even when kmeans() does not for large datasets
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data and append
# modify the value for nsamples to test out the capacity limit for kmeans()
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
# worked on a computer with 7 GB RAM
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
# save data
mydata = group_all[, 1:2]
write.csv(group_all, "simData.csv", row.names = FALSE)
dataCSV = "simData.csv"
dataXDF = "simData.xdf"
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it doesn't work when data is large enough
# ----------------------------------------------------------------------------
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# ----------------------------------------------------------------------------
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})

View file

@@ -2,37 +2,33 @@
# purpose: to demonstrate that MRS's rxKmeans() function works
# successfully even when kmeans() does not for large datasets
# audience: you are expected to have some prior experience with R
#
# NOTE: On a computer with less than 7GB of RAM available, this
# script may not be able to run to completion.
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
@@ -51,9 +47,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
}
# simulate data and append
# Modify the value for nsamples to test out the capacity limit for kmeans().
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
# worked on a computer with 7 GB RAM
# modify the value for nsamples to test out the capacity limit for kmeans()
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
# but rxKmeans() worked
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
@@ -63,30 +59,32 @@ nclusters <- 2
# save data
mydata = group_all[, 1:2]
write.csv(group_all, "simData.csv", row.names = FALSE)
dataCSV = "simData.csv"
dataXDF = "simData.xdf"
dataCSV = tempfile(fileext = ".csv")
dataXDF = tempfile(fileext = ".xdf")
write.csv(group_all, dataCSV, row.names = FALSE)
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it doesn't work when data is large enough
# ----------------------------------------------------------------------------
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
system_time_R <-
system.time(
{
fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd")
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# ----------------------------------------------------------------------------
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})
system_time_MRS <-
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
numClusters = nclusters,
algorithm = "lloyd",
outFile = dataXDF,
outColName = "cluster",
overwrite = TRUE)
})
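# a minimal comparison of the two timings captured above (assuming both calls
# ran to completion); the "elapsed" column is wall-clock time in seconds
print(rbind(R = system_time_R, MRS = system_time_MRS))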

View file

@@ -1,165 +0,0 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# ----------------------------------------------------------------------------
# load libraries
# ----------------------------------------------------------------------------
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix to show that
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# ----------------------------------------------------------------------------
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
for (nsamples in nsamples_list)
{
# simulate data and append
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
mydata = group_all[, 1:2]
nclusters <- 2
# kmeans with R
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# kmeans with MRS
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
# combine
newrow <- data.frame(nsamples = nsamples,
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
myresult <- rbind(myresult, newrow)
}
myresult$nsamples <- 2 * myresult$nsamples
mydata <- myresult
mydata$nsamples_log <- log10(mydata$nsamples)
mydata
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
geom_line(aes(y = time_rre, colour = "rxKmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")

View file

@@ -8,28 +8,27 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
# ----------------------------------------------------------------------------
if (require("RevoScaleR")) {
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
message("RevoScaleR package is succesfully loaded.")
} else {
message("Can't find RevoScaleR package...")
message("If you have Microsoft R Server installed,")
message("please switch the R engine")
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
message("If Microsoft R Server is not installed,")
message("please download it from here:")
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
if (!require("RevoScaleR"))
{
stop(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# ----------------------------------------------------------------------------
# install a library if it's not already installed
# ----------------------------------------------------------------------------
(if (!require("ggplot2")) install.packages("ggplot2"))
library("ggplot2")
(if (!require("MASS")) install.packages("MASS"))
library("MASS") # used for plotting
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load libraries
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
@@ -69,6 +68,7 @@ A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
@@ -77,7 +77,7 @@ system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix operations to show that
# the speed is similar on R, MRO and MRS
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
@@ -110,9 +110,11 @@ system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
@@ -147,9 +149,9 @@ for (nsamples in nsamples_list)
myresult$nsamples <- 2 * myresult$nsamples
mydata <- myresult
mydata$nsamples_log <- log10(mydata$nsamples)
mydata
# generate plot
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
@@ -159,4 +161,4 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")