зеркало из https://github.com/microsoft/RTVS-docs.git
Merge branch 'master' of github.com:brohrer-ms/RTVS-docs
This commit is contained in:
Коммит
ac4919044e
|
@ -7,64 +7,32 @@
|
|||
# load packages
|
||||
# ----------------------------------------------------------------------------
|
||||
(if (!require("MASS")) install.packages("MASS"))
|
||||
library("MASS") # use the mvrnorm function
|
||||
(if (!require("caret")) install.packages("caret"))
|
||||
library("caret") # use the train function for selecting hyper parameters
|
||||
library("MASS") # to use the Boston dataset
|
||||
(if (!require("gbm")) install.packages("gbm"))
|
||||
library("gbm") # Gradient Boosting Machine package
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# select hyper parameters
|
||||
# fit the model and draw some plots
|
||||
# ----------------------------------------------------------------------------
|
||||
# ensure results are repeatable
|
||||
set.seed(123)
|
||||
|
||||
# prepare training scheme
|
||||
control <- trainControl(method = "cv", number = 5)
|
||||
|
||||
# design the parameter tuning grid
|
||||
grid <- expand.grid(n.trees = c(5000, 10000, 15000),
|
||||
interaction.depth = c(2, 4, 8),
|
||||
n.minobsinnode = c(1, 2, 4),
|
||||
shrinkage = c(0.001, 0.01, 0.1))
|
||||
|
||||
# design the parameter tuning grid - smaller grid for testing purpose
|
||||
# grid <- expand.grid(n.trees = c(5000, 10000),
|
||||
# interaction.depth = c(2, 4),
|
||||
# n.minobsinnode = c(1, 2),
|
||||
# shrinkage = c(0.001, 0.01))
|
||||
|
||||
# tune the parameters
|
||||
gbm1 <- train(medv ~ ., data = Boston, method = "gbm",
|
||||
distribution = "gaussian", trControl = control,
|
||||
verbose = FALSE, tuneGrid = grid, metric = "RMSE")
|
||||
|
||||
# summarize the model
|
||||
print(gbm1)
|
||||
|
||||
# plot cross-validation results
|
||||
plot(gbm1)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# fit the model with estimated hyper parameters and draw some plots
|
||||
# ----------------------------------------------------------------------------
|
||||
gbm2 <- gbm(medv ~ .,
|
||||
distribution = "gaussian",
|
||||
n.trees = 5000,
|
||||
interaction.depth = 4,
|
||||
n.minobsinnode = 1,
|
||||
shrinkage = 0.01,
|
||||
cv.folds = 5,
|
||||
data = Boston)
|
||||
# the four hyper-parameters used here - n.trees, interaction.depth,
|
||||
# n.minobsinnode, and shrinkage - were selected from cross-validation
|
||||
fit_gbm <- gbm(medv ~ .,
|
||||
distribution = "gaussian",
|
||||
n.trees = 5000,
|
||||
interaction.depth = 4,
|
||||
n.minobsinnode = 1,
|
||||
shrinkage = 0.01,
|
||||
cv.folds = 5,
|
||||
data = Boston)
|
||||
|
||||
# print the model
|
||||
print(gbm2)
|
||||
print(fit_gbm)
|
||||
|
||||
# check performance using 5-fold cross-validation
|
||||
best.iter <- gbm.perf(gbm2, method = "cv")
|
||||
best.iter <- gbm.perf(fit_gbm, method = "cv")
|
||||
|
||||
# check variable importance
|
||||
f_imp <- summary(gbm2, n.trees = best.iter, plot = FALSE)
|
||||
f_imp <- summary(fit_gbm, n.trees = best.iter, plot = FALSE)
|
||||
|
||||
# use a custom plot to show variable importance
|
||||
barplot(f_imp$rel.inf, names.arg = f_imp$var, xlab = "Feature",
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
(if (!require("glmnet")) install.packages("glmnet"))
|
||||
library("glmnet") # use this package to fit a glmnet model
|
||||
(if (!require("MASS")) install.packages("MASS"))
|
||||
library("MASS") # use the Boston dataset
|
||||
library("MASS") # to use the Boston dataset
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# identify the optimal value of lambda
|
||||
|
@ -45,8 +45,8 @@ print(coef(model1, s = "lambda.min"))
|
|||
model2 <- glmnet(x = train_X, y = train_y, alpha = 1, family = "gaussian")
|
||||
|
||||
# identify variable names
|
||||
vid <- as.character(seq(1,13))
|
||||
vn = colnames(train_X)
|
||||
vid <- as.character(seq(1,length(vn)))
|
||||
|
||||
# check and exclude the variables with coefficient value 0
|
||||
vnat = coef(model2)
|
||||
|
@ -65,15 +65,13 @@ mycl <- seq(1,nvars)
|
|||
plot(model2, xvar = "lambda", label = TRUE, col = mycl, xlim = c(-5.5, 2))
|
||||
legend(-0.5,-2, legend_desc, lty = mylty, col = mycl, cex = 0.8)
|
||||
|
||||
# check coefficients from using glmnet() to compare with
|
||||
# those from cv.glmnet(): the same
|
||||
coef(model2, s = model1$lambda.min)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# make predictions
|
||||
# ----------------------------------------------------------------------------
|
||||
# make predictions with model 1
|
||||
# data
|
||||
x_new <- data.matrix(train_X[1:2, - response_column])
|
||||
|
||||
# make predictions with model 1
|
||||
predictions_train <- predict(model1, newx = x_new, s = "lambda.min")
|
||||
print(predictions_train)
|
||||
|
||||
|
|
|
@ -34,9 +34,9 @@ pred <- predict(lm1)
|
|||
mae <- mean(abs(pred - Boston$medv))
|
||||
rmse <- sqrt(mean((pred - Boston$medv) ^ 2))
|
||||
rae <- mean(abs(pred - Boston$medv)) / mean(abs(Boston$medv -
|
||||
mean(Boston$medv)))
|
||||
mean(Boston$medv)))
|
||||
rse <- mean((pred - Boston$medv) ^ 2) / mean((Boston$medv -
|
||||
mean(Boston$medv)) ^ 2)
|
||||
mean(Boston$medv)) ^ 2)
|
||||
|
||||
print(paste("Mean Absolute Error: ",
|
||||
as.character(round(mae, digit = 6)), sep = ""))
|
||||
|
@ -47,16 +47,18 @@ print(paste("Relative Absolute Error: ",
|
|||
print(paste("Relative Squared Error: ",
|
||||
as.character(round(rse, digit = 6)), sep = ""))
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# publish and consume a web service
|
||||
# ----------------------------------------------------------------------------
|
||||
# workspace information
|
||||
ws <- workspace(
|
||||
id = ws_id,
|
||||
auth = auth_token)
|
||||
id = ws_id,
|
||||
auth = auth_token)
|
||||
|
||||
# define predict function
|
||||
mypredict <- function(newdata) {
|
||||
res <- predict(lm1, newdata)
|
||||
res
|
||||
res <- predict(lm1, newdata)
|
||||
res
|
||||
}
|
||||
|
||||
# test the prediction function
|
||||
|
@ -67,9 +69,6 @@ print(mypredict(newdata))
|
|||
ep <- publishWebService(ws = ws, fun = mypredict,
|
||||
name = "HousePricePrediction", inputSchema = newdata)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# consume the web service
|
||||
# ----------------------------------------------------------------------------
|
||||
# consume web service - 1st approach
|
||||
pred <- consume(ep, newdata)
|
||||
pred
|
||||
|
@ -87,16 +86,16 @@ consume(ep_price_pred, newdata)
|
|||
# ----------------------------------------------------------------------------
|
||||
# define function for testing purpose
|
||||
mypredictnew <- function(newdata) {
|
||||
res <- predict(lm1, newdata) + 100
|
||||
res
|
||||
res <- predict(lm1, newdata) + 100
|
||||
res
|
||||
}
|
||||
|
||||
# update service with the new function
|
||||
ep_update <- updateWebService(
|
||||
ws = ws,
|
||||
fun = mypredictnew,
|
||||
inputSchema = newdata,
|
||||
serviceId = ep$WebServiceId)
|
||||
ws = ws,
|
||||
fun = mypredictnew,
|
||||
inputSchema = newdata,
|
||||
serviceId = ep$WebServiceId)
|
||||
|
||||
# consume the updated web service
|
||||
consume(ep_price_pred, newdata)
|
||||
|
|
|
@ -1,150 +0,0 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# purpose: to demonstrate the commonalities and differences among functions
|
||||
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
|
||||
# audience: you are expected to have some prior experience with R
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# install a package if it's not already installed
|
||||
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
|
||||
# load packages
|
||||
|
||||
library("MASS") # to use the mvrnorm function
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
|
||||
# fit a model with glm(), this can be run on R, MRO, or MRS
|
||||
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic1)
|
||||
|
||||
|
||||
# fit the same model with rxGlm(), this can be run on MRS only
|
||||
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic2)
|
||||
|
||||
|
||||
# simulate cluster data for analysis, on R, MRO, or MRS
|
||||
|
||||
# make sure the results can be replicated
|
||||
set.seed(0)
|
||||
|
||||
# function to simulate data
|
||||
simulCluster <- function(nsamples, mean, dimension, group)
|
||||
{
|
||||
Sigma <- diag(1, dimension, dimension)
|
||||
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
|
||||
z <- as.data.frame(x)
|
||||
z$group = group
|
||||
z
|
||||
}
|
||||
|
||||
# simulate data with 2 clusters
|
||||
nsamples <- 1000
|
||||
group_all <- rbind(
|
||||
simulCluster(nsamples, -1, 2, "a"),
|
||||
simulCluster(nsamples, 1, 2, "b"))
|
||||
|
||||
nclusters <- 2
|
||||
|
||||
# plot data
|
||||
|
||||
ggplot(group_all, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = group)) +
|
||||
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Simulated data in two overlapping groups")
|
||||
|
||||
|
||||
# assign data
|
||||
mydata <- group_all[, 1:2]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(), it works on R, MRO, or MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works on MRS only
|
||||
# ----------------------------------------------------------------------------
|
||||
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
|
||||
numClusters = nclusters, algorithm = "lloyd")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# compare the cluster assignments between kmeans() and rxKmeans(): the same
|
||||
# the code below should be run on MRS due to the use of "rx" commands
|
||||
# ----------------------------------------------------------------------------
|
||||
# save a dataset in XDF format
|
||||
dataXDF = "testData.xdf"
|
||||
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
|
||||
# rxKmeans
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
|
||||
algorithm = "lloyd", outFile = dataXDF,
|
||||
outColName = "cluster", overwrite = TRUE)
|
||||
|
||||
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
|
||||
|
||||
# append cluster assignment from kmeans
|
||||
mydata_clusters <- cbind(
|
||||
group_all,
|
||||
kmeans.cluster = factor(fit.kmeans$cluster),
|
||||
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
|
||||
|
||||
|
||||
# compare the cluster assignments between kmeans and rxKmeans
|
||||
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
|
||||
|
||||
# get cluster means
|
||||
clustermeans.kmeans <- fit.kmeans$centers
|
||||
clustermeans.rxKmeans <- fit.kmeans$centers
|
||||
|
||||
# plot clusters from kmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = kmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by kmeans()")
|
||||
|
||||
|
||||
# plot clusters from rxKmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = rxKmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by rxKmeans()")
|
||||
|
||||
|
|
@ -8,51 +8,50 @@
|
|||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server (RRE 8.0) is installed
|
||||
# check if Microsoft R Server is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
if (require("RevoScaleR")) {
|
||||
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
|
||||
message("RevoScaleR package is succesfully loaded.")
|
||||
} else {
|
||||
message("Can't find RevoScaleR package...")
|
||||
message("If you have Microsoft R Server installed,")
|
||||
message("please switch the R engine")
|
||||
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
|
||||
message("If Microsoft R Server is not installed,")
|
||||
message("please download it from here:")
|
||||
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# install a library if it's not already installed
|
||||
# ----------------------------------------------------------------------------
|
||||
(if (!require("ggplot2")) install.packages("ggplot2"))
|
||||
library("ggplot2")
|
||||
(if (!require("MASS")) install.packages("MASS"))
|
||||
library("MASS") # used for plotting
|
||||
# install a package if it's not already installed
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# fit a model with glm(), this can be run on R, MRO, or MRS
|
||||
# load packages
|
||||
library("MASS") # to use the mvrnorm function
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# glm (R, MRO, MRS) vs rxGlm (MRS)
|
||||
# ----------------------------------------------------------------------------
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# fit a model with glm(), this can be run on R, MRO, or MRS
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic1)
|
||||
|
||||
|
||||
# fit the same model with rxGlm(), this can be run on MRS only
|
||||
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# fit the same model with rxGlm(), this can be run on MRS only
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic2)
|
||||
|
||||
|
||||
# simulate cluster data for analysis, on R, MRO, or MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
|
||||
# ----------------------------------------------------------------------------
|
||||
# make sure the results can be replicated
|
||||
set.seed(0)
|
||||
set.seed(112)
|
||||
|
||||
# function to simulate data
|
||||
simulCluster <- function(nsamples, mean, dimension, group)
|
||||
|
@ -73,51 +72,47 @@ group_all <- rbind(
|
|||
nclusters <- 2
|
||||
|
||||
# plot data
|
||||
|
||||
ggplot(group_all, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = group)) +
|
||||
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Simulated data in two overlapping groups")
|
||||
|
||||
geom_point(aes(colour = group)) +
|
||||
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Simulated data in two overlapping groups")
|
||||
|
||||
# assign data
|
||||
mydata <- group_all[, 1:2]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(), it works on R, MRO, or MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works on MRS only
|
||||
# ----------------------------------------------------------------------------
|
||||
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
|
||||
numClusters = nclusters, algorithm = "lloyd")
|
||||
numClusters = nclusters, algorithm = "lloyd")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# compare the cluster assignments between kmeans() and rxKmeans(): the same
|
||||
# the code below should be run on MRS due to the use of "rx" commands
|
||||
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# the code below should be run on MRS due to the use of "rx" commands
|
||||
|
||||
# save a dataset in XDF format
|
||||
dataXDF = "testData.xdf"
|
||||
dataXDF = tempfile(fileext = ".xdf")
|
||||
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
|
||||
# rxKmeans
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF, numClusters = nclusters,
|
||||
algorithm = "lloyd", outFile = dataXDF,
|
||||
outColName = "cluster", overwrite = TRUE)
|
||||
|
||||
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters, algorithm = "lloyd",
|
||||
outFile = dataXDF, outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
|
||||
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
|
||||
|
||||
# append cluster assignment from kmeans
|
||||
# append cluster assignment from kmeans and rxKmeans
|
||||
mydata_clusters <- cbind(
|
||||
group_all,
|
||||
kmeans.cluster = factor(fit.kmeans$cluster),
|
||||
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
|
||||
|
||||
|
||||
# compare the cluster assignments between kmeans and rxKmeans
|
||||
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
|
||||
|
||||
|
@ -127,20 +122,18 @@ clustermeans.rxKmeans <- fit.kmeans$centers
|
|||
|
||||
# plot clusters from kmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = kmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by kmeans()")
|
||||
|
||||
geom_point(aes(colour = kmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by kmeans()")
|
||||
|
||||
# plot clusters from rxKmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = rxKmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by rxKmeans()")
|
||||
|
||||
geom_point(aes(colour = rxKmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by rxKmeans()")
|
||||
|
|
|
@ -1,90 +0,0 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# purpose: to demonstrate that MRS's rxKmeans() function works
|
||||
# successfully even when kmeans() does not for large datasets
|
||||
# audience: you are expected to have some prior experience with R
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# install a package if it's not already installed
|
||||
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# load packages
|
||||
|
||||
library("MASS") # to use the mvrnorm function
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# simulate cluster data for analysis, run this on R, MRO, or MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
# make sure the results can be replicated
|
||||
set.seed(0)
|
||||
|
||||
# function to simulate data
|
||||
simulCluster <- function(nsamples, mean, dimension, group)
|
||||
{
|
||||
Sigma <- diag(1, dimension, dimension)
|
||||
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
|
||||
z <- as.data.frame(x)
|
||||
z$group = group
|
||||
z
|
||||
}
|
||||
|
||||
# simulate data and append
|
||||
# modify the value for nsamples to test out the capacity limit for kmeans()
|
||||
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
|
||||
# worked on a computer with 7 GB RAM
|
||||
nsamples <- 3 * 10 ^ 7
|
||||
group_a <- simulCluster(nsamples, -1, 2, "a")
|
||||
group_b <- simulCluster(nsamples, 1, 2, "b")
|
||||
group_all <- rbind(group_a, group_b)
|
||||
|
||||
nclusters <- 2
|
||||
|
||||
# save data
|
||||
mydata = group_all[, 1:2]
|
||||
write.csv(group_all, "simData.csv", row.names = FALSE)
|
||||
dataCSV = "simData.csv"
|
||||
dataXDF = "simData.xdf"
|
||||
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(), it doesn't work when data is large enough
|
||||
# ----------------------------------------------------------------------------
|
||||
system.time(
|
||||
{
|
||||
fit <- kmeans(mydata, nclusters,
|
||||
iter.max = 1000,
|
||||
algorithm = "Lloyd")
|
||||
})
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works even if kmeans() does not
|
||||
# ----------------------------------------------------------------------------
|
||||
system.time(
|
||||
{
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters,
|
||||
algorithm = "lloyd",
|
||||
outFile = dataXDF,
|
||||
outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
})
|
||||
|
|
@ -2,37 +2,33 @@
|
|||
# purpose: to demonstrate that MRS's rxKmeans() function works
|
||||
# successfully even when kmeans() does not for large datasets
|
||||
# audience: you are expected to have some prior experience with R
|
||||
#
|
||||
# NOTE: On a computer with less than 7GB of RAM available, this
|
||||
# script may not be able to run to completion.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed
|
||||
# check if Microsoft R Server is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
if (require("RevoScaleR")) {
|
||||
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
|
||||
message("RevoScaleR package is succesfully loaded.")
|
||||
} else {
|
||||
message("Can't find RevoScaleR package...")
|
||||
message("If you have Microsoft R Server installed,")
|
||||
message("please switch the R engine")
|
||||
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
|
||||
message("If Microsoft R Server is not installed,")
|
||||
message("please download it from here:")
|
||||
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# install a library if it's not already installed
|
||||
# ----------------------------------------------------------------------------
|
||||
(if (!require("ggplot2")) install.packages("ggplot2"))
|
||||
library("ggplot2")
|
||||
(if (!require("MASS")) install.packages("MASS"))
|
||||
library("MASS") # used for plotting
|
||||
# install a package if it's not already installed
|
||||
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# load packages
|
||||
library("MASS") # to use the mvrnorm function
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# simulate cluster data for analysis, run this on R, MRO, or MRS
|
||||
|
@ -51,9 +47,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
|
|||
}
|
||||
|
||||
# simulate data and append
|
||||
# Modify the value for nsamples to test out the capacity limit for kmeans().
|
||||
# kmeans() failed when nsamples is 3*10^7 but rxKmeans()
|
||||
# worked on a computer with 7 GB RAM
|
||||
# modify the value for nsamples to test out the capacity limit for kmeans()
|
||||
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
|
||||
# but rxKmeans() worked
|
||||
nsamples <- 3 * 10 ^ 7
|
||||
group_a <- simulCluster(nsamples, -1, 2, "a")
|
||||
group_b <- simulCluster(nsamples, 1, 2, "b")
|
||||
|
@ -63,30 +59,32 @@ nclusters <- 2
|
|||
|
||||
# save data
|
||||
mydata = group_all[, 1:2]
|
||||
write.csv(group_all, "simData.csv", row.names = FALSE)
|
||||
dataCSV = "simData.csv"
|
||||
dataXDF = "simData.xdf"
|
||||
dataCSV = tempfile(fileext = ".csv")
|
||||
dataXDF = tempfile(fileext = ".xdf")
|
||||
write.csv(group_all, dataCSV, row.names = FALSE)
|
||||
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(), it doesn't work when data is large enough
|
||||
# ----------------------------------------------------------------------------
|
||||
system.time(
|
||||
{
|
||||
fit <- kmeans(mydata, nclusters,
|
||||
iter.max = 1000,
|
||||
algorithm = "Lloyd")
|
||||
})
|
||||
system_time_R <-
|
||||
system.time(
|
||||
{
|
||||
fit <- kmeans(mydata, nclusters,
|
||||
iter.max = 1000,
|
||||
algorithm = "Lloyd")
|
||||
})
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works even if kmeans() does not
|
||||
# ----------------------------------------------------------------------------
|
||||
system.time(
|
||||
{
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters,
|
||||
algorithm = "lloyd",
|
||||
outFile = dataXDF,
|
||||
outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
})
|
||||
system_time_MRS <-
|
||||
system.time(
|
||||
{
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters,
|
||||
algorithm = "lloyd",
|
||||
outFile = dataXDF,
|
||||
outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
})
|
||||
|
|
|
@ -1,165 +0,0 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# purpose: to demonstrate the speed differences across
|
||||
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
|
||||
# audience: you are expected to have some prior experience with R
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server (RRE 8.0) is installed
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# install a package if it's not already installed
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
library("MASS") # to use the mvrnorm function
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# run the following code on R, MRO, and MRS and
|
||||
# notice the speed improvement with MRO and MRS over R
|
||||
# ----------------------------------------------------------------------------
|
||||
# the code in this section can be found at the following address
|
||||
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
|
||||
|
||||
# print the default number of threads if MKL library is installed
|
||||
if (require("RevoUtilsMath"))
|
||||
{
|
||||
print(paste("The number of threads is:", getMKLthreads()))
|
||||
}
|
||||
|
||||
# Initialization
|
||||
set.seed(1)
|
||||
m <- 10000
|
||||
n <- 5000
|
||||
A <- matrix(runif(m * n), m, n)
|
||||
|
||||
# Matrix multiply
|
||||
system.time(B <- crossprod(A))
|
||||
|
||||
# Cholesky Factorization
|
||||
system.time(C <- chol(B))
|
||||
|
||||
# Singular Value Decomposition
|
||||
m <- 10000
|
||||
n <- 2000
|
||||
A <- matrix(runif(m * n), m, n)
|
||||
system.time(S <- svd(A, nu = 0, nv = 0))
|
||||
|
||||
# Principal Components Analysis
|
||||
m <- 10000
|
||||
n <- 2000
|
||||
A <- matrix(runif(m * n), m, n)
|
||||
system.time(P <- prcomp(A))
|
||||
|
||||
# Linear Discriminant Analysis
|
||||
library("MASS")
|
||||
g <- 5
|
||||
k <- round(m / 2)
|
||||
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
|
||||
train <- sample(1:m, k)
|
||||
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# run an analysis that does not involve matrix to show that
|
||||
# the speed is similar on R, MRO and MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
set.seed(0)
|
||||
|
||||
# function to simulate data
|
||||
simulCluster <- function(nsamples, mean, dimension, group)
|
||||
{
|
||||
Sigma <- diag(1, dimension, dimension)
|
||||
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
|
||||
z <- as.data.frame(x)
|
||||
z$group = group
|
||||
z
|
||||
}
|
||||
|
||||
# simulate data
|
||||
nsamples <- 10 ^ 7 # this was used on different platforms
|
||||
# nsamples <- 1000 # for testing purpose
|
||||
group_a <- simulCluster(nsamples, -1, 2, "a")
|
||||
group_b <- simulCluster(nsamples, 1, 2, "b")
|
||||
group_all <- rbind(group_a, group_b)
|
||||
|
||||
nclusters <- 2
|
||||
|
||||
mydata = group_all[, 1:2]
|
||||
# K-Means Cluster Analysis
|
||||
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
|
||||
iter.max = 1000,
|
||||
algorithm = "Lloyd"))
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# compare the speed of kmeans() with that of rxKmeans()
|
||||
# for different data sizes
|
||||
# ----------------------------------------------------------------------------
|
||||
myresult <- data.frame(nsamples = integer(), time_r = double(),
|
||||
time_rre = double())
|
||||
|
||||
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
|
||||
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
|
||||
|
||||
for (nsamples in nsamples_list)
|
||||
{
|
||||
# simulate data and append
|
||||
group_a <- simulCluster(nsamples, -1, 2, "a")
|
||||
group_b <- simulCluster(nsamples, 1, 2, "b")
|
||||
group_all <- rbind(group_a, group_b)
|
||||
mydata = group_all[, 1:2]
|
||||
|
||||
nclusters <- 2
|
||||
|
||||
# kmeans with R
|
||||
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
|
||||
iter.max = 1000,
|
||||
algorithm = "Lloyd"))
|
||||
|
||||
# kmeans with MRS
|
||||
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
|
||||
numClusters = nclusters,
|
||||
algorithm = "lloyd"))
|
||||
|
||||
# combine
|
||||
newrow <- data.frame(nsamples = nsamples,
|
||||
time_r = as.numeric(system_time_r[3]),
|
||||
time_rre = as.numeric(system_time_rre[3]))
|
||||
myresult <- rbind(myresult, newrow)
|
||||
|
||||
}
|
||||
|
||||
myresult$nsamples <- 2 * myresult$nsamples
|
||||
mydata <- myresult
|
||||
mydata$nsamples_log <- log10(mydata$nsamples)
|
||||
|
||||
mydata
|
||||
|
||||
ggplot(data = mydata, aes(x = nsamples_log)) +
|
||||
geom_point(aes(y = time_r, colour = "kmeans")) +
|
||||
geom_line(aes(y = time_r, colour = "kmeans")) +
|
||||
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
|
||||
geom_line(aes(y = time_rre, colour = "rxKmeans")) +
|
||||
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
|
||||
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
|
||||
xlab("log10(number of samples)") +
|
||||
ylab("time in seconds") +
|
||||
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
|
||||
|
|
@ -8,28 +8,27 @@
|
|||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server (RRE 8.0) is installed
|
||||
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
if (require("RevoScaleR")) {
|
||||
library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server.
|
||||
message("RevoScaleR package is succesfully loaded.")
|
||||
} else {
|
||||
message("Can't find RevoScaleR package...")
|
||||
message("If you have Microsoft R Server installed,")
|
||||
message("please switch the R engine")
|
||||
message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.")
|
||||
message("If Microsoft R Server is not installed,")
|
||||
message("please download it from here:")
|
||||
message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.")
|
||||
if (!require("RevoScaleR"))
|
||||
{
|
||||
stop(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# install a library if it's not already installed
|
||||
# ----------------------------------------------------------------------------
|
||||
(if (!require("ggplot2")) install.packages("ggplot2"))
|
||||
library("ggplot2")
|
||||
(if (!require("MASS")) install.packages("MASS"))
|
||||
library("MASS") # used for plotting
|
||||
# install a package if it's not already installed
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# load libraries
|
||||
library("MASS") # to use the mvrnorm function
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# run the following code on R, MRO, and MRS and
|
||||
|
@ -69,6 +68,7 @@ A <- matrix(runif(m * n), m, n)
|
|||
system.time(P <- prcomp(A))
|
||||
|
||||
# Linear Discriminant Analysis
|
||||
library("MASS")
|
||||
g <- 5
|
||||
k <- round(m / 2)
|
||||
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
|
||||
|
@ -77,7 +77,7 @@ system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
|
|||
|
||||
# ----------------------------------------------------------------------------
|
||||
# run an analysis that does not involve matrix to show that
|
||||
# the speed is similar on R, MRO and MRS
|
||||
# the speed is similar on R, MRO and MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
set.seed(0)
|
||||
|
||||
|
@ -110,9 +110,11 @@ system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
|
|||
# compare the speed of kmeans() with that of rxKmeans()
|
||||
# for different data sizes
|
||||
# ----------------------------------------------------------------------------
|
||||
# to save timing results
|
||||
myresult <- data.frame(nsamples = integer(), time_r = double(),
|
||||
time_rre = double())
|
||||
|
||||
# list of sample sizes
|
||||
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
|
||||
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
|
||||
|
||||
|
@ -147,9 +149,9 @@ for (nsamples in nsamples_list)
|
|||
myresult$nsamples <- 2 * myresult$nsamples
|
||||
mydata <- myresult
|
||||
mydata$nsamples_log <- log10(mydata$nsamples)
|
||||
|
||||
mydata
|
||||
|
||||
# generate plot
|
||||
ggplot(data = mydata, aes(x = nsamples_log)) +
|
||||
geom_point(aes(y = time_r, colour = "kmeans")) +
|
||||
geom_line(aes(y = time_r, colour = "kmeans")) +
|
||||
|
@ -159,4 +161,4 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
|
|||
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
|
||||
xlab("log10(number of samples)") +
|
||||
ylab("time in seconds") +
|
||||
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
|
||||
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
|
Загрузка…
Ссылка в новой задаче