Brandon Rohrer 2016-03-15 16:33:24 -04:00
Parent 226842a54a 5e84bf09e4
Commit 25b2a34bf0
14 changed files with 559 additions and 481 deletions

View file

@@ -1,27 +1,30 @@
# Regression: Demand estimation with Microsoft R Server
# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example demonstrates the feature engineering process for building a regression model to predict
# bike rental demand.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday, etc.
#
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour. The lag features we add to the data set are the number of bikes that
# were rented in each of the previous 12 hours, which captures the very recent demand for the bikes.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than one minute.
#
#--------------------------------------------------------------------------------------------------------------
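# For intuition, here is a minimal base-R sketch of the same lag construction on
# a small in-memory data frame (toy values, illustrative only); the RevoScaleR
# code below does this chunk-wise so it scales to data that does not fit in memory.
sketch_df <- data.frame(cnt = c(16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84))
for (k in 1:12) {
  sketch_df[[paste0("cnt_", k, "hour")]] <- c(rep(sketch_df$cnt[1], k),
                                              head(sketch_df$cnt, -k))
}
head(sketch_df)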
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@@ -35,8 +38,8 @@ if (!require("RevoScaleR")) {
quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
# Initialize some variables.
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
# Create a temporary directory to store the intermediate .xdf files.
@@ -44,115 +47,89 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
#---------------------------Step 1: Import the Bike Data---------------------------
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
# Remove timestamps and all columns that are
# part of the label.
# Remove timestamps and all columns that are part of the label.
varsToDrop = c("instant", "dteday", "casual", "registered"),
# Define year, weather conditions and season columns
# as categorical.
# Define year, weather conditions and season columns as categorical.
colInfo = list(yr = list(type = "factor"),
weathersit = list(type = "factor"),
season = list(type = "factor")))
# Step 2: Feature Engineering
# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
# lag feature names as lagN
varLagNameVector <- paste("lag", nLagsVector, sep="")
#---------------------------Step 2: Feature Engineering---------------------------
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) { # function for computing lag features.
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
numLags <- length(nLagsVector) # total number of lags that need to be added
varLagNameVector <- paste("cnt_", nLagsVector, "hour", sep="") # lag feature names as lagN
if (!.rxIsTestChunk)
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData", lagData)
}
if (!.rxIsTestChunk)
{
for (iL in 1:numLags)
{
for (iL in 1:numLags) {
# Number of rows in the current chunk.
numRowsInChunk <- length(dataList[[varName]])
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep = "")
# Retrieve lag data from the previous chunk.
lagData <- .rxGet("storeLagData")
# Concatenate lagData and the "cnt" feature.
allData <- c(lagData[[varLagName]], dataList[[varName]])
# Take the first N rows of allData, where N is the total
# number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save last nlag rows as the new lagData to be used
# to process in the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
nlags <- nLagsVector[iL]
varLagName <- paste("cnt_", nlags, "hour", sep = "")
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
# Take the first N rows of allData, where N is the number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save the last nlags rows as the new lagData, used to process the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
return(dataList)
}
return(dataList)
}
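# To make the chunk handoff concrete: with lag 1, if one chunk ends with
# cnt = (..., 10) and the next chunk begins with cnt = (12, ...), the first
# lag-1 value in the next chunk is 10, carried across chunks via the
# "storeLagData" object kept in the transform environment.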
# Apply the "computeLagFeatures" function to the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt",
nLagsVector = seq(12)),
transformVars = "cnt", overwrite=TRUE)
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
transformVars = "cnt", overwrite = TRUE)
# Step 3: Prepare Training and Test Datasets
# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
splitByFactor = "yr", overwrite = TRUE,
reportProgress = 0, verbose = 0)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr",
        overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
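# For reference, the equivalent split on an in-memory data frame would be a
# simple subset (a sketch; `df` is a hypothetical stand-in for the lag data,
# with yr coded 0 for 2011 and 1 for 2012):
#   train_df <- df[df$yr == "0", ]
#   test_df  <- df[df$yr == "1", ]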
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
seed = 123)
dForest <- rxDForest(modelFormula, data = train,
method = "anova", maxDepth = 10, nTree = 50,
importance = TRUE, seed = 123)
# Plot a dotchart of the variable importance as measured by the decision forest.
rxVarImpPlot(dForest, main = "Variable Importance of Decision Forest")
# Step 5: Predict over new data and review the model performance
# Plot the out-of-bag error rate against the number of trees built in the decision forest model.
plot(dForest, main = "OOB Error Rate vs Number of Trees")
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
# Predict the bike rental demand on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
computeResiduals = TRUE)
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict, summaryStats = "Mean",
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
cnt_Resid_2 = cnt_Resid^2,
cnt_rel = abs(cnt_Resid)/cnt)
)$sDataFrame
# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2],
RMSE = sqrt(sum[2, 2]),
RAE = sum[3, 2])
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
# Review the measures.
measures
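# The same measures computed from plain vectors (an illustrative sketch with
# toy values, mirroring the transforms above):
actual <- c(100, 250, 40) # hypothetical observed counts
resid <- c(-12, 30, 5)    # hypothetical residuals (actual - predicted)
mean(abs(resid))          # MAE
sqrt(mean(resid^2))       # RMSE
mean(abs(resid) / actual) # RAE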

View file

@@ -1,67 +1,64 @@
# Regression: Demand estimation with Microsoft R Server
# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example is a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.
# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday, etc.
#
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour.
#
# We built four models using the same algorithm, but with four different training datasets. The four training
# datasets that we constructed were all based on the same raw input data, but we added different additional
# features to each training set.
#
# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days
# at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
#
# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.
# The four training datasets were built by combining the feature set
# as follows:
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
#
# The four training datasets were built by combining the feature set as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than two minutes.
#
#--------------------------------------------------------------------------------------------------------------
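# For intuition: the day and week lags below generalize the hourly lag by
# stepping back in larger row intervals, 24 rows for a day and 168 for a week
# (a runnable base-R sketch on an illustrative toy series):
sketch_cnt <- rep(1:24, times = 14) # two weeks of toy hourly counts
cnt_1day <- c(rep(sketch_cnt[1], 24), head(sketch_cnt, -24))
cnt_1week <- c(rep(sketch_cnt[1], 168), head(sketch_cnt, -168))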
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
# Create a temporary directory to store the intermediate .xdf files.
@@ -70,36 +67,25 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")
# Step 1: Import Data
#---------------------------Step 1: Import Data---------------------------
# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
varsToDrop = c("instant",
"dteday",
"casual",
"registered"))
varsToDrop = c("instant", "dteday", "casual", "registered"))
# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"),
overwrite = TRUE)
# Step 2: Feature Engineering
# Edit Metadata: Define year, weather conditions and season columns as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
#---------------------------Step 2: Feature Engineering---------------------------
# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function (dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
computeLagFeatures <- function(dataList) {
numLags <- length(nLagsVector) # total number of lags that need to be added
for (iL in 1:numLags) {
nlag <- nLagsVector[iL]
varLagName <- paste("demand.", nlag, unit, sep = "")
varLagName <- paste("cnt_", nlag, unit, sep = "")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
@@ -107,89 +93,67 @@ computeLagFeatures <- function (dataList) {
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
}
# Determine the current row to start processing the data
# between chunks.
startRow <- .rxStartRow - numRowsToRead
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow,
numRows = numRowsToRead,
returnDataFrame = FALSE)
varsToKeep = baseVar,
startRow = startRow, numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = 1,
numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
previousRowsDataList[[baseVar]],
dataList[[baseVar]])[1:numRowsInChunk]
varsToKeep = baseVar,
startRow = 1, numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
                            previousRowsDataList[[baseVar]],
                            dataList[[baseVar]])[1:numRowsInChunk]
}
return(dataList)
}
# Create a function to add lag features a set of columns at a time.
addLag <- function(inputData, outputFileBase) {
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
sep="")
outputFileHour <- paste(outputFileBase, "_hour", ".xdf", sep = "")
outputFileHourDay <- paste(outputFileBase, "_hour_day", ".xdf", sep = "")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf", sep = "")
# Initialize some fixed values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Add number of bikes that were rented in each of the previous 12 hours
# (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",
nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days
# at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "day",
nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"),
overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,
outFile = outputFileHourDayWeek,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "week",
nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour", nLagsVector = seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite = TRUE)
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "day", nLagsVector = seq(12),
interval = dayInterval),
transformVars = c("cnt"), overwrite = TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay, outFile = outputFileHourDayWeek, transformFunc = computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "week", nLagsVector = seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite = TRUE)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
# Set A = weather + holiday + weekday + weekend features for the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs,
outputFileBase = outFileLag)
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
# Step 3: Prepare Training and Test Datasets
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@@ -202,62 +166,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs,
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
newDayFeatures <- paste("cnt_", seq(12), "day", sep = "")
newWeekFeatures <- paste("cnt_", seq(12), "week", sep = "")
## Set A:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set B:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr",
newDayFeatures,
newWeekFeatures))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, as well as the lag features for Sets C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set C:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, as well as the lag features for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
## Set D:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
method = "anova", maxDepth = 10, nTree = 20,
importance = TRUE, seed = 123)
# Plot four dotcharts of variable importance as measured by the four decision forest models.
par(mfrow = c(2, 2))
rxVarImpPlot(dForestA_mrs, main = "Variable Importance of Set A")
rxVarImpPlot(dForestB_mrs, main = "Variable Importance of Set B")
rxVarImpPlot(dForestC_mrs, main = "Variable Importance of Set C")
rxVarImpPlot(dForestD_mrs, main = "Variable Importance of Set D")
# Step 5: Predict over new data
# Plot the out-of-bag error rate against the number of trees built in each decision forest model.
plot(dForestA_mrs, main = "OOB Error Rate vs Number of Trees: Set A")
plot(dForestB_mrs, main = "OOB Error Rate vs Number of Trees: Set B")
plot(dForestC_mrs, main = "OOB Error Rate vs Number of Trees: Set C")
plot(dForestD_mrs, main = "OOB Error Rate vs Number of Trees: Set D")
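# A small assumed addition (not in the original script): reset the plotting
# layout so any later plot is not drawn into the 2 x 2 grid configured above.
par(mfrow = c(1, 1))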
#---------------------------Step 5: Predict over new data---------------------------
## Set A:
# Predict the bike rental demand on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@@ -286,22 +250,17 @@ rxPredict(dForestD_mrs, data = test_mrs,
residVarNames = "cnt_Resid_D",
overwrite = TRUE, computeResiduals = TRUE)
# Prepare outputs
#---------------------------Prepare outputs---------------------------
## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
data = testA_mrs, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
cnt_Resid_A_2 = cnt_Resid_A^2,
cnt_rel_A = abs(cnt_Resid_A)/cnt)
)$sDataFrame
## Set B, C & D:
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
sum <- rxSummary(~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
                   cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
                   cnt_Resid_D_abs + cnt_Resid_D_2 + cnt_rel_D,
data = test_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
cnt_Resid_B_2 = cnt_Resid_B^2,
@@ -312,7 +271,7 @@ sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_D_abs = abs(cnt_Resid_D),
cnt_Resid_D_2 = cnt_Resid_D^2,
cnt_rel_D = abs(cnt_Resid_D)/cnt)
)$sDataFrame
)$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@@ -323,10 +282,7 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
RMSE = c(sqrt(sumA[2, 2]),
sqrt(sum[2, 2]),
sqrt(sum[5, 2]),
sqrt(sum[8, 2])),
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
# Review the measures.

View file

@@ -7,7 +7,7 @@
library("RCurl", quietly = TRUE)
# A URL contains the raw data.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
# Download data from the URL.

View file

@@ -17,7 +17,7 @@ if (!require("RevoScaleR")) {
}
# A URL contains the raw data.
github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
# Create a temporary directory to store the intermediate .xdf files.

View file

@@ -16,6 +16,9 @@
# if the flight was on time.
# The following scripts include five basic steps of building this example using Microsoft R Server.
# This execution might require more than one minute.
#
#------------------------------------------------------------------------------------------------------------------------------------
#---------------------------Step 0: Get Started-------------------------------
@@ -36,7 +39,7 @@ if (!require("RevoScaleR")) {
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -23,7 +23,7 @@
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
# This execution might require more than one minute.
#
#------------------------------------------------------------------------------------------------------------------------------------
@@ -46,7 +46,7 @@ if (!require("RevoScaleR")) {
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -23,14 +23,14 @@
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
#
# The following scripts include five basic steps of building this example using open source R.
#
# This execution might require more than one minute.
#
#-------------------------------------------------------------------------------------------------------------------------
#---------------------------Step 0: Get Started---------------------------
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")

View file

@@ -4,8 +4,8 @@
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# NOTE: In order to run this script you'll need to have an Azure ML
# workspace, with its workspace ID and key.
# NOTE: In order to run the last part of the script you'll need to have
# an Azure ML workspace, with its workspace ID and key.
# For details about Azure ML, go to http://studio.azureml.net/
# ----------------------------------------------------------------------------
# Enter your Azure ML Studio workspace info here before continuing.
@@ -23,8 +23,13 @@ library("MASS") # to use the Boston dataset
# ----------------------------------------------------------------------------
# fit a model and check model performance
# ----------------------------------------------------------------------------
# check the data
head(Boston)
ggplot(Boston, aes(x = medv)) +
geom_histogram(binwidth = 5) +
ggtitle("Histogram of Response Variable")
# fit a model using all variables except medv as predictors
# fit a model using medv as response and others as predictors
lm1 <- lm(medv ~ ., data = Boston)
# check model performance
@@ -50,52 +55,63 @@ print(paste("Relative Squared Error: ",
# ----------------------------------------------------------------------------
# publish and consume a web service
# ----------------------------------------------------------------------------
# workspace information
ws <- workspace(
id = ws_id,
auth = auth_token)
# the following works only with valid AzureML workspace information
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
}
# get workspace information
AML <- 0
tryCatch(
{
ws <- workspace(id = ws_id, auth = auth_token)
AML <- 1
},
error = function(cond){
message("Azure ML workspace information was not valid.")
}
)
# test the prediction function
newdata <- Boston[1, 1:13]
print(mypredict(newdata))
# Publish the service
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
# consume web service - 2nd approach
# retrieve web service information for later use
service_id <- ep$WebServiceId
# define endpoint
ep_price_pred <- endpoints(ws, service_id)
# consume
consume(ep_price_pred, newdata)
# ----------------------------------------------------------------------------
# update a web service and then consume
# ----------------------------------------------------------------------------
# define function for testing purpose
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)
if (AML) {
# define predict function
mypredict <- function(newdata) {
res <- predict(lm1, newdata)
res
}
# test the prediction function
newdata <- Boston[1, 1:13]
print(mypredict(newdata))
# Publish the service
ep <- publishWebService(ws = ws, fun = mypredict,
name = "HousePricePrediction", inputSchema = newdata)
# consume web service - 1st approach
pred <- consume(ep, newdata)
pred
# consume web service - 2nd approach
# retrieve web service information for later use
service_id <- ep$WebServiceId
# define endpoint
ep_price_pred <- endpoints(ws, service_id)
# consume
consume(ep_price_pred, newdata)
# define function for testing purpose
mypredictnew <- function(newdata) {
res <- predict(lm1, newdata) + 100
res
}
# update service with the new function
ep_update <- updateWebService(
ws = ws,
fun = mypredictnew,
inputSchema = newdata,
serviceId = ep$WebServiceId)
# consume the updated web service
consume(ep_price_pred, newdata)
} else {
message("Azure ML webservice is not deployed because ",
"the workspace information is invalid")
}
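# Optional cleanup (an assumption, not in the original script: the AzureML
# package provides deleteWebService()); remove the demo service when finished.
# if (AML) deleteWebService(ws, "HousePricePrediction")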

View file

@@ -0,0 +1,60 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the commonalities and differences among functions
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
"For example, in R Tools for Visual Studio: \n",
"R Tools -> Options -> R Engine. \n",
"If Microsoft R Server is not installed, you can download it from: \n",
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load libraries
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm vs rxGlm
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# check the response variable
ggplot(mtcars, aes(x = vs)) +
geom_histogram(binwidth = 0.3) +
ggtitle("Distribution of Response Values")
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# fit the same model with rxGlm() if RevoScaleR is installed
if (RRE){
# check the data
head(mtcars)
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
} else {
print("rxGlm was not run becauase the RevoScaleR package is not available")
}
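# A quick sanity check on the conventional fit (base R): predicted
# probabilities from the logistic model for the first rows of mtcars.
head(predict(logistic1, type = "response"))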

View file

@@ -10,9 +10,11 @@
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -31,27 +33,10 @@ library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# glm (R, MRO, MRS) vs rxGlm (MRS)
# ----------------------------------------------------------------------------
# check the data
head(mtcars)
# fit a model with glm(), this can be run on R, MRO, or MRS
# predict V engine vs straight engine with weight and displacement
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic1)
# check the data
head(mtcars)
# fit the same model with rxGlm(), this can be run on MRS only
# predict V engine vs straight engine with weight and displacement
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
summary(logistic2)
# ----------------------------------------------------------------------------
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
# simulate data, it works on R, MRO, or MRS
# ----------------------------------------------------------------------------
# make sure the results can be replicated
set.seed(112)
set.seed(121)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -83,42 +68,18 @@ ggplot(group_all, aes(x = V1, y = V2)) +
# assign data
mydata <- group_all[, 1:2]
# ----------------------------------------------------------------------------
# kmeans
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(), it works on R, MRO, or MRS
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
# cluster analysis with rxKmeans(), it works on MRS only
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters, algorithm = "lloyd")
# ----------------------------------------------------------------------------
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
# ----------------------------------------------------------------------------
# the code below should be run on MRS due to the use of "rx" commands
# save a dataset in XDF format
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
kmeans.cluster = factor(fit.kmeans$cluster))
# get cluster means
clustermeans.kmeans <- fit.kmeans$centers
clustermeans.rxKmeans <- fit.kmeans$centers
# plot clusters from kmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
@@ -129,11 +90,47 @@ ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by kmeans()")
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works on MRS only
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans() if RevoScaleR is installed
if (RRE){
# save a dataset in XDF format
dataXDF = tempfile(fileext = ".xdf")
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
# rxKmeans
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
numClusters = nclusters, algorithm = "lloyd",
outFile = dataXDF, outColName = "cluster",
overwrite = TRUE)
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
# append cluster assignment from kmeans and rxKmeans
mydata_clusters <- cbind(
group_all,
kmeans.cluster = factor(fit.kmeans$cluster),
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
# compare the cluster assignments between kmeans and rxKmeans
message("\nComparing cluster assignments between kmeans and rxKmeans:")
print(with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster)))
# get cluster means
clustermeans.rxKmeans <- clust$centers # use the rxKmeans fit, not the kmeans fit
# plot clusters from rxKmeans
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
geom_point(aes(colour = rxKmeans.cluster)) +
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Clusters found by rxKmeans()")
} else {
print("rxKmeans was not run because the RevoScaleR package is not available")
}

View file

@@ -10,9 +10,11 @@
# ----------------------------------------------------------------------------
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -23,12 +25,12 @@ if (!require("RevoScaleR"))
}
# install a package if it's not already installed
if (!require("ggplot2", quietly = TRUE))
install.packages("ggplot2")
# load packages
library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# simulate cluster data for analysis, run this on R, MRO, or MRS
@@ -50,6 +52,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
# modify the value for nsamples to test out the capacity limit for kmeans()
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
# but rxKmeans() worked
message("If the sample size is large, it will take some time to finish. \n",
"You can use a smaller value for nsamples, say 1000, \n",
"to test the program.")
nsamples <- 3 * 10 ^ 7
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
@@ -57,16 +62,26 @@ group_all <- rbind(group_a, group_b)
nclusters <- 2
# plot sample data
plot_data <- group_all[sample(nrow(group_all), 1000),]
ggplot(plot_data, aes(x = V1, y = V2)) +
geom_point(aes(colour = group)) +
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
xlim(-5, 5) + ylim(-5, 5) +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0) +
ggtitle("Simulated data in two overlapping groups")
# save data
mydata = group_all[, 1:2]
dataCSV = tempfile(fileext = ".csv")
dataXDF = tempfile(fileext = ".xdf")
write.csv(group_all, dataCSV, row.names = FALSE)
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
# ----------------------------------------------------------------------------
# cluster analysis with kmeans(); it can fail when the data is large enough
# ----------------------------------------------------------------------------
# this can be run on R, MRO, or MRS
system_time_R <-
system.time(
{
@@ -76,9 +91,12 @@ system_time_R <-
})
# ----------------------------------------------------------------------------
# cluster analysis with rxKmeans(), it works even if kmeans() does not
# cluster analysis with rxKmeans() on MRS, it works even if kmeans() does not
# ----------------------------------------------------------------------------
system_time_MRS <-
# cluster analysis with rxKmeans() if RevoScaleR is installed
if (RRE){
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
system.time(
{
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
@@ -88,3 +106,6 @@ system_time_MRS <-
outColName = "cluster",
overwrite = TRUE)
})
} else {
print("rxKmeans was not run becauase the RevoScaleR package is not available")
}

View file

@@ -0,0 +1,53 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
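# crossprod(A) computes t(A) %*% A in a single optimized BLAS call; this is
# exactly the kind of operation where the multithreaded MKL shipped with
# MRO/MRS speeds things up over stock R.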
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))

View file

@@ -0,0 +1,44 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed similarities for kmeans across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# load libraries
# ----------------------------------------------------------------------------
library("MASS") # to use the mvrnorm function
# ----------------------------------------------------------------------------
# run the following analysis that does not involve matrix manipulation
# to observe that the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
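# Report the elapsed (wall-clock) time: element 3 of the proc_time object
# returned by system.time() holds elapsed seconds.
print(paste("kmeans() elapsed seconds:", round(system_time_r[[3]], 2)))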

View file

@@ -1,6 +1,6 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# purpose: to compare the speed of kmeans() with that of rxKmeans()
# on Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
@@ -8,11 +8,13 @@
# https://github.com/lixzhang/R-MRO-MRS
# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -31,55 +33,16 @@ library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting
# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# compare the speed of kmeans with that of rxKmeans on MRS
# for different data sizes
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}
# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)
# Matrix multiply
system.time(B <- crossprod(A))
# Cholesky Factorization
system.time(C <- chol(B))
# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))
# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))
# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix operations to show that
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -91,33 +54,6 @@ simulCluster <- function(nsamples, mean, dimension, group)
z
}
# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
nclusters <- 2
mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
# ----------------------------------------------------------------------------
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)
for (nsamples in nsamples_list)
{
# simulate data and append
@@ -125,25 +61,29 @@ for (nsamples in nsamples_list)
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
mydata = group_all[, 1:2]
nclusters <- 2
# kmeans with R
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
iter.max = 1000,
algorithm = "Lloyd"))
# kmeans with MRS
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
if (RRE) {
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
                                                  numClusters = nclusters,
                                                  algorithm = "lloyd"))
} else {
system_time_rre <- c(NA, NA, NA)
}
# combine
newrow <- data.frame(nsamples = nsamples,
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
myresult <- rbind(myresult, newrow)
}
myresult$nsamples <- 2 * myresult$nsamples
@@ -152,7 +92,8 @@ mydata$nsamples_log <- log10(mydata$nsamples)
mydata
# generate plot
ggplot(data = mydata, aes(x = nsamples_log)) +
if (RRE){
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
@@ -161,4 +102,14 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
} else {
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("Time for kmeans. To add time for rxKmean, use the RRE engine")
}