Mirror of https://github.com/microsoft/RTVS-docs.git
merged
Commit
25b2a34bf0
|
@ -1,27 +1,30 @@
|
|||
# Regression: Demand estimation with Microsoft R Server
|
||||
|
||||
# This example demonstrates the feature engineering process for building
|
||||
# a regression model to predict bike rental demand.
|
||||
|
||||
# The dataset contains 17,379 rows and 17 columns, each row representing
|
||||
# the number of bike rentals within a specific hour of a day in the years
|
||||
# 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||
# and wind speed) were included in this raw feature set, and
|
||||
# the dates were categorized as holiday vs. weekday, etc.
|
||||
|
||||
# The field to predict is "cnt", which contains a count value ranging
|
||||
# from 1 to 977, representing the number of bike rentals within
|
||||
# a specific hour. The lag features we add in the data set are the number
|
||||
# of bikes that were rented in each of the previous 12 hours.
|
||||
# This captures the very recent demand for the bikes.
|
||||
|
||||
# The following scripts include five basic steps of building this example
|
||||
# using Microsoft R Server.
|
||||
#-------------------------------------------------------------------------------------------------------------
|
||||
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
|
||||
#-------------------------------------------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# This example demonstrates the feature engineering process for building a regression model to predict
|
||||
# bike rental demand.
|
||||
#
|
||||
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
|
||||
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
|
||||
# weekday etc.
|
||||
#
|
||||
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
|
||||
# of bike rentals within a specific hour. The lag features we add in the data set are the number of bikes that
|
||||
# were rented in each of the previous 12 hours, which captures the very recent demand for the bikes.
|
||||
#
|
||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
||||
# This execution might require more than one minute.
|
||||
#
|
||||
#--------------------------------------------------------------------------------------------------------------
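# A minimal base-R sketch (not part of the MRS pipeline below) of the lag idea
# described above, on a toy "cnt" series. The cnt_<k>hour names mirror the ones
# created later, and the first rows are padded with the first observed value,
# just as the chunked transform in Step 2 does.
toy <- data.frame(cnt = c(16, 40, 32, 13, 1, 1, 2, 3))
for (k in 1:2) {
    toy[[paste0("cnt_", k, "hour")]] <- c(rep(toy$cnt[1], k), head(toy$cnt, -k))
}
toy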
|
||||
|
||||
|
||||
# Step 0: Get Started
|
||||
|
||||
# Check whether Microsoft R Server (RRE 8.0) is installed
|
||||
#---------------------------Step 0: Get Started-------------------------------
|
||||
# ----------------------------------------------------------------------------
|
||||
# Check if Microsoft R Server (RRE 8.0) is installed
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR")) {
|
||||
cat("RevoScaleR package does not seem to exist.
|
||||
\nThis means that the functions starting with 'rx' will not run.
|
||||
|
@ -35,8 +38,8 @@ if (!require("RevoScaleR")) {
|
|||
quit()
|
||||
}
|
||||
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
|
||||
|
||||
# Create a temporary directory to store the intermediate .xdf files.
|
||||
|
@ -44,115 +47,89 @@ td <- tempdir()
|
|||
outFileBike <- paste0(td, "/bike.xdf")
|
||||
outFileLag <- paste0(td, "/lagData.xdf")
|
||||
|
||||
|
||||
# Step 1: Import the Bike Data
|
||||
|
||||
bike <- rxImport(inData = inputFileBikeURL,
|
||||
outFile = outFileBike, overwrite = TRUE,
|
||||
#---------------------------Step 1: Import the Bike Data---------------------------
|
||||
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
|
||||
missingValueString = "M", stringsAsFactors = FALSE,
|
||||
# Remove timestamps and all columns that are
|
||||
# part of the label.
|
||||
# Remove timestamps and all columns that are part of the label.
|
||||
varsToDrop = c("instant", "dteday", "casual", "registered"),
|
||||
# Define year, weather conditions and season columns
|
||||
# as categorical.
|
||||
# Define year, weather conditions and season columns as categorical.
|
||||
colInfo = list(yr = list(type = "factor"),
|
||||
weathersit = list(type = "factor"),
|
||||
season = list(type = "factor")))
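# Optional sanity check after the import (assumes RevoScaleR is loaded, which
# the check in Step 0 already guarantees): list the variable types and a few rows.
rxGetInfo(bike, getVarInfo = TRUE, numRows = 5)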
|
||||
|
||||
|
||||
# Step 2: Feature Engineering
|
||||
|
||||
# Add the number of bikes that were rented in each of the previous
|
||||
# 12 hours as 12 lag features.
|
||||
computeLagFeatures <- function(dataList) {
|
||||
# Total number of lags that need to be added.
|
||||
numLags <- length(nLagsVector)
|
||||
# lag feature names as lagN
|
||||
varLagNameVector <- paste("lag", nLagsVector, sep="")
|
||||
#---------------------------Step 2: Feature Engineering---------------------------
|
||||
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
|
||||
computeLagFeatures <- function (dataList) { # function for computing lag features.
|
||||
|
||||
# Set the value of an object "storeLagData" in the transform environment.
|
||||
if (!exists("storeLagData"))
|
||||
{
|
||||
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
|
||||
names(lagData) <- varLagNameVector
|
||||
.rxSet("storeLagData",lagData)
|
||||
}
|
||||
numLags <- length(nLagsVector) # total number of lags that need to be added
|
||||
varLagNameVector <- paste("cnt_", nLagsVector, "hour", sep="") # lag feature names as lagN
|
||||
|
||||
if (!.rxIsTestChunk)
|
||||
# Set the value of an object "storeLagData" in the transform environment.
|
||||
if (!exists("storeLagData"))
|
||||
{
|
||||
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
|
||||
names(lagData) <- varLagNameVector
|
||||
.rxSet("storeLagData", lagData)
|
||||
}
|
||||
|
||||
if (!.rxIsTestChunk)
|
||||
{
|
||||
for (iL in 1:numLags)
|
||||
{
|
||||
for (iL in 1:numLags) {
|
||||
# Number of rows in the current chunk.
|
||||
numRowsInChunk <- length(dataList[[varName]])
|
||||
nlags <- nLagsVector[iL]
|
||||
varLagName <- paste("lag", nlags, sep = "")
|
||||
# Retrieve lag data from the previous chunk.
|
||||
lagData <- .rxGet("storeLagData")
|
||||
# Concatenate lagData and the "cnt" feature.
|
||||
allData <- c(lagData[[varLagName]], dataList[[varName]])
|
||||
# Take the first N rows of allData, where N is the total
|
||||
# number of rows in the original dataList.
|
||||
dataList[[varLagName]] <- allData[1:numRowsInChunk]
|
||||
# Save last nlag rows as the new lagData to be used
|
||||
# to process in the next chunk.
|
||||
lagData[[varLagName]] <- tail(allData, nlags)
|
||||
.rxSet("storeLagData", lagData)
|
||||
}
|
||||
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
|
||||
nlags <- nLagsVector[iL]
|
||||
varLagName <- paste("cnt_", nlags, "hour", sep="")
|
||||
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
|
||||
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
|
||||
dataList[[varLagName]] <- allData[1:numRowsInChunk] # take the first N rows of allData, where N is the total number of rows in the original dataList
|
||||
lagData[[varLagName]] <- tail(allData, nlags) # save last nlag rows as the new lagData to be used to process in the next chunk
|
||||
.rxSet("storeLagData", lagData)
|
||||
}
|
||||
return(dataList)
|
||||
}
|
||||
return(dataList)
|
||||
}
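# Plain-R illustration (separate from the rx pipeline) of the chunk-carry trick
# used above: the last nlag values of one chunk are stored and prepended to the
# next chunk, so every row can still look back nlag rows across the boundary.
chunk1 <- c(10, 20, 30)
chunk2 <- c(40, 50, 60)
nlagDemo <- 2
carryDemo <- tail(chunk1, nlagDemo)           # saved after processing chunk 1
c(carryDemo, chunk2)[1:length(chunk2)]        # lag-2 values for chunk 2: 20 30 40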
|
||||
|
||||
# Apply the "computeLagFeatures" on the bike data.
|
||||
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
|
||||
transformFunc = computeLagFeatures,
|
||||
transformObjects = list(varName = "cnt",
|
||||
nLagsVector = seq(12)),
|
||||
transformVars = "cnt", overwrite=TRUE)
|
||||
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
|
||||
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
|
||||
transformVars = "cnt", overwrite = TRUE)
|
||||
|
||||
|
||||
# Step 3: Prepare Training and Test Datasets
|
||||
|
||||
# Split data by "yr" so that the training data contains records
|
||||
# for the year 2011 and the test data contains records for 2012.
|
||||
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
|
||||
splitByFactor = "yr", overwrite = TRUE,
|
||||
reportProgress = 0, verbose = 0)
|
||||
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
|
||||
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
|
||||
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0)
|
||||
|
||||
# Point to the .xdf files for the training and test set.
|
||||
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
|
||||
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
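# For intuition only: on an in-memory data frame the same split is just a subset
# on the "yr" factor (toy frame, not the .xdf pipeline used above).
toySplit <- data.frame(yr = factor(c(0, 0, 1, 1)), cnt = c(5, 7, 9, 11))
trainToy <- subset(toySplit, yr == "0")   # records for 2011
testToy  <- subset(toySplit, yr == "1")   # records for 2012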
|
||||
|
||||
|
||||
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
|
||||
|
||||
# Build a formula for the regression model and remove the "yr",
|
||||
# which is used to split the training and test data.
|
||||
modelFormula <- formula(train, depVars = "cnt",
|
||||
varsToDrop = c("RowNum", "yr"))
|
||||
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
|
||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
||||
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
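# What formula() builds here is, in effect, "cnt ~ <every remaining column>".
# A base-R sketch of the same idea on a handful of illustrative column names:
varsDemo <- c("cnt", "season", "yr", "hr", "temp", "hum", "RowNum")
reformulate(setdiff(varsDemo, c("cnt", "RowNum", "yr")), response = "cnt")
# cnt ~ season + hr + temp + hum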
|
||||
|
||||
# Fit a Decision Forest Regression model on the training data.
|
||||
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
|
||||
seed = 123)
|
||||
dForest <- rxDForest(modelFormula, data = train,
|
||||
method = "anova", maxDepth = 10, nTree = 50,
|
||||
importance = TRUE, seed = 123)
|
||||
|
||||
# Plot a dotchart of the variable importance as measured by the decision forest.
|
||||
rxVarImpPlot(dForest, main = "Variable Importance of Decision Forest")
|
||||
|
||||
# Step 5: Predict over new data and review the model performance
|
||||
# Plot the out-of-bag error rate against the number of trees built in the decision forest model.
|
||||
plot(dForest, main = "OOB Error Rate vs Number of Trees")
|
||||
|
||||
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
|
||||
# Predict the rental count on the test dataset.
|
||||
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
|
||||
computeResiduals = TRUE)
|
||||
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
|
||||
|
||||
# Calculate three statistical measures: Mean Absolute Error (MAE),
|
||||
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
|
||||
summaryStats = "Mean",
|
||||
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||
sum <- rxSummary(~ cnt_Resid_abs+cnt_Resid_2+cnt_rel, data = predict, summaryStats = "Mean",
|
||||
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
|
||||
cnt_Resid_2 = cnt_Resid^2,
|
||||
cnt_rel = abs(cnt_Resid)/cnt)
|
||||
)$sDataFrame
|
||||
|
||||
# List all measures in a data frame.
|
||||
measures <- data.frame(MAE = sum[1, 2],
|
||||
RMSE = sqrt(sum[2, 2]),
|
||||
RAE = sum[3, 2])
|
||||
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
|
||||
|
||||
# Review the measures.
|
||||
measures
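# For reference, on plain vectors the three measures above reduce to the
# following (toy numbers; RAE here follows the cnt_rel definition used above,
# i.e. the mean of abs(residual)/actual).
actualDemo    <- c(10, 20, 30)
predictedDemo <- c(12, 18, 33)
residDemo     <- actualDemo - predictedDemo
c(MAE  = mean(abs(residDemo)),
  RMSE = sqrt(mean(residDemo ^ 2)),
  RAE  = mean(abs(residDemo) / actualDemo))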
|
||||
|
|
|
@ -1,67 +1,64 @@
|
|||
# Regression: Demand estimation with Microsoft R Server
|
||||
|
||||
# This example is a replication of an existing Azure Machine Learning
|
||||
# Experiment - Regression: Demand Estimation
|
||||
#-------------------------------------------------------------------------------------------------------------
|
||||
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
|
||||
#-------------------------------------------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# This example is a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
|
||||
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
|
||||
|
||||
# The dataset contains 17,379 rows and 17 columns, each row representing
|
||||
# the number of bike rentals within a given hour of a given day in
|
||||
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||
# and wind speed) were included in this raw feature set, and the dates
|
||||
# were categorized as holiday vs. weekday, etc.
|
||||
|
||||
# The field to predict is "cnt", which contains a count value ranging
|
||||
# from 1 to 977, representing the number of bike rentals within
|
||||
# a given hour.
|
||||
|
||||
# We built four models using the same algorithm, but with four different
|
||||
# training datasets. The four training datasets that we constructed
|
||||
# were all based on the same raw input data, but we added different
|
||||
# additional features to each training set.
|
||||
|
||||
#
|
||||
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
|
||||
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
|
||||
# weekday etc.
|
||||
#
|
||||
# The field to predict is "cnt", which contains a count value ranging from 1 to 977, representing the number
|
||||
# of bike rentals within a specific hour.
|
||||
#
|
||||
# We built four models using the same algorithm, but with four different training datasets. The four training
|
||||
# datasets that we constructed were all based on the same raw input data, but we added different additional
|
||||
# features to each training set.
|
||||
#
|
||||
# Set A = weather + holiday + weekday + weekend features for the predicted day
|
||||
# Set B = number of bikes that were rented in each of the previous 12 hours
|
||||
# Set C = number of bikes that were rented in each of the previous 12 days
|
||||
# at the same hour
|
||||
# Set D = number of bikes that were rented in each of the previous 12 weeks
|
||||
# at the same hour and the same day
|
||||
|
||||
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
|
||||
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
|
||||
#
|
||||
# Each of these feature sets captures different aspects of the problem:
|
||||
# Feature set B captures very recent demand for the bikes.
|
||||
# Feature set C captures the demand for bikes at a particular hour.
|
||||
# Feature set D captures demand for bikes at a particular hour and
|
||||
# particular day of the week.
|
||||
|
||||
# The four training datasets were built by combining the feature set
|
||||
# as follows:
|
||||
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
|
||||
#
|
||||
# The four training datasets were built by combining the feature set as follows:
|
||||
# Training set 1: feature set A only
|
||||
# Training set 2: feature sets A+B
|
||||
# Training set 3: feature sets A+B+C
|
||||
# Training set 4: feature sets A+B+C+D
|
||||
|
||||
# The following scripts include five basic steps of building this example
|
||||
# using Microsoft R Server.
|
||||
#
|
||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
||||
# This execution might require more than two minutes.
|
||||
#
|
||||
#--------------------------------------------------------------------------------------------------------------
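# The lag columns added below follow the naming pattern cnt_<k><unit>, matching
# the paste() calls used later in this script; for example:
paste("cnt_", seq(3), "hour", sep = "")   # "cnt_1hour" "cnt_2hour" "cnt_3hour" (Set B)
paste("cnt_", seq(3), "day",  sep = "")   # "cnt_1day"  "cnt_2day"  "cnt_3day"  (Set C)
paste("cnt_", seq(3), "week", sep = "")   # "cnt_1week" "cnt_2week" "cnt_3week" (Set D)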
|
||||
|
||||
|
||||
# Step 0: Get Started
|
||||
|
||||
# Check whether Microsoft R Server (RRE 8.0) is installed
|
||||
#---------------------------Step 0: Get Started-------------------------------
|
||||
# ----------------------------------------------------------------------------
|
||||
# Check if Microsoft R Server (RRE 8.0) is installed
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR")) {
|
||||
cat("RevoScaleR package does not seem to exist.
|
||||
\nThis means that the functions starting with 'rx' will not run.
|
||||
\nIf you have Microsoft R Server installed,
|
||||
\nplease switch the R engine.
|
||||
\nFor example, in R Tools for Visual Studio:
|
||||
\nR Tools -> Options -> R Engine.
|
||||
\nIf Microsoft R Server is not installed, you can download it from:
|
||||
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
||||
\n")
|
||||
\nThis means that the functions starting with 'rx' will not run.
|
||||
\nIf you have Microsoft R Server installed, please switch the R engine.
|
||||
\nFor example, in R Tools for Visual Studio:
|
||||
\nR Tools -> Options -> R Engine.
|
||||
\nIf Microsoft R Server is not installed, you can download it from:
|
||||
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
||||
\n")
|
||||
|
||||
quit()
|
||||
}
|
||||
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
|
||||
|
||||
# Create a temporary directory to store the intermediate .xdf files.
|
||||
|
@ -70,36 +67,25 @@ outFileBike <- paste0(td, "/bike.xdf")
|
|||
outFileEdit <- paste0(td, "/editData.xdf")
|
||||
outFileLag <- paste0(td, "/lagData")
|
||||
|
||||
|
||||
# Step 1: Import Data
|
||||
|
||||
#---------------------------Step 1: Import Data---------------------------
|
||||
# Import the bike data.
|
||||
# Remove timestamps and all columns that are part of the label.
|
||||
bike_mrs <- rxImport(inData = inputFileBikeURL,
|
||||
outFile = outFileBike, overwrite = TRUE,
|
||||
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
|
||||
missingValueString = "M", stringsAsFactors = FALSE,
|
||||
varsToDrop = c("instant",
|
||||
"dteday",
|
||||
"casual",
|
||||
"registered"))
|
||||
varsToDrop = c("instant", "dteday", "casual", "registered"))
|
||||
|
||||
# Edit Metadata: Define year, weather conditions and season columns
|
||||
# as categorical.
|
||||
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
|
||||
sortLevels = TRUE,
|
||||
factorInfo = c("yr", "weathersit", "season"),
|
||||
overwrite = TRUE)
|
||||
|
||||
|
||||
# Step 2: Feature Engineering
|
||||
# Edit Metadata: Define year, weather conditions and season columns as categorical.
|
||||
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
|
||||
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
|
||||
|
||||
#---------------------------Step 2: Feature Engineering---------------------------
|
||||
# Create a function to construct lag features for four different aspects.
|
||||
computeLagFeatures <- function (dataList) {
|
||||
# Total number of lags that need to be added.
|
||||
numLags <- length(nLagsVector)
|
||||
computeLagFeatures <- function(dataList) {
|
||||
|
||||
numLags <- length(nLagsVector) # total number of lags that need to be added
|
||||
for (iL in 1:numLags) {
|
||||
nlag <- nLagsVector[iL]
|
||||
varLagName <- paste("demand.", nlag, unit, sep = "")
|
||||
varLagName <- paste("cnt_", nlag, unit, sep = "")
|
||||
numRowsInChunk <- length(dataList[[baseVar]])
|
||||
numRowsToRead <- nlag * interval
|
||||
numRowsPadding <- 0
|
||||
|
@ -107,89 +93,67 @@ computeLagFeatures <- function (dataList) {
|
|||
numRowsToRead <- .rxStartRow - 1
|
||||
numRowsPadding <- nlag * interval - numRowsToRead
|
||||
}
|
||||
# Determine the current row to start processing the data
|
||||
# between chunks.
|
||||
startRow <- .rxStartRow - numRowsToRead
|
||||
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
|
||||
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
|
||||
varsToKeep = baseVar,
|
||||
startRow = startRow,
|
||||
numRows = numRowsToRead,
|
||||
returnDataFrame = FALSE)
|
||||
varsToKeep = baseVar,
|
||||
startRow = startRow, numRows = numRowsToRead,
|
||||
returnDataFrame = FALSE)
|
||||
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
|
||||
varsToKeep = baseVar,
|
||||
startRow = 1,
|
||||
numRows = numRowsPadding,
|
||||
returnDataFrame = FALSE)
|
||||
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
|
||||
previousRowsDataList[[baseVar]],
|
||||
dataList[[baseVar]])[1:numRowsInChunk]
|
||||
varsToKeep = baseVar,
|
||||
startRow = 1, numRows = numRowsPadding,
|
||||
returnDataFrame = FALSE)
|
||||
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]], previousRowsDataList[[baseVar]], dataList[[baseVar]])[1:numRowsInChunk]
|
||||
}
|
||||
return(dataList)
|
||||
}
|
||||
|
||||
# Create a function to add lag features, one set of columns at a time.
|
||||
addLag <- function(inputData, outputFileBase) {
|
||||
|
||||
|
||||
inputFile <- inputData
|
||||
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
|
||||
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
|
||||
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
|
||||
sep="")
|
||||
|
||||
outputFileHour <- paste(outputFileBase, "_hour", ".xdf", sep = "")
|
||||
outputFileHourDay <- paste(outputFileBase, "_hour_day", ".xdf", sep = "")
|
||||
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf", sep = "")
|
||||
|
||||
# Initialize some fixed values.
|
||||
hourInterval <- 1
|
||||
dayInterval <- 24
|
||||
weekInterval <- 168
|
||||
previous <- 12
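# How these offsets are used in computeLagFeatures: a lag of k units with row
# interval i looks back k * i rows, because every row is one hour.
# e.g. 12 hours = 12 * 1 = 12 rows (Set B), 12 days at the same hour =
# 12 * 24 = 288 rows (Set C), 12 weeks = 12 * 168 = 2016 rows (Set D).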
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 hours
|
||||
# (for Set B).
|
||||
rxDataStep(inData = inputFile, outFile = outputFileHour,
|
||||
transformFunc=computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt", unit = "hour",
|
||||
nLagsVector=seq(12),
|
||||
interval = hourInterval),
|
||||
transformVars = c("cnt"), overwrite=TRUE)
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 days
|
||||
# at the same hour (for Set C).
|
||||
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
|
||||
transformFunc=computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt",
|
||||
unit = "day",
|
||||
nLagsVector=seq(12),
|
||||
interval = dayInterval),
|
||||
transformVars = c("cnt"),
|
||||
overwrite=TRUE)
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 weeks
|
||||
# at the same hour and the same day (for Set D).
|
||||
rxDataStep(inData = outputFileHourDay,
|
||||
outFile = outputFileHourDayWeek,
|
||||
transformFunc=computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt",
|
||||
unit = "week",
|
||||
nLagsVector=seq(12),
|
||||
interval = weekInterval),
|
||||
transformVars = c("cnt"), overwrite=TRUE)
|
||||
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
|
||||
rxDataStep(inData = inputFile, outFile = outputFileHour, transformFunc = computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt", unit = "hour", nLagsVector = seq(12),
|
||||
interval = hourInterval),
|
||||
transformVars = c("cnt"), overwrite = TRUE)
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
|
||||
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay, transformFunc = computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt", unit = "day", nLagsVector = seq(12),
|
||||
interval = dayInterval),
|
||||
transformVars = c("cnt"), overwrite = TRUE)
|
||||
|
||||
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
|
||||
rxDataStep(inData = outputFileHourDay, outFile = outputFileHourDayWeek, transformFunc = computeLagFeatures,
|
||||
transformObjects = list(baseVar = "cnt", unit = "week", nLagsVector = seq(12),
|
||||
interval = weekInterval),
|
||||
transformVars = c("cnt"), overwrite = TRUE)
|
||||
|
||||
file.remove(outputFileHour)
|
||||
file.remove(outputFileHourDay)
|
||||
|
||||
return(outputFileHourDayWeek)
|
||||
}
|
||||
|
||||
# Set A = weather + holiday + weekday + weekend features for
|
||||
# the predicted day.
|
||||
# Set A = weather + holiday + weekday + weekend features for the predicted day.
|
||||
finalDataA_mrs <- editData_mrs
|
||||
|
||||
# Set B, C & D.
|
||||
finalDataLag_dir <- addLag(inputData = editData_mrs,
|
||||
outputFileBase = outFileLag)
|
||||
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
|
||||
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
|
||||
|
||||
|
||||
# Step 3: Prepare Training and Test Datasets
|
||||
|
||||
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
|
||||
## Set A:
|
||||
# Split Data.
|
||||
rxSplit(inData = finalDataA_mrs,
|
||||
|
@ -202,62 +166,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
|
|||
|
||||
## Set B, C & D:
|
||||
# Split Data.
|
||||
rxSplit(inData = finalDataLag_mrs,
|
||||
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
|
||||
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
|
||||
overwrite = TRUE, reportProgress = 0, verbose = 0)
|
||||
# Point to the .xdf files for the training and test set.
|
||||
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
|
||||
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
|
||||
|
||||
|
||||
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
|
||||
|
||||
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
|
||||
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
|
||||
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
|
||||
newDayFeatures <- paste("cnt_", seq(12), "day", sep = "")
|
||||
newWeekFeatures <- paste("cnt_", seq(12), "week", sep = "")
|
||||
|
||||
## Set A:
|
||||
# Build a formula for the regression model and remove the "yr",
|
||||
# which is used to split the training and test data.
|
||||
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
|
||||
varsToDrop = c("RowNum", "yr"))
|
||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
||||
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
|
||||
# Fit Decision Forest Regression model.
|
||||
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
|
||||
method = "anova", maxDepth = 10, nTree = 20,
|
||||
importance = TRUE, seed = 123)
|
||||
|
||||
## Set B:
|
||||
# Build a formula for the regression model and remove the "yr",
|
||||
# which is used to split the training and test data, and lag features
|
||||
# for Set C and D.
|
||||
formB_mrs <- formula(train_mrs, depVars = "cnt",
|
||||
varsToDrop = c("RowNum", "yr",
|
||||
newDayFeatures,
|
||||
newWeekFeatures))
|
||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set C and D.
|
||||
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
|
||||
# Fit Decision Forest Regression model.
|
||||
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
|
||||
method = "anova", maxDepth = 10, nTree = 20,
|
||||
importance = TRUE, seed = 123)
|
||||
|
||||
## Set C:
|
||||
# Build a formula for the regression model and remove the "yr",
|
||||
# which is used to split the training and test data, and lag features
|
||||
# for Set D.
|
||||
formC_mrs <- formula(train_mrs, depVars = "cnt",
|
||||
varsToDrop = c("RowNum", "yr", newWeekFeatures))
|
||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set D.
|
||||
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
|
||||
# Fit Decision Forest Regression model.
|
||||
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
|
||||
method = "anova", maxDepth = 10, nTree = 20,
|
||||
importance = TRUE, seed = 123)
|
||||
|
||||
## Set D:
|
||||
# Build a formula for the regression model and remove the "yr",
|
||||
# which is used to split the training and test data.
|
||||
formD_mrs <- formula(train_mrs, depVars = "cnt",
|
||||
varsToDrop = c("RowNum", "yr"))
|
||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
||||
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
|
||||
# Fit Decision Forest Regression model.
|
||||
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
|
||||
method = "anova", maxDepth = 10, nTree = 20,
|
||||
importance = TRUE, seed = 123)
|
||||
|
||||
# Plot four dotcharts of the variable importance as measured by the four decision forest models.
|
||||
par(mfrow = c(2, 2))
|
||||
rxVarImpPlot(dForestA_mrs, main = "Variable Importance of Set A")
|
||||
rxVarImpPlot(dForestB_mrs, main = "Variable Importance of Set B")
|
||||
rxVarImpPlot(dForestC_mrs, main = "Variable Importance of Set C")
|
||||
rxVarImpPlot(dForestD_mrs, main = "Variable Importance of Set D")
|
||||
|
||||
# Step 5: Predict over new data
|
||||
# Plot the out-of-bag error rate against the number of trees built in the decision forest model.
|
||||
plot(dForestA_mrs, main = "OOB Error Rate vs Number of Trees: Set A")
|
||||
plot(dForestB_mrs, main = "OOB Error Rate vs Number of Trees: Set B")
|
||||
plot(dForestC_mrs, main = "OOB Error Rate vs Number of Trees: Set C")
|
||||
plot(dForestD_mrs, main = "OOB Error Rate vs Number of Trees: Set D")
|
||||
|
||||
#---------------------------Step 5: Predict over new data---------------------------
|
||||
## Set A:
|
||||
# Predict the rental count on the test dataset.
|
||||
rxPredict(dForestA_mrs, data = testA_mrs,
|
||||
|
@ -286,22 +250,17 @@ rxPredict(dForestD_mrs, data = test_mrs,
|
|||
residVarNames = "cnt_Resid_D",
|
||||
overwrite = TRUE, computeResiduals = TRUE)
|
||||
|
||||
|
||||
# Prepare outputs
|
||||
#---------------------------Prepare outputs---------------------------
|
||||
## Set A:
|
||||
# Calculate three statistical measures: Mean Absolute Error (MAE),
|
||||
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
|
||||
data = testA_mrs, summaryStats = "Mean",
|
||||
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||
sumA <- rxSummary(~ cnt_Resid_A_abs+cnt_Resid_A_2+cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
|
||||
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
|
||||
cnt_Resid_A_2 = cnt_Resid_A^2,
|
||||
cnt_rel_A = abs(cnt_Resid_A)/cnt)
|
||||
)$sDataFrame
|
||||
|
||||
## Set B, C & D:
|
||||
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
|
||||
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
|
||||
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
|
||||
sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_Resid_C_2+cnt_rel_C+cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
|
||||
data = test_mrs, summaryStats = "Mean",
|
||||
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
|
||||
cnt_Resid_B_2 = cnt_Resid_B^2,
|
||||
|
@ -312,7 +271,7 @@ sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
|
|||
cnt_Resid_D_abs = abs(cnt_Resid_D),
|
||||
cnt_Resid_D_2 = cnt_Resid_D^2,
|
||||
cnt_rel_D = abs(cnt_Resid_D)/cnt)
|
||||
)$sDataFrame
|
||||
)$sDataFrame
|
||||
|
||||
# Add row names.
|
||||
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
|
||||
|
@ -323,10 +282,7 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
|
|||
# List all measures in a data frame.
|
||||
measures <- data.frame(Features = features,
|
||||
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
|
||||
RMSE = c(sqrt(sumA[2, 2]),
|
||||
sqrt(sum[2, 2]),
|
||||
sqrt(sum[5, 2]),
|
||||
sqrt(sum[8, 2])),
|
||||
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
|
||||
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
|
||||
|
||||
# Review the measures.
@ -7,7 +7,7 @@
|
|||
library("RCurl", quietly = TRUE)
|
||||
|
||||
# A URL that contains the raw data.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
|
||||
|
||||
# Download data from the URL.
@ -17,7 +17,7 @@ if (!require("RevoScaleR")) {
|
|||
}
|
||||
|
||||
# A URL that contains the raw data.
|
||||
github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputDataURL <- paste0(github, "Flight_Delays_Sample.csv")
|
||||
|
||||
# Create a temporary directory to store the intermediate .xdf files.
@ -16,6 +16,9 @@
|
|||
# if the flight was on time.
|
||||
|
||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
||||
# This execution might require more than one minute.
|
||||
#
|
||||
#------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
#---------------------------Step 0: Get Started-------------------------------
|
||||
|
@ -36,7 +39,7 @@ if (!require("RevoScaleR")) {
|
|||
}
|
||||
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
|
||||
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
|
||||
#
|
||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
||||
#
|
||||
# This execution might require more than one minute.
|
||||
#
|
||||
#------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
@ -46,7 +46,7 @@ if (!require("RevoScaleR")) {
|
|||
}
|
||||
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
|
||||
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")
|
||||
|
||||
|
|
|
@ -23,14 +23,14 @@
|
|||
# are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time.
|
||||
#
|
||||
# The following scripts include five basic steps of building this example using open source R.
|
||||
#
|
||||
# This execution might require more than one minute.
|
||||
#
|
||||
#-------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
#---------------------------Step 0: Get Started---------------------------
|
||||
# Initialize some variables.
|
||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/"
|
||||
github <- "https://raw.githubusercontent.com/brohrer-ms/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||
inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv")
|
||||
inputFileWeatherURL <- paste0(github, "Weather_Sample.csv")
|
||||
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# NOTE: In order to run this script you'll need to have an Azure ML
|
||||
# workspace, with its workspace ID and key.
|
||||
# NOTE: In order to run the last part of the script you'll need to have
|
||||
# an Azure ML workspace, with its workspace ID and key.
|
||||
# For details about Azure ML, go to http://studio.azureml.net/
|
||||
# ----------------------------------------------------------------------------
|
||||
# Enter your Azure ML Studio workspace info here before continuing.
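# For example (placeholder values; the variable names match the workspace()
# call used later in this script):
# ws_id      <- "<your Azure ML workspace id>"
# auth_token <- "<your workspace authorization token>"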
|
||||
|
@ -23,8 +23,13 @@ library("MASS") # to use the Boston dataset
|
|||
# ----------------------------------------------------------------------------
|
||||
# fit a model and check model performance
|
||||
# ----------------------------------------------------------------------------
|
||||
# check the data
|
||||
head(Boston)
|
||||
ggplot(Boston, aes(x=medv)) +
|
||||
geom_histogram(binwidth=5) +
|
||||
ggtitle("Histogram of Response Variable")
|
||||
|
||||
# fit a model using all variables except medv as predictors
|
||||
# fit a model using medv as response and others as predictors
|
||||
lm1 <- lm(medv ~ ., data = Boston)
|
||||
|
||||
# check model performance
|
||||
|
@ -50,52 +55,63 @@ print(paste("Relative Squared Error: ",
|
|||
# ----------------------------------------------------------------------------
|
||||
# publish and consume a web service
|
||||
# ----------------------------------------------------------------------------
|
||||
# workspace information
|
||||
ws <- workspace(
|
||||
id = ws_id,
|
||||
auth = auth_token)
|
||||
# the following works only with valid AzureML workspace information
|
||||
|
||||
# define predict function
|
||||
mypredict <- function(newdata) {
|
||||
res <- predict(lm1, newdata)
|
||||
res
|
||||
}
|
||||
# get workspace information
|
||||
AML <- 0
|
||||
tryCatch(
|
||||
{
|
||||
ws <- workspace(id = ws_id, auth = auth_token)
|
||||
AML <- 1
|
||||
},
|
||||
error = function(cond){
|
||||
message("Azure ML workspace information was not valid.")
|
||||
}
|
||||
)
|
||||
|
||||
# test the prediction function
|
||||
newdata <- Boston[1, 1:13]
|
||||
print(mypredict(newdata))
|
||||
|
||||
# Publish the service
|
||||
ep <- publishWebService(ws = ws, fun = mypredict,
|
||||
name = "HousePricePrediction", inputSchema = newdata)
|
||||
|
||||
# consume web service - 1st approach
|
||||
pred <- consume(ep, newdata)
|
||||
pred
|
||||
|
||||
# consume web service - 2nd approach
|
||||
# retrieve web service information for later use
|
||||
service_id <- ep$WebServiceId
|
||||
# define endpoint
|
||||
ep_price_pred <- endpoints(ws, service_id)
|
||||
# consume
|
||||
consume(ep_price_pred, newdata)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# update a web service and then consume
|
||||
# ----------------------------------------------------------------------------
|
||||
# define a function for testing purposes
|
||||
mypredictnew <- function(newdata) {
|
||||
res <- predict(lm1, newdata) + 100
|
||||
res
|
||||
}
|
||||
|
||||
# update service with the new function
|
||||
ep_update <- updateWebService(
|
||||
ws = ws,
|
||||
fun = mypredictnew,
|
||||
inputSchema = newdata,
|
||||
serviceId = ep$WebServiceId)
|
||||
|
||||
# consume the updated web service
|
||||
consume(ep_price_pred, newdata)
|
||||
if (AML) {
|
||||
# define predict function
|
||||
mypredict <- function(newdata) {
|
||||
res <- predict(lm1, newdata)
|
||||
res
|
||||
}
|
||||
|
||||
# test the prediction function
|
||||
newdata <- Boston[1, 1:13]
|
||||
print(mypredict(newdata))
|
||||
|
||||
# Publish the service
|
||||
ep <- publishWebService(ws = ws, fun = mypredict,
|
||||
name = "HousePricePrediction", inputSchema = newdata)
|
||||
|
||||
# consume web service - 1st approach
|
||||
pred <- consume(ep, newdata)
|
||||
pred
|
||||
|
||||
# consume web service - 2nd approach
|
||||
# retrieve web service information for later use
|
||||
service_id <- ep$WebServiceId
|
||||
# define endpoint
|
||||
ep_price_pred <- endpoints(ws, service_id)
|
||||
# consume
|
||||
consume(ep_price_pred, newdata)
|
||||
|
||||
# define a function for testing purposes
|
||||
mypredictnew <- function(newdata) {
|
||||
res <- predict(lm1, newdata) + 100
|
||||
res
|
||||
}
|
||||
|
||||
# update service with the new function
|
||||
ep_update <- updateWebService(
|
||||
ws = ws,
|
||||
fun = mypredictnew,
|
||||
inputSchema = newdata,
|
||||
serviceId = ep$WebServiceId)
|
||||
|
||||
# consume the updated web service
|
||||
consume(ep_price_pred, newdata)
|
||||
} else {
|
||||
message("Azure ML webservice is not deployed because ",
|
||||
"the workspace information is invalid")
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# purpose: to demonstrate the commonalities and differences among functions
|
||||
# in R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
|
||||
# audience: you are expected to have some prior experience with R
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
# to check if RevoScaleR is available
|
||||
RRE <- require("RevoScaleR")
|
||||
if (!RRE)
|
||||
{
|
||||
message(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
"For example, in R Tools for Visual Studio: \n",
|
||||
"R Tools -> Options -> R Engine. \n",
|
||||
"If Microsoft R Server is not installed, you can download it from: \n",
|
||||
"https://www.microsoft.com/en-us/server-cloud/products/r-server/")
|
||||
}
|
||||
|
||||
# install a package if it's not already installed
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# load libraries
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# glm vs rxGlm
|
||||
# ----------------------------------------------------------------------------
|
||||
# check the data
|
||||
head(mtcars)
|
||||
|
||||
# check the response variable
|
||||
ggplot(mtcars, aes(x=vs)) +
|
||||
geom_histogram(binwidth=.3) +
|
||||
ggtitle("Distribution of Response Values")
|
||||
|
||||
# fit a model with glm(), this can be run on R, MRO, or MRS
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic1)
|
||||
|
||||
# fit the same model with rxGlm() if RevoScaleR is installed
|
||||
if (RRE){
|
||||
# check the data
|
||||
head(mtcars)
|
||||
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic2)
|
||||
} else {
|
||||
print("rxGlm was not run becauase the RevoScaleR package is not available")
|
||||
}
|
|
@ -10,9 +10,11 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR"))
|
||||
# to check if RevoScaleR is available
|
||||
RRE <- require("RevoScaleR")
|
||||
if (!RRE)
|
||||
{
|
||||
stop(
|
||||
message(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
|
@ -31,27 +33,10 @@ library("MASS") # to use the mvrnorm function
|
|||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# glm (R, MRO, MRS) vs rxGlm (MRS)
|
||||
# ----------------------------------------------------------------------------
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# fit a model with glm(), this can be run on R, MRO, or MRS
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic1 <- glm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic1)
|
||||
|
||||
# check the data
|
||||
head(mtcars)
|
||||
# fit the same model with rxGlm(), this can be run on MRS only
|
||||
# predict V engine vs straight engine with weight and displacement
|
||||
logistic2 <- rxGlm(vs ~ wt + disp, data = mtcars, family = binomial)
|
||||
summary(logistic2)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# kmeans (R, MRO, MRS) vs rxKmeans (MRS)
|
||||
# simulate data, it works on R, MRO, or MRS
|
||||
# ----------------------------------------------------------------------------
|
||||
# make sure the results can be replicated
|
||||
set.seed(112)
|
||||
set.seed(121)
|
||||
|
||||
# function to simulate data
|
||||
simulCluster <- function(nsamples, mean, dimension, group)
|
||||
|
@ -83,42 +68,18 @@ ggplot(group_all, aes(x = V1, y = V2)) +
|
|||
# assign data
|
||||
mydata <- group_all[, 1:2]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# kmeans
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(), it works on R, MRO, or MRS
|
||||
fit.kmeans <- kmeans(mydata, nclusters, iter.max = 1000, algorithm = "Lloyd")
|
||||
|
||||
# cluster analysis with rxKmeans(), it works on MRS only
|
||||
fit.rxKmeans <- rxKmeans( ~ V1 + V2, data = mydata,
|
||||
numClusters = nclusters, algorithm = "lloyd")
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# compare the cluster assignments between kmeans and rxKmeans (MRS): the same
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# the code below should be run on MRS due to the use of "rx" commands
|
||||
|
||||
# save a dataset in XDF format
|
||||
dataXDF = tempfile(fileext = ".xdf")
|
||||
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
|
||||
# rxKmeans
|
||||
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters, algorithm = "lloyd",
|
||||
outFile = dataXDF, outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
|
||||
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
|
||||
|
||||
# append cluster assignment from kmeans and rxKmeans
|
||||
mydata_clusters <- cbind(
|
||||
group_all,
|
||||
kmeans.cluster = factor(fit.kmeans$cluster),
|
||||
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
|
||||
|
||||
# compare the cluster assignments between kmeans and rxKmeans
|
||||
with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster))
|
||||
kmeans.cluster = factor(fit.kmeans$cluster))
|
||||
|
||||
# get cluster means
|
||||
clustermeans.kmeans <- fit.kmeans$centers
|
||||
clustermeans.rxKmeans <- fit.kmeans$centers
|
||||
|
||||
# plot clusters from kmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
|
@ -129,11 +90,47 @@ ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
|||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by kmeans()")
|
||||
|
||||
# plot clusters from rxKmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = rxKmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by rxKmeans()")
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works on MRS only
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# cluster analysis with rxKmeans() if RevoScaleR is installed
|
||||
if (RRE){
|
||||
# save a dataset in XDF format
|
||||
dataXDF = tempfile(fileext = ".xdf")
|
||||
rxImport(inData = mydata, outFile = dataXDF, overwrite = TRUE)
|
||||
|
||||
# rxKmeans
|
||||
clust <- rxKmeans(~ V1 + V2, data = dataXDF,
|
||||
numClusters = nclusters, algorithm = "lloyd",
|
||||
outFile = dataXDF, outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
|
||||
rxKmeans.cluster <- rxDataStep(dataXDF, varsToKeep = "cluster")
|
||||
|
||||
# append cluster assignment from kmeans and rxKmeans
|
||||
mydata_clusters <- cbind(
|
||||
group_all,
|
||||
kmeans.cluster = factor(fit.kmeans$cluster),
|
||||
rxKmeans.cluster = factor(rxKmeans.cluster$cluster))
|
||||
|
||||
# compare the cluster assignments between kmeans and rxKmeans
|
||||
message("\nComparing cluster assignments between kmeans and rxKmeans:")
|
||||
print(with(mydata_clusters, table(kmeans.cluster, rxKmeans.cluster)))
|
||||
|
||||
# get cluster means
|
||||
clustermeans.rxKmeans <- clust$centers
|
||||
|
||||
# plot clusters from rxKmeans
|
||||
ggplot(mydata_clusters, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = rxKmeans.cluster)) +
|
||||
geom_point(data = as.data.frame(clustermeans.kmeans), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Clusters found by rxKmeans()")
|
||||
|
||||
} else{
|
||||
print("rxKmeans was not run becauase the RevoScaleR package is not available")
|
||||
}
|
||||
|
|
@ -10,9 +10,11 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# check if Microsoft R Server is installed and load libraries
|
||||
# ----------------------------------------------------------------------------
|
||||
if (!require("RevoScaleR"))
|
||||
# to check if RevoScaleR is available
|
||||
RRE <- require("RevoScaleR")
|
||||
if (!RRE)
|
||||
{
|
||||
stop(
|
||||
message(
|
||||
"RevoScaleR package does not seem to exist. \n",
|
||||
"This means that the functions starting with 'rx' will not run. \n",
|
||||
"If you have Microsoft R Server installed, please switch the R engine.\n",
|
||||
|
@ -23,12 +25,12 @@ if (!require("RevoScaleR"))
|
|||
}
|
||||
|
||||
# install a package if it's not already installed
|
||||
|
||||
if (!require("ggplot2", quietly = TRUE))
|
||||
install.packages("ggplot2")
|
||||
|
||||
# load packages
|
||||
library("MASS") # to use the mvrnorm function
|
||||
library("ggplot2") # used for plotting
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# simulate cluster data for analysis, run this on R, MRO, or MRS
|
||||
|
@ -50,6 +52,9 @@ simulCluster <- function(nsamples, mean, dimension, group)
|
|||
# modify the value for nsamples to test out the capacity limit for kmeans()
|
||||
# on a computer with 7 GB RAM, when nsamples is 3*10^7 kmeans() failed
|
||||
# but rxKmeans() worked
|
||||
message("If the sample size is large, it will take some time to finish. \n",
|
||||
"You can use a smaller value for nsamples, say 1000, \n",
|
||||
"to test the program.")
|
||||
nsamples <- 3 * 10 ^ 7
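# Rough arithmetic behind the 7 GB observation above (approximate and hardware
# dependent): 2 groups * 3e7 samples = 6e7 rows of 2 doubles, i.e. about
# 6e7 * 2 * 8 bytes ~ 0.96 GB for the raw matrix alone, and kmeans() makes
# several working copies of it, which is where an in-memory run can fail.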
|
||||
group_a <- simulCluster(nsamples, -1, 2, "a")
|
||||
group_b <- simulCluster(nsamples, 1, 2, "b")
|
||||
|
@ -57,16 +62,26 @@ group_all <- rbind(group_a, group_b)
|
|||
|
||||
nclusters <- 2
|
||||
|
||||
# plot sample data
|
||||
plot_data <- group_all[sample(nrow(group_all), 1000),]
|
||||
ggplot(plot_data, aes(x = V1, y = V2)) +
|
||||
geom_point(aes(colour = group)) +
|
||||
geom_point(data = data.frame(V1 = c(-1, 1), V2 = c(-1, 1)), size = 5) +
|
||||
xlim(-5, 5) + ylim(-5, 5) +
|
||||
geom_hline(yintercept = 0) +
|
||||
geom_vline(xintercept = 0) +
|
||||
ggtitle("Simulated data in two overlapping groups")
|
||||
|
||||
# save data
|
||||
mydata = group_all[, 1:2]
|
||||
dataCSV = tempfile(fileext = ".csv")
|
||||
dataXDF = tempfile(fileext = ".xdf")
|
||||
write.csv(group_all, dataCSV, row.names = FALSE)
|
||||
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with kmeans(); it can fail when the data is large enough
|
||||
# ----------------------------------------------------------------------------
|
||||
# this can be run on R, MRO, or MRS
|
||||
system_time_R <-
|
||||
system.time(
|
||||
{
|
||||
|
@ -76,9 +91,12 @@ system_time_R <-
|
|||
})
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# cluster analysis with rxKmeans(), it works even if kmeans() does not
|
||||
# cluster analysis with rxKmeans() on MRS, it works even if kmeans() does not
|
||||
# ----------------------------------------------------------------------------
|
||||
system_time_MRS <-
|
||||
# cluster analysis with rxKmeans() if RevoScaleR is installed
|
||||
if (RRE){
|
||||
rxImport(inData = dataCSV, outFile = dataXDF, overwrite = TRUE)
|
||||
|
||||
system.time(
|
||||
{
|
||||
clust <- rxKmeans( ~ V1 + V2, data = dataXDF,
|
||||
|
@ -88,3 +106,6 @@ system_time_MRS <-
|
|||
outColName = "cluster",
|
||||
overwrite = TRUE)
|
||||
})
|
||||
} else {
|
||||
print("rxKmeans was not run becauase the RevoScaleR package is not available")
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
# ----------------------------------------------------------------------------
|
||||
# purpose: to demonstrate the speed differences across
|
||||
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
|
||||
# audience: you are expected to have some prior experience with R
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# to learn more about the differences among R, MRO and MRS, refer to:
|
||||
# https://github.com/lixzhang/R-MRO-MRS
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# run the following code on R, MRO, and MRS and
|
||||
# notice the speed improvement with MRO and MRS over R
|
||||
# ----------------------------------------------------------------------------
|
||||
# the code in this section can be found at the following address
|
||||
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
|
||||
|
||||
# print the default number of threads if MKL library is installed
|
||||
if (require("RevoUtilsMath"))
|
||||
{
|
||||
print(paste("The number of threads is:", getMKLthreads()))
|
||||
}

# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)

# Matrix multiply
system.time(B <- crossprod(A))

# Cholesky Factorization
system.time(C <- chol(B))

# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))

# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))

# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))
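
# A minimal sketch (an illustrative addition): tag one timing with the engine's
# version string so results collected on R, MRO, and MRS can be compared later.
A_bench <- matrix(runif(10000 * 1000), 10000, 1000)
print(data.frame(
  engine = R.version.string,
  crossprod_seconds = unname(system.time(crossprod(A_bench))["elapsed"])))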
@@ -0,0 +1,44 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed similarities for kmeans across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------

# to learn more about the differences among R, MRO and MRS, refer to:
# https://github.com/lixzhang/R-MRO-MRS

# ----------------------------------------------------------------------------
# load libraries
# ----------------------------------------------------------------------------
library("MASS") # to use the mvrnorm function

# ----------------------------------------------------------------------------
# run the following analysis that does not involve matrix manipulation
# to observe that the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)

# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
{
Sigma <- diag(1, dimension, dimension)
x <- mvrnorm(n = nsamples, rep(mean, dimension), Sigma)
z <- as.data.frame(x)
z$group = group
z
}

# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)

nclusters <- 2

mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))
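
# A minimal sketch (an illustrative addition, assuming fit and group_all from
# the run above): check how well the two recovered clusters line up with the
# two simulated groups.
if (exists("fit") && exists("group_all")) {
  print(fit$size)                             # points assigned to each cluster
  print(table(fit$cluster, group_all$group))  # clusters vs. true groups
}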
@@ -1,6 +1,6 @@
# ----------------------------------------------------------------------------
# purpose: to demonstrate the speed differences across
# R, Microsoft R Open (MRO), and Microsoft R Server (MRS)
# purpose: to compare the speed of kmeans() with that of rxKmeans()
# on Microsoft R Server (MRS)
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------

@@ -8,11 +8,13 @@
# https://github.com/lixzhang/R-MRO-MRS

# ----------------------------------------------------------------------------
# check if Microsoft R Server (RRE 8.0) is installed and load libraries
# check if Microsoft R Server is installed and load libraries
# ----------------------------------------------------------------------------
if (!require("RevoScaleR"))
# to check if RevoScaleR is available
RRE <- require("RevoScaleR")
if (!RRE)
{
stop(
message(
"RevoScaleR package does not seem to exist. \n",
"This means that the functions starting with 'rx' will not run. \n",
"If you have Microsoft R Server installed, please switch the R engine.\n",
@@ -31,55 +33,16 @@ library("MASS") # to use the mvrnorm function
library("ggplot2") # used for plotting

# ----------------------------------------------------------------------------
# run the following code on R, MRO, and MRS and
# notice the speed improvement with MRO and MRS over R
# compare the speed of kmeans with that of rxKmeans on MRS
# for different data sizes
# ----------------------------------------------------------------------------
# the code in this section can be found at the following address
# https://mran.revolutionanalytics.com/documents/rro/multithread/#mt-bench
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())

# print the default number of threads if MKL library is installed
if (require("RevoUtilsMath"))
{
print(paste("The number of threads is:", getMKLthreads()))
}

# Initialization
set.seed(1)
m <- 10000
n <- 5000
A <- matrix(runif(m * n), m, n)

# Matrix multiply
system.time(B <- crossprod(A))

# Cholesky Factorization
system.time(C <- chol(B))

# Singular Value Decomposition
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(S <- svd(A, nu = 0, nv = 0))

# Principal Components Analysis
m <- 10000
n <- 2000
A <- matrix(runif(m * n), m, n)
system.time(P <- prcomp(A))

# Linear Discriminant Analysis
library("MASS")
g <- 5
k <- round(m / 2)
A <- data.frame(A, fac = sample(LETTERS[1:g], m, replace = TRUE))
train <- sample(1:m, k)
system.time(L <- lda(fac ~ ., data = A, prior = rep(1, g) / g, subset = train))

# ----------------------------------------------------------------------------
# run an analysis that does not involve matrix to show that
# the speed is similar on R, MRO and MRS
# ----------------------------------------------------------------------------
set.seed(0)
# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)

# function to simulate data
simulCluster <- function(nsamples, mean, dimension, group)
@@ -91,33 +54,6 @@ simulCluster <- function(nsamples, mean, dimension, group)
z
}

# simulate data
nsamples <- 10 ^ 7 # this was used on different platforms
# nsamples <- 1000 # for testing purpose
group_a <- simulCluster(nsamples, -1, 2, "a")
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)

nclusters <- 2

mydata = group_all[, 1:2]
# K-Means Cluster Analysis
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))

# ----------------------------------------------------------------------------
# compare the speed of kmeans() with that of rxKmeans()
# for different data sizes
# ----------------------------------------------------------------------------
# to save timing results
myresult <- data.frame(nsamples = integer(), time_r = double(),
time_rre = double())

# list of sample sizes
nsamples_list <- c(5 * 10 ^ 2, 10 ^ 3, 5 * 10 ^ 3, 10 ^ 4, 5 * 10 ^ 4, 10 ^ 5,
5 * 10 ^ 5, 10 ^ 6, 5 * 10 ^ 6, 10 ^ 7)

for (nsamples in nsamples_list)
{
# simulate data and append
@@ -125,25 +61,29 @@ for (nsamples in nsamples_list)
group_b <- simulCluster(nsamples, 1, 2, "b")
group_all <- rbind(group_a, group_b)
mydata = group_all[, 1:2]

nclusters <- 2

# kmeans with R
system_time_r <- system.time(fit <- kmeans(mydata, nclusters,
iter.max = 1000,
algorithm = "Lloyd"))

# kmeans with MRS
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))

if (RRE){
system_time_rre <- system.time(clust <- rxKmeans( ~ V1 + V2, data = mydata,
numClusters = nclusters,
algorithm = "lloyd"))
}
else system_time_rre <- c(NA,NA,NA)

# combine
newrow <- data.frame(nsamples = nsamples,
time_r = as.numeric(system_time_r[3]),
time_rre = as.numeric(system_time_rre[3]))
myresult <- rbind(myresult, newrow)

}

myresult$nsamples <- 2 * myresult$nsamples
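
# A minimal sketch (an illustrative addition, assuming the loop above populated
# myresult): compute how many times slower kmeans() was than rxKmeans() at each
# sample size.
if (exists("myresult") && nrow(myresult) > 0) {
  myresult$ratio_r_to_rre <- myresult$time_r / myresult$time_rre
  print(myresult)
}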
@@ -152,7 +92,8 @@ mydata$nsamples_log <- log10(mydata$nsamples)
mydata

# generate plot
ggplot(data = mydata, aes(x = nsamples_log)) +
if (RRE){
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
geom_point(aes(y = time_rre, colour = "rxKmeans")) +
@@ -161,4 +102,14 @@ ggplot(data = mydata, aes(x = nsamples_log)) +
scale_colour_manual("Function", values = c(kmeans = "red", rxKmeans = "blue")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("If data fits in memory, kmeans() and rxKmeans() are equally performant")
} else {
ggplot(data = mydata, aes(x = nsamples_log)) +
geom_point(aes(y = time_r, colour = "kmeans")) +
geom_line(aes(y = time_r, colour = "kmeans")) +
scale_x_continuous(breaks = seq(2, 8, by = 1)) +
scale_colour_manual("Function", values = c(kmeans = "red")) +
xlab("log10(number of samples)") +
ylab("time in seconds") +
ggtitle("Time for kmeans. To add time for rxKmeans, use the RRE engine")
}
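
# A minimal sketch (an illustrative addition; the file name is a placeholder):
# save whichever comparison plot was drawn above, using ggplot2's ggsave(),
# which writes the most recently displayed plot by default.
ggsave("kmeans_vs_rxKmeans_timing.png", width = 8, height = 5)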