This commit is contained in:
Brandon Rohrer 2016-03-15 16:06:28 -04:00
Родитель ac4919044e
Коммит 226842a54a
7 изменённых файлов: 375 добавлений и 297 удалений

Просмотреть файл

@ -89,7 +89,7 @@ library(help = foreign)
# To install a new package, use install.packages()
# Install the ggplot2 package for its plotting capability.
if (!require("ggplot2"))
install.packages("ggplot2")
install.packages("ggplot2")
# Then load the package.
library("ggplot2")
@ -146,7 +146,7 @@ theme_set(theme_gray(base_size = 18))
# In this sample you use ggplot2.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue")
geom_point(colour = "blue")
# Add a log scale.
ggplot(diamondSample, aes(x = carat, y = price)) +
@ -155,9 +155,9 @@ ggplot(diamondSample, aes(x = carat, y = price)) +
# Add a log scale for both scales.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue") +
scale_x_log10() +
scale_y_log10()
geom_point(colour = "blue") +
scale_x_log10() +
scale_y_log10()
### Linear Regression in R
@ -177,10 +177,10 @@ exp(coef(model)[1])
# Show the model in a plot.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue") +
geom_smooth(method = "lm", colour = "red", size = 2) +
scale_x_log10() +
scale_y_log10()
geom_point(colour = "blue") +
geom_smooth(method = "lm", colour = "red", size = 2) +
scale_x_log10() +
scale_y_log10()
### Regression Diagnostics

Просмотреть файл

@ -36,8 +36,8 @@ theme_set(theme_gray(base_size = 18))
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point() +
coord_map()
geom_point() +
coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
@ -45,14 +45,14 @@ p0
# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth)) +
coord_map()
geom_point(aes(colour = depth)) +
coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag)) +
coord_map()
geom_point(aes(colour = depth, size = mag)) +
coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
@ -60,24 +60,24 @@ p2
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
p3
# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
@ -89,10 +89,10 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))

Просмотреть файл

@ -1,30 +1,27 @@
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example demonstrates the feature engineering process for building a regression model to predict
# bike rental demand.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday etc.
#
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour. The lag features we add in the data set is the number of bikes that
# were rented in each of the previous 12 hours, which captures the very recent demand for the bikes.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
#
#--------------------------------------------------------------------------------------------------------------
# Regression: Demand estimation with Microsoft R Server
# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@ -38,7 +35,7 @@ if (!require("RevoScaleR")) {
quit()
}
# Initial some variables.
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
@ -47,81 +44,115 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
#---------------------------Step 1: Import the Bike Data---------------------------
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
# Remove timestamps and all columns that are part of the label.
# Remove timestamps and all columns that are
# part of the label.
varsToDrop = c("instant", "dteday", "casual", "registered"),
# Definite year, weather conditions and season columns as categorical.
# Define year, weather conditions and season columns
# as categorical.
colInfo = list(yr = list(type = "factor"),
weathersit = list(type = "factor"),
season = list(type = "factor")))
#---------------------------Step 2: Feature Engineering---------------------------
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
computeLagFeatures <- function (dataList) { # function for computing lag features.
# Step 2: Feature Engineering
# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
# lag feature names as lagN
varLagNameVector <- paste("lag", nLagsVector, sep="")
numLags <- length(nLagsVector) # total number of lags that need to be added
varLagNameVector <- paste("lag", nLagsVector, sep="") # lag feature names as lagN
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
if (!.rxIsTestChunk)
{
for (iL in 1:numLags)
{
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep="")
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
dataList[[varLagName]] <- allData[1:numRowsInChunk] # take the first N rows of allData, where N is the total number of rows in the original dataList
lagData[[varLagName]] <- tail(allData, nlags) # save last nlag rows as the new lagData to be used to process in the next chunk
.rxSet("storeLagData", lagData)
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
}
return(dataList)
if (!.rxIsTestChunk)
{
for (iL in 1:numLags) {
# Number of rows in the current chunk.
numRowsInChunk <- length(dataList[[varName]])
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep = "")
# Retrieve lag data from the previous chunk.
lagData <- .rxGet("storeLagData")
# Concatenate lagData and the "cnt" feature.
allData <- c(lagData[[varLagName]], dataList[[varName]])
# Take the first N rows of allData, where N is the total
# number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save last nlag rows as the new lagData to be used
# to process in the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
}
return(dataList)
}
# Apply the "computeLagFeatures" on the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt",
nLagsVector = seq(12)),
transformVars = "cnt", overwrite=TRUE)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0)
# Step 3: Prepare Training and Test Datasets
# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
splitByFactor = "yr", overwrite = TRUE,
reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE, seed = 123)
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
seed = 123)
# Step 5: Predict over new data and review the model performance
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
# Predict the probability on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
computeResiduals = TRUE)
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs+cnt_Resid_2+cnt_rel, data = predict, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
summaryStats = "Mean",
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
cnt_Resid_2 = cnt_Resid^2,
cnt_rel = abs(cnt_Resid)/cnt)
)$sDataFrame
)$sDataFrame
# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
measures <- data.frame(MAE = sum[1, 2],
RMSE = sqrt(sum[2, 2]),
RAE = sum[3, 2])
# Review the measures.
measures

Просмотреть файл

@ -1,58 +1,61 @@
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example is a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
# Regression: Demand estimation with Microsoft R Server
# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday etc.
#
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour.
#
# We built four models using the same algorithm, but with four different training datasets. The four training
# datasets that we constructed were all based on the same raw input data, but we added different additional
# features to each training set.
#
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.
# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.
# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
#
# Set C = number of bikes that were rented in each of the previous 12 days
# at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day
# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
#
# The four training datasets were built by combining the feature set as follows:
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.
# The four training datasets were built by combining the feature set
# as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
#
#--------------------------------------------------------------------------------------------------------------
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
}
@ -67,95 +70,126 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")
#---------------------------Step 1: Import Data---------------------------
# Step 1: Import Data
# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
bike_mrs <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
varsToDrop = c("instant", "dteday", "casual", "registered"))
varsToDrop = c("instant",
"dteday",
"casual",
"registered"))
# Edit Metadata: Definite year, weather conditions and season columns as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"),
overwrite = TRUE)
# Step 2: Feature Engineering
#---------------------------Step 2: Feature Engineering---------------------------
# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function (dataList) {
numLags <- length(nLagsVector) # total number of lags that need to be added
for (iL in 1:numLags)
{
nlag <- nLagsVector[iL]
varLagName <- paste("demand.",nlag,unit,sep="")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
if (numRowsToRead >= .rxStartRow)
{
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
for (iL in 1:numLags) {
nlag <- nLagsVector[iL]
varLagName <- paste("demand.", nlag, unit, sep = "")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
if (numRowsToRead >= .rxStartRow) {
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
}
# Determine the current row to start processing the data
# between chunks.
startRow <- .rxStartRow - numRowsToRead
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow,
numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = 1,
numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
previousRowsDataList[[baseVar]],
dataList[[baseVar]])[1:numRowsInChunk]
}
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow, numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file=.rxReadFileName,
varsToKeep = baseVar,
startRow = 1, numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]], previousRowsDataList[[baseVar]], dataList[[baseVar]])[1:numRowsInChunk]
}
return(dataList)
return(dataList)
}
# Create a function to add lag features a set of columns at a time.
addLag <- function(inputData, outputFileBase) {
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week",".xdf",sep="")
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
sep="")
# Initialize some fix values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Initialize some fix values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
rxDataStep(inData = inputFile,outFile = outputFileHour, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 hours
# (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",
nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
rxDataStep(inData = outputFileHour,outFile = outputFileHourDay, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "day",nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days
# at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "day",
nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"),
overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,outFile = outputFileHourDayWeek, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "week",nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,
outFile = outputFileHourDayWeek,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "week",
nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for the predicted day.
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
finalDataLag_dir <- addLag(inputData = editData_mrs,
outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Step 3: Prepare Training and Test Datasets
## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@ -168,41 +202,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
rxSplit(inData = finalDataLag_mrs,
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
## Set A:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs, importance = TRUE, seed = 123)
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
importance = TRUE, seed = 123)
## Set B:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr",
newDayFeatures,
newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
importance = TRUE, seed = 123)
## Set C:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
importance = TRUE, seed = 123)
## Set D:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
importance = TRUE, seed = 123)
# Step 5: Predict over new data
#---------------------------Step 5: Predict over new data---------------------------
## Set A:
# Predict the probability on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@ -231,17 +286,22 @@ rxPredict(dForestD_mrs, data = test_mrs,
residVarNames = "cnt_Resid_D",
overwrite = TRUE, computeResiduals = TRUE)
#---------------------------Prepare outputs---------------------------
# Prepare outputs
## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs+cnt_Resid_A_2+cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
data = testA_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
cnt_Resid_A_2 = cnt_Resid_A^2,
cnt_rel_A = abs(cnt_Resid_A)/cnt)
)$sDataFrame
)$sDataFrame
## Set B, C & D:
sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_Resid_C_2+cnt_rel_C+cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
data = test_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
cnt_Resid_B_2 = cnt_Resid_B^2,
@ -252,7 +312,7 @@ sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_R
cnt_Resid_D_abs = abs(cnt_Resid_D),
cnt_Resid_D_2 = cnt_Resid_D^2,
cnt_rel_D = abs(cnt_Resid_D)/cnt)
)$sDataFrame
)$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@ -263,7 +323,10 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
RMSE = c(sqrt(sumA[2, 2]),
sqrt(sum[2, 2]),
sqrt(sum[5, 2]),
sqrt(sum[8, 2])),
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
# Review the measures.

Просмотреть файл

@ -1,4 +1,6 @@
# Install a package if it's not already installed.
# This script shows how to import data into R that is referenced by a URL.
# Install the RCurl package if it's not already installed.
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))
# Load packages.

Просмотреть файл

@@ -3,16 +3,17 @@
# Check whether the "RevoScaleR" package is loaded in the current environment.
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
quit()
}
# A URL contains the raw data.

Просмотреть файл

@@ -1,11 +1,11 @@
# ----------------------------------------------------------------------------
# purpose: introduction to plotting using the ggplot2 package
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# This sample gives an introduction to plotting using the ggplot2 package.
# R has a number of built-in datasets
# In this example you use the dataset called quakes
# This data contains locations of earthquakes off Fiji
# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure
# R has a number of built-in datasets.
# In this example you use the dataset called quakes.
# This data contains locations of earthquakes off Fiji.
# Read the help page for more information
# Open the documentation page for the built-in quakes dataset.
help("quakes")
@@ -24,71 +24,55 @@ if (!require("mapproj"))
install.packages("mapproj") # required for map projections
library("mapproj")
# ----------------------------------------------------------------------------
# Starting to use the ggplot2 package
# ----------------------------------------------------------------------------
# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure
# Plot longitude and latitude of quakes
# To create a plot, you have to specify the data, then map aesthetics to
# columns in your data. In this example, you map the column long to the x-axis
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point() +
coord_map()
geom_point() +
coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
# of the points
# of the points.
# Map the depth column to the colour aesthetic
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth)) +
coord_map()
geom_point(aes(colour = depth)) +
coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag)) +
coord_map()
geom_point(aes(colour = depth, size = mag)) +
coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
# High values of alpha (close to 1) are opaque, while low values (close to 0)
# are translucent
# Add alpha level to hide overplotting, thus revealing detail
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
p3
# Change colour gradient by adding a gradient scale
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
# Now plot multiple plots on the same graphic
# The package "grid" is built into R and allows you to take control of the
# plotting area. A grob is the abbreviation for "graphical object", and the
@@ -100,38 +84,35 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
# ----------------------------------------------------------------------------
# The package "gtable" allows you to work with objects called grob tables.
# A grob table captures all the information needed to layout grobs in a table
# structure. It supports row and column spanning, offers some tools to
# automatically figure out the correct dimensions, and makes it easy to align
# and combine multiple tables.
if (!require("gtable")) install.packages("gtable")
if (!require("gtable"))
install.packages("gtable")
library(gtable)
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Top view")
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Top view")
plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Side view")
geom_point(alpha = 0.5) +
ggtitle("Side view")
platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Side view")
# Next, define a gtable and add grobs to the table
geom_point(alpha = 0.5) +
ggtitle("Side view")
# Next, define a gtable and add grobs to the table.
gt <- gtable(widths = unit(rep(1,2), "null"),
heights = unit(rep(1,2), "null"))
gt <- gtable_add_grob(gt,