This commit is contained in:
Brandon Rohrer 2016-03-15 16:06:28 -04:00
Parent ac4919044e
Commit 226842a54a
7 changed files with 375 additions and 297 deletions

View file

@@ -89,7 +89,7 @@ library(help = foreign)
# To install a new package, use install.packages()
# Install the ggplot2 package for its plotting capability.
if (!require("ggplot2"))
    install.packages("ggplot2")
# Then load the package.
library("ggplot2")
@@ -146,7 +146,7 @@ theme_set(theme_gray(base_size = 18))
# In this sample you use ggplot2.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue")
# Add a log scale.
ggplot(diamondSample, aes(x = carat, y = price)) +
@@ -155,9 +155,9 @@ ggplot(diamondSample, aes(x = carat, y = price)) +
# Add a log scale for both scales.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue") +
    scale_x_log10() +
    scale_y_log10()
### Linear Regression in R
@@ -177,10 +177,10 @@ exp(coef(model)[1])
# Show the model in a plot.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue") +
    geom_smooth(method = "lm", colour = "red", size = 2) +
    scale_x_log10() +
    scale_y_log10()
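For reference, the exp(coef(model)[1]) in the hunk header above implies the
model was fit on log scales. A minimal sketch of such a fit, assuming
diamondSample is a random sample of ggplot2's diamonds dataset (the sampling
line is an assumption; its construction is outside this hunk):

# Fit a linear model on the log-log scale, matching the plot above.
set.seed(123)
diamondSample <- diamonds[sample(nrow(diamonds), 5000), ]  # assumed setup
model <- lm(log(price) ~ log(carat), data = diamondSample)
# The intercept is on the log scale; exp() converts it back to price units.
exp(coef(model)[1])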
### Regression Diagnostics

View file

@@ -36,8 +36,8 @@ theme_set(theme_gray(base_size = 18))
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point() +
    coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
@@ -45,14 +45,14 @@ p0
# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth)) +
    coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag)) +
    coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
@@ -60,24 +60,24 @@ p2
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map()
p3
# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map() +
    scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    scale_colour_gradient(low = "blue", high = "red") +
    ggtitle("Distribution of earthquakes near Fiji") +
    coord_map()
p5
# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
@@ -89,10 +89,10 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
    ggplotGrob(p1),
    ggplotGrob(p2),
    ggplotGrob(p3),
    size = "last"
))
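To keep the combined panel, the same grid code can be drawn to a file
device. A short sketch; the filename and dimensions are illustrative:

# Save the three-panel figure produced above to a PNG file.
png("quakes_panels.png", width = 1200, height = 400)
plot.new()
grid.draw(cbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), size = "last"))
dev.off()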

View file

@@ -1,30 +1,27 @@
# Regression: Demand estimation with Microsoft R Server

# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.

# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.

# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.

# The following scripts include five basic steps of building this example
# using Microsoft R Server.

# Step 0: Get Started

# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@@ -38,7 +35,7 @@ if (!require("RevoScaleR")) {
    quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
@@ -47,81 +44,115 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
                 outFile = outFileBike, overwrite = TRUE,
                 missingValueString = "M", stringsAsFactors = FALSE,
                 # Remove timestamps and all columns that are
                 # part of the label.
                 varsToDrop = c("instant", "dteday", "casual", "registered"),
                 # Define year, weather conditions and season columns
                 # as categorical.
                 colInfo = list(yr = list(type = "factor"),
                                weathersit = list(type = "factor"),
                                season = list(type = "factor")))
# Step 2: Feature Engineering

# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
    # Total number of lags that need to be added.
    numLags <- length(nLagsVector)
    # Lag feature names, as lagN.
    varLagNameVector <- paste("lag", nLagsVector, sep = "")

    # Set the value of an object "storeLagData" in the transform environment.
    if (!exists("storeLagData")) {
        lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
        names(lagData) <- varLagNameVector
        .rxSet("storeLagData", lagData)
    }
    if (!.rxIsTestChunk) {
        for (iL in 1:numLags) {
            # Number of rows in the current chunk.
            numRowsInChunk <- length(dataList[[varName]])
            nlags <- nLagsVector[iL]
            varLagName <- paste("lag", nlags, sep = "")
            # Retrieve lag data from the previous chunk.
            lagData <- .rxGet("storeLagData")
            # Concatenate lagData and the "cnt" feature.
            allData <- c(lagData[[varLagName]], dataList[[varName]])
            # Take the first N rows of allData, where N is the total
            # number of rows in the original dataList.
            dataList[[varLagName]] <- allData[1:numRowsInChunk]
            # Save the last nlags rows as the new lagData, to be used
            # when processing the next chunk.
            lagData[[varLagName]] <- tail(allData, nlags)
            .rxSet("storeLagData", lagData)
        }
    }
    return(dataList)
}
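Before applying the transform, here is the lag logic on a toy vector in
plain R, assuming a single chunk. Row 1 is padded with the first
observation, just as computeLagFeatures initializes storeLagData:

# lag1 holds the previous hour's count; the first row repeats cnt[1].
cnt <- c(16, 40, 32, 13, 1)
lag1 <- c(cnt[1], head(cnt, -1))
data.frame(cnt, lag1)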
# Apply the "computeLagFeatures" on the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
                      transformFunc = computeLagFeatures,
                      transformObjects = list(varName = "cnt",
                                              nLagsVector = seq(12)),
                      transformVars = "cnt", overwrite = TRUE)
# Step 3: Prepare Training and Test Datasets

# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
        splitByFactor = "yr", overwrite = TRUE,
        reportProgress = 0, verbose = 0)

# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
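A quick sanity check, not part of the script, that the split wrote the two
.xdf files referenced above:

# List the files rxSplit created in the temporary directory.
list.files(td, pattern = "^modelData")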
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)

# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
                        varsToDrop = c("RowNum", "yr"))

# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
                     seed = 123)
# Step 5: Predict over new data and review the model performance

# Predict on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
                     computeResiduals = TRUE)

# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
                 summaryStats = "Mean",
                 transforms = list(cnt_Resid_abs = abs(cnt_Resid),
                                   cnt_Resid_2 = cnt_Resid^2,
                                   cnt_rel = abs(cnt_Resid)/cnt)
                 )$sDataFrame

# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2],
                       RMSE = sqrt(sum[2, 2]),
                       RAE = sum[3, 2])

# Review the measures.
measures
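Written out directly, the three measures as this script defines them,
computed on toy vectors of actuals and predictions (plain R, for reference):

# MAE, RMSE, and RAE without RevoScaleR; resid mirrors cnt_Resid above.
actual <- c(100, 150, 80)
predicted <- c(90, 160, 85)
resid <- actual - predicted
c(MAE = mean(abs(resid)),
  RMSE = sqrt(mean(resid^2)),
  RAE = mean(abs(resid) / actual))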

View file

@@ -1,58 +1,61 @@
# Regression: Demand estimation with Microsoft R Server

# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.

# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.

# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.

# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.

# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days
#         at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
#         at the same hour and the same day

# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.

# The four training datasets were built by combining the feature sets
# as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D

# The following scripts include five basic steps of building this example
# using Microsoft R Server.

# Step 0: Get Started

# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
    quit()
}
@@ -67,95 +70,126 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")

# Step 1: Import Data

# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL,
                     outFile = outFileBike, overwrite = TRUE,
                     missingValueString = "M", stringsAsFactors = FALSE,
                     varsToDrop = c("instant",
                                    "dteday",
                                    "casual",
                                    "registered"))

# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
                          sortLevels = TRUE,
                          factorInfo = c("yr", "weathersit", "season"),
                          overwrite = TRUE)
# Step 2: Feature Engineering

# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function(dataList) {
    # Total number of lags that need to be added.
    numLags <- length(nLagsVector)
    for (iL in 1:numLags) {
        nlag <- nLagsVector[iL]
        varLagName <- paste("demand.", nlag, unit, sep = "")
        numRowsInChunk <- length(dataList[[baseVar]])
        numRowsToRead <- nlag * interval
        numRowsPadding <- 0
        if (numRowsToRead >= .rxStartRow) {
            numRowsToRead <- .rxStartRow - 1
            numRowsPadding <- nlag * interval - numRowsToRead
        }
        # Determine the current row to start processing the data
        # between chunks.
        startRow <- .rxStartRow - numRowsToRead
        previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
                                          varsToKeep = baseVar,
                                          startRow = startRow,
                                          numRows = numRowsToRead,
                                          returnDataFrame = FALSE)
        paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
                                         varsToKeep = baseVar,
                                         startRow = 1,
                                         numRows = numRowsPadding,
                                         returnDataFrame = FALSE)
        dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
                                    previousRowsDataList[[baseVar]],
                                    dataList[[baseVar]])[1:numRowsInChunk]
    }
    return(dataList)
}
# Create a function to add lag features, one set of columns at a time.
addLag <- function(inputData, outputFileBase) {
    inputFile <- inputData
    outputFileHour <- paste(outputFileBase, "_hour", ".xdf", sep = "")
    outputFileHourDay <- paste(outputFileBase, "_hour_day", ".xdf", sep = "")
    outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
                                   sep = "")

    # Initialize some fixed values.
    hourInterval <- 1
    dayInterval <- 24
    weekInterval <- 168
    previous <- 12

    # Add the number of bikes that were rented in each of the previous
    # 12 hours (for Set B).
    rxDataStep(inData = inputFile, outFile = outputFileHour,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt", unit = "hour",
                                       nLagsVector = seq(12),
                                       interval = hourInterval),
               transformVars = c("cnt"), overwrite = TRUE)

    # Add the number of bikes that were rented in each of the previous
    # 12 days at the same hour (for Set C).
    rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt",
                                       unit = "day",
                                       nLagsVector = seq(12),
                                       interval = dayInterval),
               transformVars = c("cnt"),
               overwrite = TRUE)

    # Add the number of bikes that were rented in each of the previous
    # 12 weeks at the same hour and the same day (for Set D).
    rxDataStep(inData = outputFileHourDay,
               outFile = outputFileHourDayWeek,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt",
                                       unit = "week",
                                       nLagsVector = seq(12),
                                       interval = weekInterval),
               transformVars = c("cnt"), overwrite = TRUE)

    file.remove(outputFileHour)
    file.remove(outputFileHourDay)

    return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs,
                           outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)

# Step 3: Prepare Training and Test Datasets

## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@@ -168,41 +202,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs,
        outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
        overwrite = TRUE, reportProgress = 0, verbose = 0)

# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)

newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")

## Set A:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
                          importance = TRUE, seed = 123)

## Set B:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr",
                                    newDayFeatures,
                                    newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)

## Set C:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)

## Set D:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)
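To compare what each feature set contributed, the fitted forests can be
inspected for variable importance. This call is illustrative, not part of
the commit, and assumes RevoScaleR's rxVarImpPlot is available in this
RRE 8.0 install:

# Plot variable importance for the richest model (Set A+B+C+D features).
# Requires the importance = TRUE argument used in the fits above.
rxVarImpPlot(dForestD_mrs)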
# Step 5: Predict over new data

## Set A:
# Predict on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@@ -231,17 +286,22 @@ rxPredict(dForestD_mrs, data = test_mrs,
          residVarNames = "cnt_Resid_D",
          overwrite = TRUE, computeResiduals = TRUE)
# Prepare outputs

## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
                  data = testA_mrs, summaryStats = "Mean",
                  transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
                                    cnt_Resid_A_2 = cnt_Resid_A^2,
                                    cnt_rel_A = abs(cnt_Resid_A)/cnt)
                  )$sDataFrame

## Set B, C & D:
sum <- rxSummary(~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
                   cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
                   cnt_Resid_D_abs + cnt_Resid_D_2 + cnt_rel_D,
                 data = test_mrs, summaryStats = "Mean",
                 transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
                                   cnt_Resid_B_2 = cnt_Resid_B^2,
@@ -252,7 +312,7 @@ sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_R
                                   cnt_Resid_D_abs = abs(cnt_Resid_D),
                                   cnt_Resid_D_2 = cnt_Resid_D^2,
                                   cnt_rel_D = abs(cnt_Resid_D)/cnt)
                 )$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@@ -263,7 +323,10 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
                       MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
                       RMSE = c(sqrt(sumA[2, 2]),
                                sqrt(sum[2, 2]),
                                sqrt(sum[5, 2]),
                                sqrt(sum[8, 2])),
                       RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))

# Review the measures.

View file

@@ -1,4 +1,6 @@
# This script shows how to import data into R that is referenced by a URL.

# Install the RCurl package if it's not already installed.
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))

# Load packages.
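The pattern this script builds up to, sketched with the dataset URL used
elsewhere in this commit (any CSV URL works; the object names are
illustrative):

# Fetch a CSV over HTTPS with RCurl and parse it into a data frame.
library("RCurl")
url <- paste0("https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/",
              "examples/MRS_and_Machine_Learning/Datasets/",
              "Bike_Rental_UCI_Dataset.csv")
csvText <- getURL(url)
bikes <- read.csv(text = csvText)
head(bikes)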

View file

@@ -3,16 +3,17 @@
# Check whether the "RevoScaleR" package is loaded in the current environment.
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
    quit()
}

# A URL contains the raw data.

View file

@@ -1,11 +1,11 @@
# This sample gives an introduction to plotting using the ggplot2 package.

# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure.

# R has a number of built-in datasets.
# In this example you use the dataset called quakes.
# This data contains locations of earthquakes off Fiji.

# Read the help page for more information
?quakes
@@ -24,71 +24,55 @@ if (!require("mapproj"))
    install.packages("mapproj") # required for map projections
library("mapproj")

# Plot longitude and latitude of quakes
# To create a plot, you have to specify the data, then map aesthetics to
# columns in your data. In this example, you map the column long to the x-axis
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point() +
    coord_map()
p0

# You can use a number of different aesthetics, for example colour or size
# of the points.

# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth)) +
    coord_map()
p1

# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag)) +
    coord_map()
p2

# You can control the transparency of a plot object with the alpha aesthetic.
# High values of alpha (close to 1) are opaque, while low values (close to 0)
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map()
p3

# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map() +
    scale_colour_gradient(low = "blue", high = "red")
p4

# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    scale_colour_gradient(low = "blue", high = "red") +
    ggtitle("Distribution of earthquakes near Fiji") +
    coord_map()
p5

# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
# plotting area. A grob is the abbreviation for "graphical object", and the
@@ -100,38 +84,35 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
    ggplotGrob(p1),
    ggplotGrob(p2),
    ggplotGrob(p3),
    size = "last"
))
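The same technique stacks plots vertically: rbind combines the grob tables
row-wise just as cbind combined them column-wise. A sketch under the same
assumptions as the block above:

# Stack two of the plots on top of each other.
plot.new()
grid.draw(rbind(
    ggplotGrob(p4),
    ggplotGrob(p5),
    size = "last"
))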
# The package "gtable" allows you to work with objects called grob tables.
# A grob table captures all the information needed to layout grobs in a table
# structure. It supports row and column spanning, offers some tools to
# automatically figure out the correct dimensions, and makes it easy to align
# and combine multiple tables.
if (!require("gtable"))
    install.packages("gtable")
library(gtable)

plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Top view")

plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Side view")

platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Side view")

# Next, define a gtable and add grobs to the table.
gt <- gtable(widths = unit(rep(1, 2), "null"),
             heights = unit(rep(1, 2), "null"))
gt <- gtable_add_grob(gt,