зеркало из https://github.com/microsoft/RTVS-docs.git
proofreading
This commit is contained in:
Родитель
ac4919044e
Коммит
226842a54a
|
@ -89,7 +89,7 @@ library(help = foreign)
|
||||||
# To install a new package, use install.packages()
|
# To install a new package, use install.packages()
|
||||||
# Install the ggplot2 package for it's plotting capability.
|
# Install the ggplot2 package for it's plotting capability.
|
||||||
if (!require("ggplot2"))
|
if (!require("ggplot2"))
|
||||||
install.packages("ggplot2")
|
install.packages("ggplot2")
|
||||||
|
|
||||||
# Then load the package.
|
# Then load the package.
|
||||||
library("ggplot2")
|
library("ggplot2")
|
||||||
|
@ -146,7 +146,7 @@ theme_set(theme_gray(base_size = 18))
|
||||||
|
|
||||||
# In this sample you use ggplot2.
|
# In this sample you use ggplot2.
|
||||||
ggplot(diamondSample, aes(x = carat, y = price)) +
|
ggplot(diamondSample, aes(x = carat, y = price)) +
|
||||||
geom_point(colour = "blue")
|
geom_point(colour = "blue")
|
||||||
|
|
||||||
# Add a log scale.
|
# Add a log scale.
|
||||||
ggplot(diamondSample, aes(x = carat, y = price)) +
|
ggplot(diamondSample, aes(x = carat, y = price)) +
|
||||||
|
@ -155,9 +155,9 @@ ggplot(diamondSample, aes(x = carat, y = price)) +
|
||||||
|
|
||||||
# Add a log scale for both scales.
|
# Add a log scale for both scales.
|
||||||
ggplot(diamondSample, aes(x = carat, y = price)) +
|
ggplot(diamondSample, aes(x = carat, y = price)) +
|
||||||
geom_point(colour = "blue") +
|
geom_point(colour = "blue") +
|
||||||
scale_x_log10() +
|
scale_x_log10() +
|
||||||
scale_y_log10()
|
scale_y_log10()
|
||||||
|
|
||||||
### Linear Regression in R
|
### Linear Regression in R
|
||||||
|
|
||||||
|
@ -177,10 +177,10 @@ exp(coef(model)[1])
|
||||||
|
|
||||||
# Show the model in a plot.
|
# Show the model in a plot.
|
||||||
ggplot(diamondSample, aes(x = carat, y = price)) +
|
ggplot(diamondSample, aes(x = carat, y = price)) +
|
||||||
geom_point(colour = "blue") +
|
geom_point(colour = "blue") +
|
||||||
geom_smooth(method = "lm", colour = "red", size = 2) +
|
geom_smooth(method = "lm", colour = "red", size = 2) +
|
||||||
scale_x_log10() +
|
scale_x_log10() +
|
||||||
scale_y_log10()
|
scale_y_log10()
|
||||||
|
|
||||||
|
|
||||||
### Regression Diagnostics
|
### Regression Diagnostics
|
||||||
|
|
|
@ -36,8 +36,8 @@ theme_set(theme_gray(base_size = 18))
|
||||||
# and lat to the y-axis.
|
# and lat to the y-axis.
|
||||||
# Then you add a layer with points (geom_point) and a layer to plot maps.
|
# Then you add a layer with points (geom_point) and a layer to plot maps.
|
||||||
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point() +
|
geom_point() +
|
||||||
coord_map()
|
coord_map()
|
||||||
p0
|
p0
|
||||||
|
|
||||||
# You can use a number of different aesthetics, for example colour or size
|
# You can use a number of different aesthetics, for example colour or size
|
||||||
|
@ -45,14 +45,14 @@ p0
|
||||||
|
|
||||||
# Map the depth column to the colour aesthetic.
|
# Map the depth column to the colour aesthetic.
|
||||||
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth)) +
|
geom_point(aes(colour = depth)) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p1
|
p1
|
||||||
|
|
||||||
# Add size for magnitude. The bigger the magnitude, the larger the point.
|
# Add size for magnitude. The bigger the magnitude, the larger the point.
|
||||||
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag)) +
|
geom_point(aes(colour = depth, size = mag)) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p2
|
p2
|
||||||
|
|
||||||
# You can control the transparancy of a plot object with the alpha aesthetic.
|
# You can control the transparancy of a plot object with the alpha aesthetic.
|
||||||
|
@ -60,24 +60,24 @@ p2
|
||||||
# are translucent.
|
# are translucent.
|
||||||
# Add alpha level to hide overplotting, thus revealing detail.
|
# Add alpha level to hide overplotting, thus revealing detail.
|
||||||
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p3
|
p3
|
||||||
|
|
||||||
# Change colour gradient by adding a gradient scale.
|
# Change colour gradient by adding a gradient scale.
|
||||||
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
coord_map() +
|
coord_map() +
|
||||||
scale_colour_gradient(low = "blue", high = "red")
|
scale_colour_gradient(low = "blue", high = "red")
|
||||||
p4
|
p4
|
||||||
|
|
||||||
# Add a plot title.
|
# Add a plot title.
|
||||||
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
scale_colour_gradient(low = "blue", high = "red") +
|
scale_colour_gradient(low = "blue", high = "red") +
|
||||||
ggtitle("Distribution of earthquakes near Fiji") +
|
ggtitle("Distribution of earthquakes near Fiji") +
|
||||||
coord_map()
|
coord_map()
|
||||||
p5
|
p5
|
||||||
|
|
||||||
# Now plot multiple plots on the same graphic.
|
# Now plot multiple plots on the same graphic.
|
||||||
# The package "grid" is built into R and allows you to take control of the
|
# The package "grid" is built into R and allows you to take control of the
|
||||||
|
@ -89,10 +89,10 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
|
||||||
library(grid)
|
library(grid)
|
||||||
plot.new()
|
plot.new()
|
||||||
grid.draw(cbind(
|
grid.draw(cbind(
|
||||||
ggplotGrob(p1),
|
ggplotGrob(p1),
|
||||||
ggplotGrob(p2),
|
ggplotGrob(p2),
|
||||||
ggplotGrob(p3),
|
ggplotGrob(p3),
|
||||||
size = "last"
|
size = "last"
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,30 +1,27 @@
|
||||||
#-------------------------------------------------------------------------------------------------------------
|
# Regression: Demand estimation with Microsoft R Server
|
||||||
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
|
|
||||||
#-------------------------------------------------------------------------------------------------------------
|
# This example demonstrates the feature engineering process for building
|
||||||
#
|
# a regression model to predict bike rental demand.
|
||||||
#
|
|
||||||
# This example demonstrates the feature engineering process for building a regression model to predict
|
# The dataset contains 17,379 rows and 17 columns, each row representing
|
||||||
# bike rental demand.
|
# the number of bike rentals within a specific hour of a day in the years
|
||||||
#
|
# 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||||
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
|
# and wind speed) were included in this raw feature set, and
|
||||||
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
# the dates were categorized as holiday vs. weekday, etc.
|
||||||
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
|
|
||||||
# weekday etc.
|
# The field to predict is "cnt", which contains a count value ranging
|
||||||
#
|
# from 1 to 977, representing the number of bike rentals within
|
||||||
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
|
# a specific hour. The lag features we add in the data set are the number
|
||||||
# of bike rentals within a specific hour. The lag features we add in the data set is the number of bikes that
|
# of bikes that were rented in each of the previous 12 hours.
|
||||||
# were rented in each of the previous 12 hours, which caputures the very recent demand for the bikes.
|
# This caputures the very recent demand for the bikes.
|
||||||
#
|
|
||||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
# The following scripts include five basic steps of building this example
|
||||||
#
|
# using Microsoft R Server.
|
||||||
#
|
|
||||||
#--------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
#---------------------------Step 0: Get Started-------------------------------
|
# Step 0: Get Started
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
# Check if Microsoft R Server (RRE 8.0) is installed
|
# Check whether Microsoft R Server (RRE 8.0) is installed
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
if (!require("RevoScaleR")) {
|
if (!require("RevoScaleR")) {
|
||||||
cat("RevoScaleR package does not seem to exist.
|
cat("RevoScaleR package does not seem to exist.
|
||||||
\nThis means that the functions starting with 'rx' will not run.
|
\nThis means that the functions starting with 'rx' will not run.
|
||||||
|
@ -38,7 +35,7 @@ if (!require("RevoScaleR")) {
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initial some variables.
|
# Initialize some variables.
|
||||||
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
|
||||||
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
|
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
|
||||||
|
|
||||||
|
@ -47,81 +44,115 @@ td <- tempdir()
|
||||||
outFileBike <- paste0(td, "/bike.xdf")
|
outFileBike <- paste0(td, "/bike.xdf")
|
||||||
outFileLag <- paste0(td, "/lagData.xdf")
|
outFileLag <- paste0(td, "/lagData.xdf")
|
||||||
|
|
||||||
#---------------------------Step 1: Import the Bike Data---------------------------
|
|
||||||
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
|
# Step 1: Import the Bike Data
|
||||||
|
|
||||||
|
bike <- rxImport(inData = inputFileBikeURL,
|
||||||
|
outFile = outFileBike, overwrite = TRUE,
|
||||||
missingValueString = "M", stringsAsFactors = FALSE,
|
missingValueString = "M", stringsAsFactors = FALSE,
|
||||||
# Remove timestamps and all columns that are part of the label.
|
# Remove timestamps and all columns that are
|
||||||
|
# part of the label.
|
||||||
varsToDrop = c("instant", "dteday", "casual", "registered"),
|
varsToDrop = c("instant", "dteday", "casual", "registered"),
|
||||||
# Definite year, weather conditions and season columns as categorical.
|
# Define year, weather conditions and season columns
|
||||||
|
# as categorical.
|
||||||
colInfo = list(yr = list(type = "factor"),
|
colInfo = list(yr = list(type = "factor"),
|
||||||
weathersit = list(type = "factor"),
|
weathersit = list(type = "factor"),
|
||||||
season = list(type = "factor")))
|
season = list(type = "factor")))
|
||||||
|
|
||||||
#---------------------------Step 2: Feature Engineering---------------------------
|
|
||||||
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
|
# Step 2: Feature Engineering
|
||||||
computeLagFeatures <- function (dataList) { # function for computing lag features.
|
|
||||||
|
# Add the number of bikes that were rented in each of the previous
|
||||||
|
# 12 hours as 12 lag features.
|
||||||
|
computeLagFeatures <- function(dataList) {
|
||||||
|
# Total number of lags that need to be added.
|
||||||
|
numLags <- length(nLagsVector)
|
||||||
|
# lag feature names as lagN
|
||||||
|
varLagNameVector <- paste("lag", nLagsVector, sep="")
|
||||||
|
|
||||||
numLags <- length(nLagsVector) # total number of lags that need to be added
|
# Set the value of an object "storeLagData" in the transform environment.
|
||||||
varLagNameVector <- paste("lag", nLagsVector, sep="") # lag feature names as lagN
|
if (!exists("storeLagData"))
|
||||||
|
{
|
||||||
# Set the value of an object "storeLagData" in the transform environment.
|
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
|
||||||
if (!exists("storeLagData"))
|
names(lagData) <- varLagNameVector
|
||||||
{
|
.rxSet("storeLagData",lagData)
|
||||||
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
|
|
||||||
names(lagData) <- varLagNameVector
|
|
||||||
.rxSet("storeLagData",lagData)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!.rxIsTestChunk)
|
|
||||||
{
|
|
||||||
for (iL in 1:numLags)
|
|
||||||
{
|
|
||||||
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
|
|
||||||
nlags <- nLagsVector[iL]
|
|
||||||
varLagName <- paste("lag", nlags, sep="")
|
|
||||||
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
|
|
||||||
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
|
|
||||||
dataList[[varLagName]] <- allData[1:numRowsInChunk] # take the first N rows of allData, where N is the total number of rows in the original dataList
|
|
||||||
lagData[[varLagName]] <- tail(allData, nlags) # save last nlag rows as the new lagData to be used to process in the next chunk
|
|
||||||
.rxSet("storeLagData", lagData)
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return(dataList)
|
if (!.rxIsTestChunk)
|
||||||
|
{
|
||||||
|
for (iL in 1:numLags) {
|
||||||
|
# Number of rows in the current chunk.
|
||||||
|
numRowsInChunk <- length(dataList[[varName]])
|
||||||
|
nlags <- nLagsVector[iL]
|
||||||
|
varLagName <- paste("lag", nlags, sep = "")
|
||||||
|
# Retrieve lag data from the previous chunk.
|
||||||
|
lagData <- .rxGet("storeLagData")
|
||||||
|
# Concatenate lagData and the "cnt" feature.
|
||||||
|
allData <- c(lagData[[varLagName]], dataList[[varName]])
|
||||||
|
# Take the first N rows of allData, where N is the total
|
||||||
|
# number of rows in the original dataList.
|
||||||
|
dataList[[varLagName]] <- allData[1:numRowsInChunk]
|
||||||
|
# Save last nlag rows as the new lagData to be used
|
||||||
|
# to process in the next chunk.
|
||||||
|
lagData[[varLagName]] <- tail(allData, nlags)
|
||||||
|
.rxSet("storeLagData", lagData)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return(dataList)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Apply the "computeLagFeatures" on the bike data.
|
# Apply the "computeLagFeatures" on the bike data.
|
||||||
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
|
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
|
||||||
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
|
transformFunc = computeLagFeatures,
|
||||||
|
transformObjects = list(varName = "cnt",
|
||||||
|
nLagsVector = seq(12)),
|
||||||
transformVars = "cnt", overwrite=TRUE)
|
transformVars = "cnt", overwrite=TRUE)
|
||||||
|
|
||||||
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
|
|
||||||
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
|
# Step 3: Prepare Training and Test Datasets
|
||||||
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0)
|
|
||||||
|
# Split data by "yr" so that the training data contains records
|
||||||
|
# for the year 2011 and the test data contains records for 2012.
|
||||||
|
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
|
||||||
|
splitByFactor = "yr", overwrite = TRUE,
|
||||||
|
reportProgress = 0, verbose = 0)
|
||||||
|
|
||||||
# Point to the .xdf files for the training and test set.
|
# Point to the .xdf files for the training and test set.
|
||||||
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
|
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
|
||||||
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
|
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
|
||||||
|
|
||||||
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
|
|
||||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
|
||||||
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
|
|
||||||
|
# Build a formula for the regression model and remove the "yr",
|
||||||
|
# which is used to split the training and test data.
|
||||||
|
modelFormula <- formula(train, depVars = "cnt",
|
||||||
|
varsToDrop = c("RowNum", "yr"))
|
||||||
|
|
||||||
# Fit a Decision Forest Regression model on the training data.
|
# Fit a Decision Forest Regression model on the training data.
|
||||||
dForest <- rxDForest(modelFormula, data = train, importance = TRUE, seed = 123)
|
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
|
||||||
|
seed = 123)
|
||||||
|
|
||||||
|
|
||||||
|
# Step 5: Predict over new data and review the model performance
|
||||||
|
|
||||||
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
|
|
||||||
# Predict the probability on the test dataset.
|
# Predict the probability on the test dataset.
|
||||||
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
|
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
|
||||||
|
computeResiduals = TRUE)
|
||||||
|
|
||||||
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
# Calculate three statistical measures: Mean Absolute Error (MAE),
|
||||||
sum <- rxSummary(~ cnt_Resid_abs+cnt_Resid_2+cnt_rel, data = predict, summaryStats = "Mean",
|
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||||
|
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
|
||||||
|
summaryStats = "Mean",
|
||||||
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
|
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
|
||||||
cnt_Resid_2 = cnt_Resid^2,
|
cnt_Resid_2 = cnt_Resid^2,
|
||||||
cnt_rel = abs(cnt_Resid)/cnt)
|
cnt_rel = abs(cnt_Resid)/cnt)
|
||||||
)$sDataFrame
|
)$sDataFrame
|
||||||
|
|
||||||
# List all measures in a data frame.
|
# List all measures in a data frame.
|
||||||
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
|
measures <- data.frame(MAE = sum[1, 2],
|
||||||
|
RMSE = sqrt(sum[2, 2]),
|
||||||
|
RAE = sum[3, 2])
|
||||||
|
|
||||||
# Review the measures.
|
# Review the measures.
|
||||||
measures
|
measures
|
||||||
|
|
|
@ -1,58 +1,61 @@
|
||||||
#-------------------------------------------------------------------------------------------------------------
|
# Regression: Demand estimation with Microsoft R Server
|
||||||
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
|
|
||||||
#-------------------------------------------------------------------------------------------------------------
|
# This example a replication of an existing Azure Machine Learning
|
||||||
#
|
# Experiment - Regression: Demand Estimation
|
||||||
#
|
|
||||||
# This example a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
|
|
||||||
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
|
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
|
||||||
#
|
|
||||||
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
|
# The dataset contains 17,379 rows and 17 columns, each row representing
|
||||||
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
# the number of bike rentals within a given hour of a given day in
|
||||||
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
|
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
|
||||||
# weekday etc.
|
# and wind speed) were included in this raw feature set, and the dates
|
||||||
#
|
# were categorized as holiday vs. weekday, etc.
|
||||||
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
|
|
||||||
# of bike rentals within a specific hour.
|
# The field to predict is "cnt", which contains a count value ranging
|
||||||
#
|
# from 1 to 977, representing the number of bike rentals within
|
||||||
# We built four models using the same algorithm, but with four different training datasets. The four training
|
# a given hour.
|
||||||
# datasets that we constructed were all based on the same raw input data, but we added different additional
|
|
||||||
# features to each training set.
|
# We built four models using the same algorithm, but with four different
|
||||||
#
|
# training datasets. The four training datasets that we constructed
|
||||||
|
# were all based on the same raw input data, but we added different
|
||||||
|
# additional features to each training set.
|
||||||
|
|
||||||
# Set A = weather + holiday + weekday + weekend features for the predicted day
|
# Set A = weather + holiday + weekday + weekend features for the predicted day
|
||||||
# Set B = number of bikes that were rented in each of the previous 12 hours
|
# Set B = number of bikes that were rented in each of the previous 12 hours
|
||||||
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
|
# Set C = number of bikes that were rented in each of the previous 12 days
|
||||||
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
|
# at the same hour
|
||||||
#
|
# Set D = number of bikes that were rented in each of the previous 12 weeks
|
||||||
|
# at the same hour and the same day
|
||||||
|
|
||||||
# Each of these feature sets captures different aspects of the problem:
|
# Each of these feature sets captures different aspects of the problem:
|
||||||
# Feature set B captures very recent demand for the bikes.
|
# Feature set B captures very recent demand for the bikes.
|
||||||
# Feature set C captures the demand for bikes at a particular hour.
|
# Feature set C captures the demand for bikes at a particular hour.
|
||||||
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
|
# Feature set D captures demand for bikes at a particular hour and
|
||||||
#
|
# particular day of the week.
|
||||||
# The four training datasets were built by combining the feature set as follows:
|
|
||||||
|
# The four training datasets were built by combining the feature set
|
||||||
|
# as follows:
|
||||||
# Training set 1: feature set A only
|
# Training set 1: feature set A only
|
||||||
# Training set 2: feature sets A+B
|
# Training set 2: feature sets A+B
|
||||||
# Training set 3: feature sets A+B+C
|
# Training set 3: feature sets A+B+C
|
||||||
# Training set 4: feature sets A+B+C+D
|
# Training set 4: feature sets A+B+C+D
|
||||||
#
|
|
||||||
# The following scripts include five basic steps of building this example using Microsoft R Server.
|
# The following scripts include five basic steps of building this example
|
||||||
#
|
# using Microsoft R Server.
|
||||||
#
|
|
||||||
#--------------------------------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
#---------------------------Step 0: Get Started-------------------------------
|
# Step 0: Get Started
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
# Check if Microsoft R Server (RRE 8.0) is installed
|
# Check whether Microsoft R Server (RRE 8.0) is installed
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
if (!require("RevoScaleR")) {
|
if (!require("RevoScaleR")) {
|
||||||
cat("RevoScaleR package does not seem to exist.
|
cat("RevoScaleR package does not seem to exist.
|
||||||
\nThis means that the functions starting with 'rx' will not run.
|
\nThis means that the functions starting with 'rx' will not run.
|
||||||
\nIf you have Microsoft R Server installed, please switch the R engine.
|
\nIf you have Microsoft R Server installed,
|
||||||
\nFor example, in R Tools for Visual Studio:
|
\nplease switch the R engine.
|
||||||
\nR Tools -> Options -> R Engine.
|
\nFor example, in R Tools for Visual Studio:
|
||||||
\nIf Microsoft R Server is not installed, you can download it from:
|
\nR Tools -> Options -> R Engine.
|
||||||
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
\nIf Microsoft R Server is not installed, you can download it from:
|
||||||
\n")
|
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
||||||
|
\n")
|
||||||
|
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
@ -67,95 +70,126 @@ outFileBike <- paste0(td, "/bike.xdf")
|
||||||
outFileEdit <- paste0(td, "/editData.xdf")
|
outFileEdit <- paste0(td, "/editData.xdf")
|
||||||
outFileLag <- paste0(td, "/lagData")
|
outFileLag <- paste0(td, "/lagData")
|
||||||
|
|
||||||
#---------------------------Step 1: Import Data---------------------------
|
|
||||||
|
# Step 1: Import Data
|
||||||
|
|
||||||
# Import the bike data.
|
# Import the bike data.
|
||||||
# Remove timestamps and all columns that are part of the label.
|
# Remove timestamps and all columns that are part of the label.
|
||||||
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
|
bike_mrs <- rxImport(inData = inputFileBikeURL,
|
||||||
|
outFile = outFileBike, overwrite = TRUE,
|
||||||
missingValueString = "M", stringsAsFactors = FALSE,
|
missingValueString = "M", stringsAsFactors = FALSE,
|
||||||
varsToDrop = c("instant", "dteday", "casual", "registered"))
|
varsToDrop = c("instant",
|
||||||
|
"dteday",
|
||||||
|
"casual",
|
||||||
|
"registered"))
|
||||||
|
|
||||||
# Edit Metadata: Definite year, weather conditions and season columns as categorical.
|
# Edit Metadata: Define year, weather conditions and season columns
|
||||||
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
|
# as categorical.
|
||||||
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
|
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
|
||||||
|
sortLevels = TRUE,
|
||||||
|
factorInfo = c("yr", "weathersit", "season"),
|
||||||
|
overwrite = TRUE)
|
||||||
|
|
||||||
|
|
||||||
|
# Step 2: Feature Engineering
|
||||||
|
|
||||||
#---------------------------Step 2: Feature Engineering---------------------------
|
|
||||||
# Create a function to construct lag features for four different aspects.
|
# Create a function to construct lag features for four different aspects.
|
||||||
computeLagFeatures <- function (dataList) {
|
computeLagFeatures <- function (dataList) {
|
||||||
|
# Total number of lags that need to be added.
|
||||||
numLags <- length(nLagsVector) # total number of lags that need to be added
|
numLags <- length(nLagsVector)
|
||||||
for (iL in 1:numLags)
|
for (iL in 1:numLags) {
|
||||||
{
|
nlag <- nLagsVector[iL]
|
||||||
nlag <- nLagsVector[iL]
|
varLagName <- paste("demand.", nlag, unit, sep = "")
|
||||||
varLagName <- paste("demand.",nlag,unit,sep="")
|
numRowsInChunk <- length(dataList[[baseVar]])
|
||||||
numRowsInChunk <- length(dataList[[baseVar]])
|
numRowsToRead <- nlag * interval
|
||||||
numRowsToRead <- nlag * interval
|
numRowsPadding <- 0
|
||||||
numRowsPadding <- 0
|
if (numRowsToRead >= .rxStartRow) {
|
||||||
if (numRowsToRead >= .rxStartRow)
|
numRowsToRead <- .rxStartRow - 1
|
||||||
{
|
numRowsPadding <- nlag * interval - numRowsToRead
|
||||||
numRowsToRead <- .rxStartRow - 1
|
}
|
||||||
numRowsPadding <- nlag * interval - numRowsToRead
|
# Determine the current row to start processing the data
|
||||||
|
# between chunks.
|
||||||
|
startRow <- .rxStartRow - numRowsToRead
|
||||||
|
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
|
||||||
|
varsToKeep = baseVar,
|
||||||
|
startRow = startRow,
|
||||||
|
numRows = numRowsToRead,
|
||||||
|
returnDataFrame = FALSE)
|
||||||
|
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
|
||||||
|
varsToKeep = baseVar,
|
||||||
|
startRow = 1,
|
||||||
|
numRows = numRowsPadding,
|
||||||
|
returnDataFrame = FALSE)
|
||||||
|
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
|
||||||
|
previousRowsDataList[[baseVar]],
|
||||||
|
dataList[[baseVar]])[1:numRowsInChunk]
|
||||||
}
|
}
|
||||||
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
|
return(dataList)
|
||||||
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
|
|
||||||
varsToKeep = baseVar,
|
|
||||||
startRow = startRow, numRows = numRowsToRead,
|
|
||||||
returnDataFrame = FALSE)
|
|
||||||
paddingRowsDataList <- rxReadXdf(file=.rxReadFileName,
|
|
||||||
varsToKeep = baseVar,
|
|
||||||
startRow = 1, numRows = numRowsPadding,
|
|
||||||
returnDataFrame = FALSE)
|
|
||||||
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]], previousRowsDataList[[baseVar]], dataList[[baseVar]])[1:numRowsInChunk]
|
|
||||||
}
|
|
||||||
return(dataList)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Create a function to add lag features a set of columns at a time.
|
# Create a function to add lag features a set of columns at a time.
|
||||||
addLag <- function(inputData, outputFileBase) {
|
addLag <- function(inputData, outputFileBase) {
|
||||||
|
|
||||||
inputFile <- inputData
|
inputFile <- inputData
|
||||||
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
|
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
|
||||||
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
|
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
|
||||||
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week",".xdf",sep="")
|
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
|
||||||
|
sep="")
|
||||||
|
|
||||||
# Initialize some fix values.
|
# Initialize some fix values.
|
||||||
hourInterval <- 1
|
hourInterval <- 1
|
||||||
dayInterval <- 24
|
dayInterval <- 24
|
||||||
weekInterval <- 168
|
weekInterval <- 168
|
||||||
previous <- 12
|
previous <- 12
|
||||||
|
|
||||||
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
|
# Add number of bikes that were rented in each of the previous 12 hours
|
||||||
rxDataStep(inData = inputFile,outFile = outputFileHour, transformFunc=computeLagFeatures,
|
# (for Set B).
|
||||||
transformObjects = list(baseVar = "cnt", unit = "hour",nLagsVector=seq(12),
|
rxDataStep(inData = inputFile, outFile = outputFileHour,
|
||||||
interval = hourInterval),
|
transformFunc=computeLagFeatures,
|
||||||
transformVars = c("cnt"), overwrite=TRUE)
|
transformObjects = list(baseVar = "cnt", unit = "hour",
|
||||||
|
nLagsVector=seq(12),
|
||||||
|
interval = hourInterval),
|
||||||
|
transformVars = c("cnt"), overwrite=TRUE)
|
||||||
|
|
||||||
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
|
# Add number of bikes that were rented in each of the previous 12 days
|
||||||
rxDataStep(inData = outputFileHour,outFile = outputFileHourDay, transformFunc=computeLagFeatures,
|
# at the same hour (for Set C).
|
||||||
transformObjects = list(baseVar = "cnt", unit = "day",nLagsVector=seq(12),
|
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
|
||||||
interval = dayInterval),
|
transformFunc=computeLagFeatures,
|
||||||
transformVars = c("cnt"), overwrite=TRUE)
|
transformObjects = list(baseVar = "cnt",
|
||||||
|
unit = "day",
|
||||||
|
nLagsVector=seq(12),
|
||||||
|
interval = dayInterval),
|
||||||
|
transformVars = c("cnt"),
|
||||||
|
overwrite=TRUE)
|
||||||
|
|
||||||
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
|
# Add number of bikes that were rented in each of the previous 12 weeks
|
||||||
rxDataStep(inData = outputFileHourDay,outFile = outputFileHourDayWeek, transformFunc=computeLagFeatures,
|
# at the same hour and the same day (for Set D).
|
||||||
transformObjects = list(baseVar = "cnt", unit = "week",nLagsVector=seq(12),
|
rxDataStep(inData = outputFileHourDay,
|
||||||
interval = weekInterval),
|
outFile = outputFileHourDayWeek,
|
||||||
transformVars = c("cnt"), overwrite=TRUE)
|
transformFunc=computeLagFeatures,
|
||||||
|
transformObjects = list(baseVar = "cnt",
|
||||||
|
unit = "week",
|
||||||
|
nLagsVector=seq(12),
|
||||||
|
interval = weekInterval),
|
||||||
|
transformVars = c("cnt"), overwrite=TRUE)
|
||||||
|
|
||||||
file.remove(outputFileHour)
|
file.remove(outputFileHour)
|
||||||
file.remove(outputFileHourDay)
|
file.remove(outputFileHourDay)
|
||||||
|
return(outputFileHourDayWeek)
|
||||||
return(outputFileHourDayWeek)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Set A = weather + holiday + weekday + weekend features for the predicted day.
|
# Set A = weather + holiday + weekday + weekend features for
|
||||||
|
# the predicted day.
|
||||||
finalDataA_mrs <- editData_mrs
|
finalDataA_mrs <- editData_mrs
|
||||||
|
|
||||||
# Set B, C & D.
|
# Set B, C & D.
|
||||||
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
|
finalDataLag_dir <- addLag(inputData = editData_mrs,
|
||||||
|
outputFileBase = outFileLag)
|
||||||
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
|
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
|
||||||
|
|
||||||
|
|
||||||
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
|
# Step 3: Prepare Training and Test Datasets
|
||||||
|
|
||||||
## Set A:
|
## Set A:
|
||||||
# Split Data.
|
# Split Data.
|
||||||
rxSplit(inData = finalDataA_mrs,
|
rxSplit(inData = finalDataA_mrs,
|
||||||
|
@ -168,41 +202,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
|
||||||
|
|
||||||
## Set B, C & D:
|
## Set B, C & D:
|
||||||
# Split Data.
|
# Split Data.
|
||||||
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
|
rxSplit(inData = finalDataLag_mrs,
|
||||||
|
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
|
||||||
overwrite = TRUE, reportProgress = 0, verbose = 0)
|
overwrite = TRUE, reportProgress = 0, verbose = 0)
|
||||||
# Point to the .xdf files for the training and test set.
|
# Point to the .xdf files for the training and test set.
|
||||||
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
|
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
|
||||||
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
|
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
|
||||||
|
|
||||||
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
|
|
||||||
|
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
|
||||||
|
|
||||||
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
|
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
|
||||||
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
|
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
|
||||||
|
|
||||||
## Set A:
|
## Set A:
|
||||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
# Build a formula for the regression model and remove the "yr",
|
||||||
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
|
# which is used to split the training and test data.
|
||||||
|
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
|
||||||
|
varsToDrop = c("RowNum", "yr"))
|
||||||
# Fit Decision Forest Regression model.
|
# Fit Decision Forest Regression model.
|
||||||
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs, importance = TRUE, seed = 123)
|
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
|
||||||
|
importance = TRUE, seed = 123)
|
||||||
|
|
||||||
## Set B:
|
## Set B:
|
||||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set C and D.
|
# Build a formula for the regression model and remove the "yr",
|
||||||
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
|
# which is used to split the training and test data, and lag features
|
||||||
|
# for Set C and D.
|
||||||
|
formB_mrs <- formula(train_mrs, depVars = "cnt",
|
||||||
|
varsToDrop = c("RowNum", "yr",
|
||||||
|
newDayFeatures,
|
||||||
|
newWeekFeatures))
|
||||||
# Fit Decision Forest Regression model.
|
# Fit Decision Forest Regression model.
|
||||||
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs, importance = TRUE, seed = 123)
|
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
|
||||||
|
importance = TRUE, seed = 123)
|
||||||
|
|
||||||
## Set C:
|
## Set C:
|
||||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set D.
|
# Build a formula for the regression model and remove the "yr",
|
||||||
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
|
# which is used to split the training and test data, and lag features
|
||||||
|
# for Set D.
|
||||||
|
formC_mrs <- formula(train_mrs, depVars = "cnt",
|
||||||
|
varsToDrop = c("RowNum", "yr", newWeekFeatures))
|
||||||
# Fit Decision Forest Regression model.
|
# Fit Decision Forest Regression model.
|
||||||
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs, importance = TRUE, seed = 123)
|
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
|
||||||
|
importance = TRUE, seed = 123)
|
||||||
|
|
||||||
## Set D:
|
## Set D:
|
||||||
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
|
# Build a formula for the regression model and remove the "yr",
|
||||||
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
|
# which is used to split the training and test data.
|
||||||
|
formD_mrs <- formula(train_mrs, depVars = "cnt",
|
||||||
|
varsToDrop = c("RowNum", "yr"))
|
||||||
# Fit Decision Forest Regression model.
|
# Fit Decision Forest Regression model.
|
||||||
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs, importance = TRUE, seed = 123)
|
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
|
||||||
|
importance = TRUE, seed = 123)
|
||||||
|
|
||||||
|
|
||||||
|
# Step 5: Predict over new data
|
||||||
|
|
||||||
#---------------------------Step 5: Predict over new data---------------------------
|
|
||||||
## Set A:
|
## Set A:
|
||||||
# Predict the probability on the test dataset.
|
# Predict the probability on the test dataset.
|
||||||
rxPredict(dForestA_mrs, data = testA_mrs,
|
rxPredict(dForestA_mrs, data = testA_mrs,
|
||||||
|
@ -231,17 +286,22 @@ rxPredict(dForestD_mrs, data = test_mrs,
|
||||||
residVarNames = "cnt_Resid_D",
|
residVarNames = "cnt_Resid_D",
|
||||||
overwrite = TRUE, computeResiduals = TRUE)
|
overwrite = TRUE, computeResiduals = TRUE)
|
||||||
|
|
||||||
#---------------------------Prepare outputs---------------------------
|
|
||||||
|
# Prepare outputs
|
||||||
## Set A:
|
## Set A:
|
||||||
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
# Calculate three statistical measures: Mean Absolute Error (MAE),
|
||||||
sumA <- rxSummary(~ cnt_Resid_A_abs+cnt_Resid_A_2+cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
|
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
|
||||||
|
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
|
||||||
|
data = testA_mrs, summaryStats = "Mean",
|
||||||
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
|
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
|
||||||
cnt_Resid_A_2 = cnt_Resid_A^2,
|
cnt_Resid_A_2 = cnt_Resid_A^2,
|
||||||
cnt_rel_A = abs(cnt_Resid_A)/cnt)
|
cnt_rel_A = abs(cnt_Resid_A)/cnt)
|
||||||
)$sDataFrame
|
)$sDataFrame
|
||||||
|
|
||||||
## Set B, C & D:
|
## Set B, C & D:
|
||||||
sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_Resid_C_2+cnt_rel_C+cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
|
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
|
||||||
|
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
|
||||||
|
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
|
||||||
data = test_mrs, summaryStats = "Mean",
|
data = test_mrs, summaryStats = "Mean",
|
||||||
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
|
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
|
||||||
cnt_Resid_B_2 = cnt_Resid_B^2,
|
cnt_Resid_B_2 = cnt_Resid_B^2,
|
||||||
|
@ -252,7 +312,7 @@ sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_R
|
||||||
cnt_Resid_D_abs = abs(cnt_Resid_D),
|
cnt_Resid_D_abs = abs(cnt_Resid_D),
|
||||||
cnt_Resid_D_2 = cnt_Resid_D^2,
|
cnt_Resid_D_2 = cnt_Resid_D^2,
|
||||||
cnt_rel_D = abs(cnt_Resid_D)/cnt)
|
cnt_rel_D = abs(cnt_Resid_D)/cnt)
|
||||||
)$sDataFrame
|
)$sDataFrame
|
||||||
|
|
||||||
# Add row names.
|
# Add row names.
|
||||||
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
|
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
|
||||||
|
@ -263,7 +323,10 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
|
||||||
# List all measures in a data frame.
|
# List all measures in a data frame.
|
||||||
measures <- data.frame(Features = features,
|
measures <- data.frame(Features = features,
|
||||||
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
|
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
|
||||||
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
|
RMSE = c(sqrt(sumA[2, 2]),
|
||||||
|
sqrt(sum[2, 2]),
|
||||||
|
sqrt(sum[5, 2]),
|
||||||
|
sqrt(sum[8, 2])),
|
||||||
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
|
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
|
||||||
|
|
||||||
# Review the measures.
|
# Review the measures.
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
# Install a package if it's not already installed.
|
# This script shows how to import data into R that is referenced by a URL.
|
||||||
|
|
||||||
|
# Install the RCurl package if it's not already installed.
|
||||||
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))
|
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))
|
||||||
|
|
||||||
# Load packages.
|
# Load packages.
|
||||||
|
|
|
@ -3,16 +3,17 @@
|
||||||
|
|
||||||
# Check whether the "RevoScaleR" package is loaded in the current environment.
|
# Check whether the "RevoScaleR" package is loaded in the current environment.
|
||||||
if (!require("RevoScaleR")) {
|
if (!require("RevoScaleR")) {
|
||||||
cat("RevoScaleR package does not seem to exist.
|
cat("RevoScaleR package does not seem to exist.
|
||||||
\nThis means that the functions starting with 'rx' will not run.
|
\nThis means that the functions starting with 'rx' will not run.
|
||||||
\nIf you have Microsoft R Server installed, please switch the R engine.
|
\nIf you have Microsoft R Server installed,
|
||||||
\nFor example, in R Tools for Visual Studio:
|
\nplease switch the R engine.
|
||||||
\nR Tools -> Options -> R Engine.
|
\nFor example, in R Tools for Visual Studio:
|
||||||
\nIf Microsoft R Server is not installed, you can download it from:
|
\nR Tools -> Options -> R Engine.
|
||||||
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
\nIf Microsoft R Server is not installed, you can download it from:
|
||||||
\n")
|
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
|
||||||
|
\n")
|
||||||
|
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# A URL contains the raw data.
|
# A URL contains the raw data.
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
# ----------------------------------------------------------------------------
|
# This sample gives an introduction to plotting using the ggplot2 package.
|
||||||
# purpose: introduction to plotting using the ggplot2 package
|
|
||||||
# audience: you are expected to have some prior experience with R
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# R has a number of built-in datasets
|
# The ggplot2 package is tremendously popular because it allows you to create
|
||||||
# In this example you use the dataset called quakes
|
# beautiful plots by describing the plot structure
|
||||||
# This data contains locations of earthquakes off Fiji
|
|
||||||
|
# R has a number of built-in datasets.
|
||||||
|
# In this example you use the dataset called quakes.
|
||||||
|
# This data contains locations of earthquakes off Fiji.
|
||||||
|
|
||||||
# Read the help page for more information
|
# Read the help page for more information
|
||||||
?quakes
|
?quakes
|
||||||
|
@ -24,71 +24,55 @@ if (!require("mapproj"))
|
||||||
install.packages("mapproj") # required for map projections
|
install.packages("mapproj") # required for map projections
|
||||||
library("mapproj")
|
library("mapproj")
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
# Starting to use the ggplot2 package
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# The ggplot2 package is tremendously popular because it allows you to create
|
|
||||||
# beautiful plots by describing the plot structure
|
|
||||||
|
|
||||||
|
|
||||||
# Plot longitude and latitude of quakes
|
# Plot longitude and latitude of quakes
|
||||||
# To create a plot, you have to specify the data, then map aesthetics to
|
# To create a plot, you have to specify the data, then map aesthetics to
|
||||||
# columns in your data. In this example, you map the column long to the x-axis
|
# columns in your data. In this example, you map the column long to the x-axis
|
||||||
# and lat to the y-axis.
|
# and lat to the y-axis.
|
||||||
# Then you add a layer with points (geom_point) and a layer to plot maps
|
# Then you add a layer with points (geom_point) and a layer to plot maps.
|
||||||
|
|
||||||
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point() +
|
geom_point() +
|
||||||
coord_map()
|
coord_map()
|
||||||
p0
|
p0
|
||||||
|
|
||||||
# You can use a number of different aesthetics, for example colour or size
|
# You can use a number of different aesthetics, for example colour or size
|
||||||
# of the points
|
# of the points.
|
||||||
|
|
||||||
# Map the depth column to the colour aesthetic
|
# Map the depth column to the colour aesthetic
|
||||||
|
|
||||||
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth)) +
|
geom_point(aes(colour = depth)) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p1
|
p1
|
||||||
|
|
||||||
# Add size for magnitude. The bigger the magnitude, the larger the point
|
# Add size for magnitude. The bigger the magnitude, the larger the point
|
||||||
|
|
||||||
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag)) +
|
geom_point(aes(colour = depth, size = mag)) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p2
|
p2
|
||||||
|
|
||||||
# You can control the transparancy of a plot object with the alpha aesthetic
|
# You can control the transparancy of a plot object with the alpha aesthetic
|
||||||
# High values of alpha (close to 1) are opaque, while low values (close to 0)
|
# High values of alpha (close to 1) are opaque, while low values (close to 0)
|
||||||
# are translucent
|
# are translucent
|
||||||
# Add alpha level to hide overplotting, thus revealing detail
|
# Add alpha level to hide overplotting, thus revealing detail
|
||||||
|
|
||||||
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
coord_map()
|
coord_map()
|
||||||
p3
|
p3
|
||||||
|
|
||||||
# Change colour gradient by adding a gradient scale
|
# Change colour gradient by adding a gradient scale
|
||||||
|
|
||||||
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
coord_map() +
|
coord_map() +
|
||||||
scale_colour_gradient(low = "blue", high = "red")
|
scale_colour_gradient(low = "blue", high = "red")
|
||||||
p4
|
p4
|
||||||
|
|
||||||
# Add a plot title
|
# Add a plot title
|
||||||
|
|
||||||
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
|
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
|
||||||
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
|
||||||
scale_colour_gradient(low = "blue", high = "red") +
|
scale_colour_gradient(low = "blue", high = "red") +
|
||||||
ggtitle("Distribution of earthquakes near Fiji") +
|
ggtitle("Distribution of earthquakes near Fiji") +
|
||||||
coord_map()
|
coord_map()
|
||||||
|
|
||||||
p5
|
p5
|
||||||
|
|
||||||
|
|
||||||
# Now plot multiple plots on the same graphic
|
# Now plot multiple plots on the same graphic
|
||||||
# The package "grid" is built into R and allows you to take control of the
|
# The package "grid" is built into R and allows you to take control of the
|
||||||
# plotting area. A grob is the abbreviation for "graphical object", and the
|
# plotting area. A grob is the abbreviation for "graphical object", and the
|
||||||
|
@ -100,38 +84,35 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
|
||||||
library(grid)
|
library(grid)
|
||||||
plot.new()
|
plot.new()
|
||||||
grid.draw(cbind(
|
grid.draw(cbind(
|
||||||
ggplotGrob(p1),
|
ggplotGrob(p1),
|
||||||
ggplotGrob(p2),
|
ggplotGrob(p2),
|
||||||
ggplotGrob(p3),
|
ggplotGrob(p3),
|
||||||
size = "last"
|
size = "last"
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# The package "gtable" allows you to work with objects called grob tables.
|
# The package "gtable" allows you to work with objects called grob tables.
|
||||||
# A grob table captures all the information needed to layout grobs in a table
|
# A grob table captures all the information needed to layout grobs in a table
|
||||||
# structure. It supports row and column spanning, offers some tools to
|
# structure. It supports row and column spanning, offers some tools to
|
||||||
# automatically figure out the correct dimensions, and makes it easy to align
|
# automatically figure out the correct dimensions, and makes it easy to align
|
||||||
# and combine multiple tables.
|
# and combine multiple tables.
|
||||||
|
if (!require("gtable"))
|
||||||
if (!require("gtable")) install.packages("gtable")
|
install.packages("gtable")
|
||||||
library(gtable)
|
library(gtable)
|
||||||
|
|
||||||
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
|
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
|
||||||
geom_point(alpha = 0.5) +
|
geom_point(alpha = 0.5) +
|
||||||
ggtitle("Top view")
|
ggtitle("Top view")
|
||||||
|
|
||||||
plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
|
plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
|
||||||
geom_point(alpha = 0.5) +
|
geom_point(alpha = 0.5) +
|
||||||
ggtitle("Side view")
|
ggtitle("Side view")
|
||||||
|
|
||||||
platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
|
platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
|
||||||
geom_point(alpha = 0.5) +
|
geom_point(alpha = 0.5) +
|
||||||
ggtitle("Side view")
|
ggtitle("Side view")
|
||||||
|
|
||||||
# Next, define a gtable and add grobs to the table
|
|
||||||
|
|
||||||
|
# Next, define a gtable and add grobs to the table.
|
||||||
gt <- gtable(widths = unit(rep(1,2), "null"),
|
gt <- gtable(widths = unit(rep(1,2), "null"),
|
||||||
heights = unit(rep(1,2), "null"))
|
heights = unit(rep(1,2), "null"))
|
||||||
gt <- gtable_add_grob(gt,
|
gt <- gtable_add_grob(gt,
|
||||||
|
|
Загрузка…
Ссылка в новой задаче