This commit is contained in:
Brandon Rohrer 2016-03-15 16:06:28 -04:00
Родитель ac4919044e
Коммит 226842a54a
7 изменённых файлов: 375 добавлений и 297 удалений

Просмотреть файл

@ -89,7 +89,7 @@ library(help = foreign)
# To install a new package, use install.packages()
# Install the ggplot2 package for its plotting capability.
if (!require("ggplot2"))
install.packages("ggplot2")
install.packages("ggplot2")
# Then load the package.
library("ggplot2")
@ -146,7 +146,7 @@ theme_set(theme_gray(base_size = 18))
# In this sample you use ggplot2.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue")
geom_point(colour = "blue")
# Add a log scale.
ggplot(diamondSample, aes(x = carat, y = price)) +
@ -155,9 +155,9 @@ ggplot(diamondSample, aes(x = carat, y = price)) +
# Add a log scale for both scales.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue") +
scale_x_log10() +
scale_y_log10()
geom_point(colour = "blue") +
scale_x_log10() +
scale_y_log10()
### Linear Regression in R
@ -177,10 +177,10 @@ exp(coef(model)[1])
# Show the model in a plot.
ggplot(diamondSample, aes(x = carat, y = price)) +
geom_point(colour = "blue") +
geom_smooth(method = "lm", colour = "red", size = 2) +
scale_x_log10() +
scale_y_log10()
geom_point(colour = "blue") +
geom_smooth(method = "lm", colour = "red", size = 2) +
scale_x_log10() +
scale_y_log10()
### Regression Diagnostics

Просмотреть файл

@ -36,8 +36,8 @@ theme_set(theme_gray(base_size = 18))
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point() +
coord_map()
geom_point() +
coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
@ -45,14 +45,14 @@ p0
# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth)) +
coord_map()
geom_point(aes(colour = depth)) +
coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag)) +
coord_map()
geom_point(aes(colour = depth, size = mag)) +
coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
@ -60,24 +60,24 @@ p2
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
p3
# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
@ -89,10 +89,10 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))

Просмотреть файл

@ -1,30 +1,27 @@
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example demonstrates the feature engineering process for building a regression model to predict
# bike rental demand.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday etc.
#
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour. The lag features we add in the data set is the number of bikes that
# were rented in each of the previous 12 hours, which captures the very recent demand for the bikes.
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
#
#--------------------------------------------------------------------------------------------------------------
# Regression: Demand estimation with Microsoft R Server
# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@ -38,7 +35,7 @@ if (!require("RevoScaleR")) {
quit()
}
# Initial some variables.
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
@ -47,81 +44,115 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
#---------------------------Step 1: Import the Bike Data---------------------------
bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
# Remove timestamps and all columns that are part of the label.
# Remove timestamps and all columns that are
# part of the label.
varsToDrop = c("instant", "dteday", "casual", "registered"),
# Definite year, weather conditions and season columns as categorical.
# Define year, weather conditions and season columns
# as categorical.
colInfo = list(yr = list(type = "factor"),
weathersit = list(type = "factor"),
season = list(type = "factor")))
#---------------------------Step 2: Feature Engineering---------------------------
# Add number of bikes that were rented in each of the previous 12 hours as 12 lag features.
computeLagFeatures <- function (dataList) { # function for computing lag features.
# Step 2: Feature Engineering
# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
# lag feature names as lagN
varLagNameVector <- paste("lag", nLagsVector, sep="")
numLags <- length(nLagsVector) # total number of lags that need to be added
varLagNameVector <- paste("lag", nLagsVector, sep="") # lag feature names as lagN
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
if (!.rxIsTestChunk)
{
for (iL in 1:numLags)
{
numRowsInChunk <- length(dataList[[varName]]) # number of rows in the current chunk
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep="")
lagData <- .rxGet("storeLagData") # retrieve lag data from the previous chunk
allData <- c(lagData[[varLagName]], dataList[[varName]]) # concatenate lagData and the "cnt" feature
dataList[[varLagName]] <- allData[1:numRowsInChunk] # take the first N rows of allData, where N is the total number of rows in the original dataList
lagData[[varLagName]] <- tail(allData, nlags) # save last nlag rows as the new lagData to be used to process in the next chunk
.rxSet("storeLagData", lagData)
# Set the value of an object "storeLagData" in the transform environment.
if (!exists("storeLagData"))
{
lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
names(lagData) <- varLagNameVector
.rxSet("storeLagData",lagData)
}
}
return(dataList)
if (!.rxIsTestChunk)
{
for (iL in 1:numLags) {
# Number of rows in the current chunk.
numRowsInChunk <- length(dataList[[varName]])
nlags <- nLagsVector[iL]
varLagName <- paste("lag", nlags, sep = "")
# Retrieve lag data from the previous chunk.
lagData <- .rxGet("storeLagData")
# Concatenate lagData and the "cnt" feature.
allData <- c(lagData[[varLagName]], dataList[[varName]])
# Take the first N rows of allData, where N is the total
# number of rows in the original dataList.
dataList[[varLagName]] <- allData[1:numRowsInChunk]
# Save last nlag rows as the new lagData to be used
# to process in the next chunk.
lagData[[varLagName]] <- tail(allData, nlags)
.rxSet("storeLagData", lagData)
}
}
return(dataList)
}
# Apply the "computeLagFeatures" on the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt", nLagsVector = seq(12)),
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
transformFunc = computeLagFeatures,
transformObjects = list(varName = "cnt",
nLagsVector = seq(12)),
transformVars = "cnt", overwrite=TRUE)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0)
# Step 3: Prepare Training and Test Datasets
# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
splitByFactor = "yr", overwrite = TRUE,
reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE, seed = 123)
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
seed = 123)
# Step 5: Predict over new data and review the model performance
#---------------------------Step 5: Predict over new data and review the model performance---------------------------
# Predict the probability on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE, computeResiduals = TRUE)
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
computeResiduals = TRUE)
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs+cnt_Resid_2+cnt_rel, data = predict, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
summaryStats = "Mean",
transforms = list(cnt_Resid_abs = abs(cnt_Resid),
cnt_Resid_2 = cnt_Resid^2,
cnt_rel = abs(cnt_Resid)/cnt)
)$sDataFrame
)$sDataFrame
# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2], RMSE = sqrt(sum[2, 2]), RAE = sum[3, 2])
measures <- data.frame(MAE = sum[1, 2],
RMSE = sqrt(sum[2, 2]),
RAE = sum[3, 2])
# Review the measures.
measures

Просмотреть файл

@ -1,58 +1,61 @@
#-------------------------------------------------------------------------------------------------------------
#-------------------------- Regression: Demand estimation with Microsoft R Server ----------------------------
#-------------------------------------------------------------------------------------------------------------
#
#
# This example is a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation
# Regression: Demand estimation with Microsoft R Server
# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.
#
# The dataset contains 17,379 rows and 17 columns, each row representing the number of bike rentals within
# a specific hour of a day in the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates were categorized as holiday vs.
# weekday etc.
#
# The field to predict is "cnt", which contain a count value ranging from 1 to 977, representing the number
# of bike rentals within a specific hour.
#
# We built four models using the same algorithm, but with four different training datasets. The four training
# datasets that we constructed were all based on the same raw input data, but we added different additional
# features to each training set.
#
# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.
# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.
# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.
# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day
#
# Set C = number of bikes that were rented in each of the previous 12 days
# at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day
# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and particular day of the week.
#
# The four training datasets were built by combining the feature set as follows:
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.
# The four training datasets were built by combining the feature set
# as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D
#
# The following scripts include five basic steps of building this example using Microsoft R Server.
#
#
#--------------------------------------------------------------------------------------------------------------
# The following scripts include five basic steps of building this example
# using Microsoft R Server.
#---------------------------Step 0: Get Started-------------------------------
# ----------------------------------------------------------------------------
# Check if Microsoft R Server (RRE 8.0) is installed
# ----------------------------------------------------------------------------
# Step 0: Get Started
# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
}
@ -67,95 +70,126 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")
#---------------------------Step 1: Import Data---------------------------
# Step 1: Import Data
# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE,
bike_mrs <- rxImport(inData = inputFileBikeURL,
outFile = outFileBike, overwrite = TRUE,
missingValueString = "M", stringsAsFactors = FALSE,
varsToDrop = c("instant", "dteday", "casual", "registered"))
varsToDrop = c("instant",
"dteday",
"casual",
"registered"))
# Edit Metadata: Definite year, weather conditions and season columns as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit, sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"), overwrite = TRUE)
# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
sortLevels = TRUE,
factorInfo = c("yr", "weathersit", "season"),
overwrite = TRUE)
# Step 2: Feature Engineering
#---------------------------Step 2: Feature Engineering---------------------------
# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function (dataList) {
numLags <- length(nLagsVector) # total number of lags that need to be added
for (iL in 1:numLags)
{
nlag <- nLagsVector[iL]
varLagName <- paste("demand.",nlag,unit,sep="")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
if (numRowsToRead >= .rxStartRow)
{
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
# Total number of lags that need to be added.
numLags <- length(nLagsVector)
for (iL in 1:numLags) {
nlag <- nLagsVector[iL]
varLagName <- paste("demand.", nlag, unit, sep = "")
numRowsInChunk <- length(dataList[[baseVar]])
numRowsToRead <- nlag * interval
numRowsPadding <- 0
if (numRowsToRead >= .rxStartRow) {
numRowsToRead <- .rxStartRow - 1
numRowsPadding <- nlag * interval - numRowsToRead
}
# Determine the current row to start processing the data
# between chunks.
startRow <- .rxStartRow - numRowsToRead
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow,
numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = 1,
numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
previousRowsDataList[[baseVar]],
dataList[[baseVar]])[1:numRowsInChunk]
}
startRow <- .rxStartRow - numRowsToRead # determine the current row to start processing the data between chunks.
previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
varsToKeep = baseVar,
startRow = startRow, numRows = numRowsToRead,
returnDataFrame = FALSE)
paddingRowsDataList <- rxReadXdf(file=.rxReadFileName,
varsToKeep = baseVar,
startRow = 1, numRows = numRowsPadding,
returnDataFrame = FALSE)
dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]], previousRowsDataList[[baseVar]], dataList[[baseVar]])[1:numRowsInChunk]
}
return(dataList)
return(dataList)
}
# Create a function to add lag features a set of columns at a time.
addLag <- function(inputData, outputFileBase) {
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week",".xdf",sep="")
inputFile <- inputData
outputFileHour <- paste(outputFileBase, "_hour",".xdf",sep="")
outputFileHourDay <- paste(outputFileBase, "_hour_day",".xdf",sep="")
outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
sep="")
# Initialize some fix values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Initialize some fix values.
hourInterval <- 1
dayInterval <- 24
weekInterval <- 168
previous <- 12
# Add number of bikes that were rented in each of the previous 12 hours (for Set B).
rxDataStep(inData = inputFile,outFile = outputFileHour, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 hours
# (for Set B).
rxDataStep(inData = inputFile, outFile = outputFileHour,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "hour",
nLagsVector=seq(12),
interval = hourInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days at the same hour (for Set C).
rxDataStep(inData = outputFileHour,outFile = outputFileHourDay, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "day",nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 days
# at the same hour (for Set C).
rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "day",
nLagsVector=seq(12),
interval = dayInterval),
transformVars = c("cnt"),
overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,outFile = outputFileHourDayWeek, transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt", unit = "week",nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
# Add number of bikes that were rented in each of the previous 12 weeks
# at the same hour and the same day (for Set D).
rxDataStep(inData = outputFileHourDay,
outFile = outputFileHourDayWeek,
transformFunc=computeLagFeatures,
transformObjects = list(baseVar = "cnt",
unit = "week",
nLagsVector=seq(12),
interval = weekInterval),
transformVars = c("cnt"), overwrite=TRUE)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
file.remove(outputFileHour)
file.remove(outputFileHourDay)
return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for the predicted day.
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs, outputFileBase = outFileLag)
finalDataLag_dir <- addLag(inputData = editData_mrs,
outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)
#---------------------------Step 3: Prepare Training and Test Datasets---------------------------
# Step 3: Prepare Training and Test Datasets
## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@ -168,41 +202,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
rxSplit(inData = finalDataLag_mrs,
outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
overwrite = TRUE, reportProgress = 0, verbose = 0)
# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
#---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)---------------------------
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)
newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")
## Set A:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs, importance = TRUE, seed = 123)
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
importance = TRUE, seed = 123)
## Set B:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newDayFeatures, newWeekFeatures))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr",
newDayFeatures,
newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
importance = TRUE, seed = 123)
## Set C:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data, and lag features for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
importance = TRUE, seed = 123)
## Set D:
# Build a formula for the regression model and remove the "yr", which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt", varsToDrop = c("RowNum", "yr"))
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs, importance = TRUE, seed = 123)
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
importance = TRUE, seed = 123)
# Step 5: Predict over new data
#---------------------------Step 5: Predict over new data---------------------------
## Set A:
# Predict the probability on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@ -231,17 +286,22 @@ rxPredict(dForestD_mrs, data = test_mrs,
residVarNames = "cnt_Resid_D",
overwrite = TRUE, computeResiduals = TRUE)
#---------------------------Prepare outputs---------------------------
# Prepare outputs
## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs+cnt_Resid_A_2+cnt_rel_A, data = testA_mrs, summaryStats = "Mean",
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary( ~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
data = testA_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
cnt_Resid_A_2 = cnt_Resid_A^2,
cnt_rel_A = abs(cnt_Resid_A)/cnt)
)$sDataFrame
)$sDataFrame
## Set B, C & D:
sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_Resid_C_2+cnt_rel_C+cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
sum <- rxSummary( ~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
cnt_Resid_D_abs+cnt_Resid_D_2+cnt_rel_D,
data = test_mrs, summaryStats = "Mean",
transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
cnt_Resid_B_2 = cnt_Resid_B^2,
@ -252,7 +312,7 @@ sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_R
cnt_Resid_D_abs = abs(cnt_Resid_D),
cnt_Resid_D_2 = cnt_Resid_D^2,
cnt_rel_D = abs(cnt_Resid_D)/cnt)
)$sDataFrame
)$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@ -263,7 +323,10 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
RMSE = c(sqrt(sumA[2, 2]), sqrt(sum[2, 2]), sqrt(sum[5, 2]), sqrt(sum[8, 2])),
RMSE = c(sqrt(sumA[2, 2]),
sqrt(sum[2, 2]),
sqrt(sum[5, 2]),
sqrt(sum[8, 2])),
RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))
# Review the measures.

Просмотреть файл

@ -1,4 +1,6 @@
# Install a package if it's not already installed.
# This script shows how to import data into R that is referenced by a URL.
# Install the RCurl package if it's not already installed.
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))
# Load packages.

Просмотреть файл

@@ -3,16 +3,17 @@
# Check whether the "RevoScaleR" package is loaded in the current environment.
if (!require("RevoScaleR")) {
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed, please switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
quit()
quit()
}
# A URL contains the raw data.

Просмотреть файл

@@ -1,11 +1,11 @@
# ----------------------------------------------------------------------------
# purpose: introduction to plotting using the ggplot2 package
# audience: you are expected to have some prior experience with R
# ----------------------------------------------------------------------------
# This sample gives an introduction to plotting using the ggplot2 package.
# R has a number of built-in datasets
# In this example you use the dataset called quakes
# This data contains locations of earthquakes off Fiji
# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure
# R has a number of built-in datasets.
# In this example you use the dataset called quakes.
# This data contains locations of earthquakes off Fiji.
# Read the help page for more information
# Open the documentation page for the built-in quakes dataset.
help("quakes")
@@ -24,71 +24,55 @@ if (!require("mapproj"))
install.packages("mapproj") # required for map projections
library("mapproj")
# ----------------------------------------------------------------------------
# Starting to use the ggplot2 package
# ----------------------------------------------------------------------------
# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure
# Plot longitude and latitude of quakes
# To create a plot, you have to specify the data, then map aesthetics to
# columns in your data. In this example, you map the column long to the x-axis
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point() +
coord_map()
geom_point() +
coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
# of the points
# of the points.
# Map the depth column to the colour aesthetic
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth)) +
coord_map()
geom_point(aes(colour = depth)) +
coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag)) +
coord_map()
geom_point(aes(colour = depth, size = mag)) +
coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
# High values of alpha (close to 1) are opaque, while low values (close to 0)
# are translucent
# Add alpha level to hide overplotting, thus revealing detail
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map()
p3
# Change colour gradient by adding a gradient scale
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
coord_map() +
scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
scale_colour_gradient(low = "blue", high = "red") +
ggtitle("Distribution of earthquakes near Fiji") +
coord_map()
p5
# Now plot multiple plots on the same graphic
# The package "grid" is built into R and allows you to take control of the
# plotting area. A grob is the abbreviation for "graphical object", and the
@@ -100,38 +84,35 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
ggplotGrob(p1),
ggplotGrob(p2),
ggplotGrob(p3),
size = "last"
))
# ----------------------------------------------------------------------------
# The package "gtable" allows you to work with objects called grob tables.
# A grob table captures all the information needed to layout grobs in a table
# structure. It supports row and column spanning, offers some tools to
# automatically figure out the correct dimensions, and makes it easy to align
# and combine multiple tables.
if (!require("gtable")) install.packages("gtable")
if (!require("gtable"))
install.packages("gtable")
library(gtable)
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Top view")
plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Top view")
plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Side view")
geom_point(alpha = 0.5) +
ggtitle("Side view")
platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
geom_point(alpha = 0.5) +
ggtitle("Side view")
# Next, define a gtable and add grobs to the table
geom_point(alpha = 0.5) +
ggtitle("Side view")
# Next, define a gtable and add grobs to the table.
gt <- gtable(widths = unit(rep(1,2), "null"),
heights = unit(rep(1,2), "null"))
gt <- gtable_add_grob(gt,