This commit is contained in:
Brandon Rohrer 2016-03-15 16:06:28 -04:00
Parent ac4919044e
Commit 226842a54a
7 changed files with 375 additions and 297 deletions

View file

@@ -89,7 +89,7 @@ library(help = foreign)
# To install a new package, use install.packages()
# Install the ggplot2 package for its plotting capability.
if (!require("ggplot2"))
    install.packages("ggplot2")
# Then load the package.
library("ggplot2")
@@ -146,7 +146,7 @@ theme_set(theme_gray(base_size = 18))
# In this sample you use ggplot2.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue")
# Add a log scale.
ggplot(diamondSample, aes(x = carat, y = price)) +
@@ -155,9 +155,9 @@ ggplot(diamondSample, aes(x = carat, y = price)) +
# Add a log scale for both scales.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue") +
    scale_x_log10() +
    scale_y_log10()
### Linear Regression in R
@@ -177,10 +177,10 @@ exp(coef(model)[1])
# Show the model in a plot.
ggplot(diamondSample, aes(x = carat, y = price)) +
    geom_point(colour = "blue") +
    geom_smooth(method = "lm", colour = "red", size = 2) +
    scale_x_log10() +
    scale_y_log10()
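For reference, the exp(coef(model)[1]) in the hunk header above implies the
model was fit on log scales. A minimal sketch of such a fit, assuming
diamondSample is a random sample of ggplot2's diamonds dataset (the sampling
line is an assumption; its construction is outside this hunk):

# Fit a linear model on the log-log scale, matching the plot above.
set.seed(123)
diamondSample <- diamonds[sample(nrow(diamonds), 5000), ]  # assumed setup
model <- lm(log(price) ~ log(carat), data = diamondSample)
# The intercept is on the log scale; exp() converts it back to price units.
exp(coef(model)[1])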
### Regression Diagnostics

View file

@@ -36,8 +36,8 @@ theme_set(theme_gray(base_size = 18))
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point() +
    coord_map()
p0
# You can use a number of different aesthetics, for example colour or size
@@ -45,14 +45,14 @@ p0
# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth)) +
    coord_map()
p1
# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag)) +
    coord_map()
p2
# You can control the transparency of a plot object with the alpha aesthetic.
@@ -60,24 +60,24 @@ p2
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map()
p3
# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map() +
    scale_colour_gradient(low = "blue", high = "red")
p4
# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    scale_colour_gradient(low = "blue", high = "red") +
    ggtitle("Distribution of earthquakes near Fiji") +
    coord_map()
p5
# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
@@ -89,10 +89,10 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
    ggplotGrob(p1),
    ggplotGrob(p2),
    ggplotGrob(p3),
    size = "last"
))
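To keep the combined panel, the same grid code can be drawn to a file
device. A short sketch; the filename and dimensions are illustrative:

# Save the three-panel figure produced above to a PNG file.
png("quakes_panels.png", width = 1200, height = 400)
plot.new()
grid.draw(cbind(ggplotGrob(p1), ggplotGrob(p2), ggplotGrob(p3), size = "last"))
dev.off()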

View file

@@ -1,30 +1,27 @@
# Regression: Demand estimation with Microsoft R Server

# This example demonstrates the feature engineering process for building
# a regression model to predict bike rental demand.

# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a specific hour of a day in the years
# 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and
# the dates were categorized as holiday vs. weekday, etc.

# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a specific hour. The lag features we add in the data set are the number
# of bikes that were rented in each of the previous 12 hours.
# This captures the very recent demand for the bikes.

# The following scripts include five basic steps of building this example
# using Microsoft R Server.

# Step 0: Get Started

# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
@@ -38,7 +35,7 @@ if (!require("RevoScaleR")) {
    quit()
}
# Initialize some variables.
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/"
inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv")
@@ -47,81 +44,115 @@ td <- tempdir()
outFileBike <- paste0(td, "/bike.xdf")
outFileLag <- paste0(td, "/lagData.xdf")
# Step 1: Import the Bike Data
bike <- rxImport(inData = inputFileBikeURL,
                 outFile = outFileBike, overwrite = TRUE,
                 missingValueString = "M", stringsAsFactors = FALSE,
                 # Remove timestamps and all columns that are
                 # part of the label.
                 varsToDrop = c("instant", "dteday", "casual", "registered"),
                 # Define year, weather conditions and season columns
                 # as categorical.
                 colInfo = list(yr = list(type = "factor"),
                                weathersit = list(type = "factor"),
                                season = list(type = "factor")))
# Step 2: Feature Engineering

# Add the number of bikes that were rented in each of the previous
# 12 hours as 12 lag features.
computeLagFeatures <- function(dataList) {
    # Total number of lags that need to be added.
    numLags <- length(nLagsVector)
    # Lag feature names, as lagN.
    varLagNameVector <- paste("lag", nLagsVector, sep = "")

    # Set the value of an object "storeLagData" in the transform environment.
    if (!exists("storeLagData")) {
        lagData <- mapply(rep, dataList[[varName]][1], times = nLagsVector)
        names(lagData) <- varLagNameVector
        .rxSet("storeLagData", lagData)
    }
    if (!.rxIsTestChunk) {
        for (iL in 1:numLags) {
            # Number of rows in the current chunk.
            numRowsInChunk <- length(dataList[[varName]])
            nlags <- nLagsVector[iL]
            varLagName <- paste("lag", nlags, sep = "")
            # Retrieve lag data from the previous chunk.
            lagData <- .rxGet("storeLagData")
            # Concatenate lagData and the "cnt" feature.
            allData <- c(lagData[[varLagName]], dataList[[varName]])
            # Take the first N rows of allData, where N is the total
            # number of rows in the original dataList.
            dataList[[varLagName]] <- allData[1:numRowsInChunk]
            # Save the last nlags rows as the new lagData, to be used
            # when processing the next chunk.
            lagData[[varLagName]] <- tail(allData, nlags)
            .rxSet("storeLagData", lagData)
        }
    }
    return(dataList)
}
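Before applying the transform, here is the lag logic on a toy vector in
plain R, assuming a single chunk. Row 1 is padded with the first
observation, just as computeLagFeatures initializes storeLagData:

# lag1 holds the previous hour's count; the first row repeats cnt[1].
cnt <- c(16, 40, 32, 13, 1)
lag1 <- c(cnt[1], head(cnt, -1))
data.frame(cnt, lag1)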
# Apply the "computeLagFeatures" on the bike data.
lagData <- rxDataStep(inData = bike, outFile = outFileLag,
                      transformFunc = computeLagFeatures,
                      transformObjects = list(varName = "cnt",
                                              nLagsVector = seq(12)),
                      transformVars = "cnt", overwrite = TRUE)
# Step 3: Prepare Training and Test Datasets

# Split data by "yr" so that the training data contains records
# for the year 2011 and the test data contains records for 2012.
rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"),
        splitByFactor = "yr", overwrite = TRUE,
        reportProgress = 0, verbose = 0)

# Point to the .xdf files for the training and test set.
train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf"))
test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf"))
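A quick sanity check, not part of the script, that the split wrote the two
.xdf files referenced above:

# List the files rxSplit created in the temporary directory.
list.files(td, pattern = "^modelData")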
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)

# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
modelFormula <- formula(train, depVars = "cnt",
                        varsToDrop = c("RowNum", "yr"))

# Fit a Decision Forest Regression model on the training data.
dForest <- rxDForest(modelFormula, data = train, importance = TRUE,
                     seed = 123)
# Step 5: Predict over new data and review the model performance

# Predict on the test dataset.
predict <- rxPredict(dForest, data = test, overwrite = TRUE,
                     computeResiduals = TRUE)

# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sum <- rxSummary(~ cnt_Resid_abs + cnt_Resid_2 + cnt_rel, data = predict,
                 summaryStats = "Mean",
                 transforms = list(cnt_Resid_abs = abs(cnt_Resid),
                                   cnt_Resid_2 = cnt_Resid^2,
                                   cnt_rel = abs(cnt_Resid)/cnt)
                 )$sDataFrame

# List all measures in a data frame.
measures <- data.frame(MAE = sum[1, 2],
                       RMSE = sqrt(sum[2, 2]),
                       RAE = sum[3, 2])

# Review the measures.
measures
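Written out directly, the three measures as this script defines them,
computed on toy vectors of actuals and predictions (plain R, for reference):

# MAE, RMSE, and RAE without RevoScaleR; resid mirrors cnt_Resid above.
actual <- c(100, 150, 80)
predicted <- c(90, 160, 85)
resid <- actual - predicted
c(MAE = mean(abs(resid)),
  RMSE = sqrt(mean(resid^2)),
  RAE = mean(abs(resid) / actual))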

View file

@@ -1,58 +1,61 @@
# Regression: Demand estimation with Microsoft R Server

# This example is a replication of an existing Azure Machine Learning
# Experiment - Regression: Demand Estimation
# https://gallery.cortanaanalytics.com/Experiment/Regression-Demand-estimation-4.

# The dataset contains 17,379 rows and 17 columns, each row representing
# the number of bike rentals within a given hour of a given day in
# the years 2011 or 2012. Weather conditions (such as temperature, humidity,
# and wind speed) were included in this raw feature set, and the dates
# were categorized as holiday vs. weekday, etc.

# The field to predict is "cnt", which contains a count value ranging
# from 1 to 977, representing the number of bike rentals within
# a given hour.

# We built four models using the same algorithm, but with four different
# training datasets. The four training datasets that we constructed
# were all based on the same raw input data, but we added different
# additional features to each training set.

# Set A = weather + holiday + weekday + weekend features for the predicted day
# Set B = number of bikes that were rented in each of the previous 12 hours
# Set C = number of bikes that were rented in each of the previous 12 days
#         at the same hour
# Set D = number of bikes that were rented in each of the previous 12 weeks
#         at the same hour and the same day

# Each of these feature sets captures different aspects of the problem:
# Feature set B captures very recent demand for the bikes.
# Feature set C captures the demand for bikes at a particular hour.
# Feature set D captures demand for bikes at a particular hour and
# particular day of the week.

# The four training datasets were built by combining the feature sets
# as follows:
# Training set 1: feature set A only
# Training set 2: feature sets A+B
# Training set 3: feature sets A+B+C
# Training set 4: feature sets A+B+C+D

# The following scripts include five basic steps of building this example
# using Microsoft R Server.

# Step 0: Get Started

# Check whether Microsoft R Server (RRE 8.0) is installed
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
    quit()
}
@@ -67,95 +70,126 @@ outFileBike <- paste0(td, "/bike.xdf")
outFileEdit <- paste0(td, "/editData.xdf")
outFileLag <- paste0(td, "/lagData")

# Step 1: Import Data

# Import the bike data.
# Remove timestamps and all columns that are part of the label.
bike_mrs <- rxImport(inData = inputFileBikeURL,
                     outFile = outFileBike, overwrite = TRUE,
                     missingValueString = "M", stringsAsFactors = FALSE,
                     varsToDrop = c("instant",
                                    "dteday",
                                    "casual",
                                    "registered"))

# Edit Metadata: Define year, weather conditions and season columns
# as categorical.
editData_mrs <- rxFactors(inData = bike_mrs, outFile = outFileEdit,
                          sortLevels = TRUE,
                          factorInfo = c("yr", "weathersit", "season"),
                          overwrite = TRUE)
# Step 2: Feature Engineering

# Create a function to construct lag features for four different aspects.
computeLagFeatures <- function(dataList) {
    # Total number of lags that need to be added.
    numLags <- length(nLagsVector)
    for (iL in 1:numLags) {
        nlag <- nLagsVector[iL]
        varLagName <- paste("demand.", nlag, unit, sep = "")
        numRowsInChunk <- length(dataList[[baseVar]])
        numRowsToRead <- nlag * interval
        numRowsPadding <- 0
        if (numRowsToRead >= .rxStartRow) {
            numRowsToRead <- .rxStartRow - 1
            numRowsPadding <- nlag * interval - numRowsToRead
        }
        # Determine the current row to start processing the data
        # between chunks.
        startRow <- .rxStartRow - numRowsToRead
        previousRowsDataList <- rxReadXdf(file = .rxReadFileName,
                                          varsToKeep = baseVar,
                                          startRow = startRow,
                                          numRows = numRowsToRead,
                                          returnDataFrame = FALSE)
        paddingRowsDataList <- rxReadXdf(file = .rxReadFileName,
                                         varsToKeep = baseVar,
                                         startRow = 1,
                                         numRows = numRowsPadding,
                                         returnDataFrame = FALSE)
        dataList[[varLagName]] <- c(paddingRowsDataList[[baseVar]],
                                    previousRowsDataList[[baseVar]],
                                    dataList[[baseVar]])[1:numRowsInChunk]
    }
    return(dataList)
}
# Create a function to add lag features, one set of columns at a time.
addLag <- function(inputData, outputFileBase) {
    inputFile <- inputData
    outputFileHour <- paste(outputFileBase, "_hour", ".xdf", sep = "")
    outputFileHourDay <- paste(outputFileBase, "_hour_day", ".xdf", sep = "")
    outputFileHourDayWeek <- paste(outputFileBase, "_hour_day_week", ".xdf",
                                   sep = "")

    # Initialize some fixed values.
    hourInterval <- 1
    dayInterval <- 24
    weekInterval <- 168
    previous <- 12

    # Add the number of bikes that were rented in each of the previous
    # 12 hours (for Set B).
    rxDataStep(inData = inputFile, outFile = outputFileHour,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt", unit = "hour",
                                       nLagsVector = seq(12),
                                       interval = hourInterval),
               transformVars = c("cnt"), overwrite = TRUE)

    # Add the number of bikes that were rented in each of the previous
    # 12 days at the same hour (for Set C).
    rxDataStep(inData = outputFileHour, outFile = outputFileHourDay,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt",
                                       unit = "day",
                                       nLagsVector = seq(12),
                                       interval = dayInterval),
               transformVars = c("cnt"),
               overwrite = TRUE)

    # Add the number of bikes that were rented in each of the previous
    # 12 weeks at the same hour and the same day (for Set D).
    rxDataStep(inData = outputFileHourDay,
               outFile = outputFileHourDayWeek,
               transformFunc = computeLagFeatures,
               transformObjects = list(baseVar = "cnt",
                                       unit = "week",
                                       nLagsVector = seq(12),
                                       interval = weekInterval),
               transformVars = c("cnt"), overwrite = TRUE)

    file.remove(outputFileHour)
    file.remove(outputFileHourDay)

    return(outputFileHourDayWeek)
}
# Set A = weather + holiday + weekday + weekend features for
# the predicted day.
finalDataA_mrs <- editData_mrs
# Set B, C & D.
finalDataLag_dir <- addLag(inputData = editData_mrs,
                           outputFileBase = outFileLag)
finalDataLag_mrs <- RxXdfData(finalDataLag_dir)

# Step 3: Prepare Training and Test Datasets

## Set A:
# Split Data.
rxSplit(inData = finalDataA_mrs,
@@ -168,41 +202,62 @@ testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf"))
## Set B, C & D:
# Split Data.
rxSplit(inData = finalDataLag_mrs,
        outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr",
        overwrite = TRUE, reportProgress = 0, verbose = 0)

# Point to the .xdf files for the training and test set.
train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf"))
test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf"))
# Step 4: Choose and apply a learning algorithm (Decision Forest Regression)

newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "")
newWeekFeatures <- paste("demand", ".", seq(12), "week", sep = "")

## Set A:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formA_mrs <- formula(trainA_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestA_mrs <- rxDForest(formA_mrs, data = trainA_mrs,
                          importance = TRUE, seed = 123)

## Set B:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set C and D.
formB_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr",
                                    newDayFeatures,
                                    newWeekFeatures))
# Fit Decision Forest Regression model.
dForestB_mrs <- rxDForest(formB_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)

## Set C:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data, and lag features
# for Set D.
formC_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr", newWeekFeatures))
# Fit Decision Forest Regression model.
dForestC_mrs <- rxDForest(formC_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)

## Set D:
# Build a formula for the regression model and remove the "yr",
# which is used to split the training and test data.
formD_mrs <- formula(train_mrs, depVars = "cnt",
                     varsToDrop = c("RowNum", "yr"))
# Fit Decision Forest Regression model.
dForestD_mrs <- rxDForest(formD_mrs, data = train_mrs,
                          importance = TRUE, seed = 123)
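To compare what each feature set contributed, the fitted forests can be
inspected for variable importance. This call is illustrative, not part of
the commit, and assumes RevoScaleR's rxVarImpPlot is available in this
RRE 8.0 install:

# Plot variable importance for the richest model (Set A+B+C+D features).
# Requires the importance = TRUE argument used in the fits above.
rxVarImpPlot(dForestD_mrs)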
# Step 5: Predict over new data

## Set A:
# Predict on the test dataset.
rxPredict(dForestA_mrs, data = testA_mrs,
@@ -231,17 +286,22 @@ rxPredict(dForestD_mrs, data = test_mrs,
          residVarNames = "cnt_Resid_D",
          overwrite = TRUE, computeResiduals = TRUE)
# Prepare outputs

## Set A:
# Calculate three statistical measures: Mean Absolute Error (MAE),
# Root Mean Squared Error (RMSE), and Relative Absolute Error (RAE).
sumA <- rxSummary(~ cnt_Resid_A_abs + cnt_Resid_A_2 + cnt_rel_A,
                  data = testA_mrs, summaryStats = "Mean",
                  transforms = list(cnt_Resid_A_abs = abs(cnt_Resid_A),
                                    cnt_Resid_A_2 = cnt_Resid_A^2,
                                    cnt_rel_A = abs(cnt_Resid_A)/cnt)
                  )$sDataFrame

## Set B, C & D:
sum <- rxSummary(~ cnt_Resid_B_abs + cnt_Resid_B_2 + cnt_rel_B +
                   cnt_Resid_C_abs + cnt_Resid_C_2 + cnt_rel_C +
                   cnt_Resid_D_abs + cnt_Resid_D_2 + cnt_rel_D,
                 data = test_mrs, summaryStats = "Mean",
                 transforms = list(cnt_Resid_B_abs = abs(cnt_Resid_B),
                                   cnt_Resid_B_2 = cnt_Resid_B^2,
@@ -252,7 +312,7 @@ sum <- rxSummary(~ cnt_Resid_B_abs+cnt_Resid_B_2+cnt_rel_B+cnt_Resid_C_abs+cnt_R
                                   cnt_Resid_D_abs = abs(cnt_Resid_D),
                                   cnt_Resid_D_2 = cnt_Resid_D^2,
                                   cnt_rel_D = abs(cnt_Resid_D)/cnt)
                 )$sDataFrame
# Add row names.
features <- c("baseline: weather + holiday + weekday + weekend features for the predicted day",
@@ -263,7 +323,10 @@ features <- c("baseline: weather + holiday + weekday + weekend features for the
# List all measures in a data frame.
measures <- data.frame(Features = features,
                       MAE = c(sumA[1, 2], sum[1, 2], sum[4, 2], sum[7, 2]),
                       RMSE = c(sqrt(sumA[2, 2]),
                                sqrt(sum[2, 2]),
                                sqrt(sum[5, 2]),
                                sqrt(sum[8, 2])),
                       RAE = c(sumA[3, 2], sum[3, 2], sum[6, 2], sum[9, 2]))

# Review the measures.

View file

@@ -1,4 +1,6 @@
# This script shows how to import data into R that is referenced by a URL.

# Install the RCurl package if it's not already installed.
(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl"))

# Load packages.
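The pattern this script builds up to, sketched with the dataset URL used
elsewhere in this commit (any CSV URL works; the object names are
illustrative):

# Fetch a CSV over HTTPS with RCurl and parse it into a data frame.
library("RCurl")
url <- paste0("https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/",
              "examples/MRS_and_Machine_Learning/Datasets/",
              "Bike_Rental_UCI_Dataset.csv")
csvText <- getURL(url)
bikes <- read.csv(text = csvText)
head(bikes)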

View file

@@ -3,16 +3,17 @@
# Check whether the "RevoScaleR" package is loaded in the current environment.
if (!require("RevoScaleR")) {
    cat("RevoScaleR package does not seem to exist.
\nThis means that the functions starting with 'rx' will not run.
\nIf you have Microsoft R Server installed,
\nplease switch the R engine.
\nFor example, in R Tools for Visual Studio:
\nR Tools -> Options -> R Engine.
\nIf Microsoft R Server is not installed, you can download it from:
\nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/
\n")
    quit()
}

# A URL contains the raw data.

View file

@@ -1,11 +1,11 @@
# This sample gives an introduction to plotting using the ggplot2 package.

# The ggplot2 package is tremendously popular because it allows you to create
# beautiful plots by describing the plot structure.

# R has a number of built-in datasets.
# In this example you use the dataset called quakes.
# This data contains locations of earthquakes off Fiji.

# Read the help page for more information
?quakes
@@ -24,71 +24,55 @@ if (!require("mapproj"))
    install.packages("mapproj") # required for map projections
library("mapproj")

# Plot longitude and latitude of quakes
# To create a plot, you have to specify the data, then map aesthetics to
# columns in your data. In this example, you map the column long to the x-axis
# and lat to the y-axis.
# Then you add a layer with points (geom_point) and a layer to plot maps.
p0 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point() +
    coord_map()
p0

# You can use a number of different aesthetics, for example colour or size
# of the points.

# Map the depth column to the colour aesthetic.
p1 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth)) +
    coord_map()
p1

# Add size for magnitude. The bigger the magnitude, the larger the point.
p2 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag)) +
    coord_map()
p2

# You can control the transparency of a plot object with the alpha aesthetic.
# High values of alpha (close to 1) are opaque, while low values (close to 0)
# are translucent.
# Add alpha level to hide overplotting, thus revealing detail.
p3 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map()
p3

# Change colour gradient by adding a gradient scale.
p4 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    coord_map() +
    scale_colour_gradient(low = "blue", high = "red")
p4

# Add a plot title.
p5 <- ggplot(quakes, aes(x = long, y = lat)) +
    geom_point(aes(colour = depth, size = mag), alpha = 0.25) +
    scale_colour_gradient(low = "blue", high = "red") +
    ggtitle("Distribution of earthquakes near Fiji") +
    coord_map()
p5

# Now plot multiple plots on the same graphic.
# The package "grid" is built into R and allows you to take control of the
# plotting area. A grob is the abbreviation for "graphical object", and the
@@ -100,38 +84,35 @@ theme_set(theme_grey(12) + theme(legend.key.size = unit(0.5, "lines")))
library(grid)
plot.new()
grid.draw(cbind(
    ggplotGrob(p1),
    ggplotGrob(p2),
    ggplotGrob(p3),
    size = "last"
))
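The same technique stacks plots vertically: rbind combines the grob tables
row-wise just as cbind combined them column-wise. A sketch under the same
assumptions as the block above:

# Stack two of the plots on top of each other.
plot.new()
grid.draw(rbind(
    ggplotGrob(p4),
    ggplotGrob(p5),
    size = "last"
))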
# The package "gtable" allows you to work with objects called grob tables.
# A grob table captures all the information needed to layout grobs in a table
# structure. It supports row and column spanning, offers some tools to
# automatically figure out the correct dimensions, and makes it easy to align
# and combine multiple tables.
if (!require("gtable"))
    install.packages("gtable")
library(gtable)

plonglat <- ggplot(quakes, aes(x = long, y = lat, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Top view")

plongdep <- ggplot(quakes, aes(x = long, y = -depth, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Side view")

platdep <- ggplot(quakes, aes(x = depth, y = lat, size = mag, col = depth)) +
    geom_point(alpha = 0.5) +
    ggtitle("Side view")

# Next, define a gtable and add grobs to the table.
gt <- gtable(widths = unit(rep(1, 2), "null"),
             heights = unit(rep(1, 2), "null"))
gt <- gtable_add_grob(gt,