From e19d27ef418e9b8650822be54e077569031cb3fe Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 14 Mar 2016 13:13:50 -0400 Subject: [PATCH] handle no MRS installation situation in MRS scripts --- .../MRS_bike_rental_estimation.R | 53 ++++++------ ...S_bike_rental_estimation_AML_replication.R | 63 ++++++++------- .../Data_Exploration/Import_Data_from_URL.R | 13 +-- .../Import_Data_from_URL_to_xdf.R | 35 ++++---- .../MRS_flight_delays.R | 73 ++++++++++------- .../MRS_flight_delays_with_R_comparison.R | 81 ++++++++++--------- 6 files changed, 177 insertions(+), 141 deletions(-) diff --git a/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation.R b/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation.R index cfaddd5..acd2126 100644 --- a/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation.R +++ b/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation.R @@ -1,6 +1,6 @@ -############################################################################################################## -##################### Regression: Demand estimation with Microsoft R Server ################################## -############################################################################################################## +#------------------------------------------------------------------------------------------------------------- +#-------------------------- Regression: Demand estimation with Microsoft R Server ---------------------------- +#------------------------------------------------------------------------------------------------------------- # # # This example demonstrates the Feature Engineering process for building a regression model to predict @@ -18,29 +18,34 @@ # The following scripts include five basic steps of building this example using Microsoft R Server. 
# # -############################################################################################################## +#-------------------------------------------------------------------------------------------------------------- -#---------------------------Step 0: Get Started--------------------------- -# Check whether the "RevoScaleR" package is loaded in the current environment. -if (require("RevoScaleR")) { - library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server. - message("RevoScaleR package is succesfully loaded.") -} else { - message("Can't find RevoScaleR package...") - message("If you have Microsoft R Server installed,") - message("please switch the R engine") - message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.") - message("If Microsoft R Server is not installed,") - message("please download it from here:") - message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.") +#---------------------------Step 0: Get Started------------------------------- +# ---------------------------------------------------------------------------- +# Check if Microsoft R Server (RRE 8.0) is installed +# ---------------------------------------------------------------------------- +if (!require("RevoScaleR")) { + cat("RevoScaleR package does not seem to exist. + \nThis means that the functions starting with 'rx' will not run. + \nIf you have Microsoft R Server installed, please switch the R engine. + \nFor example, in R Tools for Visual Studio: + \nR Tools -> Options -> R Engine. + \nIf Microsoft R Server is not installed, you can download it from: + \nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/ + \n") + + quit() } # Initial some variables. -inputFileBikeURL <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/Bike_Rental_UCI_Dataset.csv" -(if (!exists("tmp")) dir.create("tmp", showWarnings = FALSE)) # create a temporary folder to store the .xdf files. 
-outFileBike <- "tmp/bike.xdf" -outFileLag <- "tmp/lagData.xdf" +github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/" +inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv") + +# Create a temporary directory to store the intermediate .xdf files. +td <- tempdir() +outFileBike <- paste0(td, "/bike.xdf") +outFileLag <- paste0(td, "/lagData.xdf") #---------------------------Step 1: Import the Bike Data--------------------------- bike <- rxImport(inData = inputFileBikeURL, outFile = outFileBike, overwrite = TRUE, @@ -91,11 +96,11 @@ lagData <- rxDataStep(inData = bike, outFile = outFileLag, transformFunc = compu #---------------------------Step 3: Prepare Training and Test Datasets--------------------------- # Split data by "yr" so that the training data contains records for the year 2011 and the test data contains records for 2012. -rxSplit(inData = lagData, outFilesBase = "tmp/modelData", splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0) +rxSplit(inData = lagData, outFilesBase = paste0(td, "/modelData"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0) # Point to the .xdf files for the training and test set. -train <- RxXdfData("tmp/modelData.yr.0.xdf") -test <- RxXdfData("tmp/modelData.yr.1.xdf") +train <- RxXdfData(paste0(td, "/modelData.yr.0.xdf")) +test <- RxXdfData(paste0(td, "/modelData.yr.1.xdf")) #---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)--------------------------- # Build a formula for the regression model and remove the "yr", which is used to split the training and test data. 
diff --git a/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation_AML_replication.R b/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation_AML_replication.R index c5519e9..10acaaa 100644 --- a/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation_AML_replication.R +++ b/examples/MRS_and_Machine_Learning/Bike_Rental_Estimation_with_MRS/MRS_bike_rental_estimation_AML_replication.R @@ -1,6 +1,6 @@ -############################################################################################################## -##################### Regression: Demand estimation with Microsoft R Server ################################## -############################################################################################################## +#------------------------------------------------------------------------------------------------------------- +#-------------------------- Regression: Demand estimation with Microsoft R Server ---------------------------- +#------------------------------------------------------------------------------------------------------------- # # # This example a replication of an existing Azure Machine Learning Experiment - Regression: Demand Estimation @@ -37,30 +37,35 @@ # The following scripts include five basic steps of building this example using Microsoft R Server. # # -############################################################################################################## +#-------------------------------------------------------------------------------------------------------------- -#---------------------------Step 0: Get Started--------------------------- -# Check whether the "RevoScaleR" package is loaded in the current environment. -if (require("RevoScaleR")) { - library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server. 
- message("RevoScaleR package is succesfully loaded.") -} else { - message("Can't find RevoScaleR package...") - message("If you have Microsoft R Server installed,") - message("please switch the R engine") - message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.") - message("If Microsoft R Server is not installed,") - message("please download it from here:") - message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.") +#---------------------------Step 0: Get Started------------------------------- +# ---------------------------------------------------------------------------- +# Check if Microsoft R Server (RRE 8.0) is installed +# ---------------------------------------------------------------------------- +if (!require("RevoScaleR")) { + cat("RevoScaleR package does not seem to exist. + \nThis means that the functions starting with 'rx' will not run. + \nIf you have Microsoft R Server installed, please switch the R engine. + \nFor example, in R Tools for Visual Studio: + \nR Tools -> Options -> R Engine. + \nIf Microsoft R Server is not installed, you can download it from: + \nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/ + \n") + + quit() } # Initial some variables. -inputFileBikeURL <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/Bike_Rental_UCI_Dataset.csv" -(if (!exists("tmp")) dir.create("tmp", showWarnings = FALSE)) # create a temporary folder to store the .xdf files. -outFileBike <- "tmp/bike2.xdf" -outFileEdit <- "tmp/editData.xdf" -outFileLag <- "tmp/lagData" +github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/" +inputFileBikeURL <- paste0(github, "Bike_Rental_UCI_Dataset.csv") + +# Create a temporary directory to store the intermediate .xdf files. 
+td <- tempdir() +outFileBike <- paste0(td, "/bike.xdf") +outFileEdit <- paste0(td, "/editData.xdf") +outFileLag <- paste0(td, "/lagData") #---------------------------Step 1: Import Data--------------------------- # Import the bike data. @@ -153,19 +158,21 @@ finalDataLag_mrs <- RxXdfData(finalDataLag_dir) #---------------------------Step 3: Prepare Training and Test Datasets--------------------------- ## Set A: # Split Data. -rxSplit(inData = finalDataA_mrs, outFilesBase = "tmp/modelDataA", splitByFactor = "yr", +rxSplit(inData = finalDataA_mrs, + outFilesBase = paste0(td, "/modelDataA"), + splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0) # Point to the .xdf files for the training and test set. -trainA_mrs <- RxXdfData("tmp/modelDataA.yr.0.xdf") -testA_mrs <- RxXdfData("tmp/modelDataA.yr.1.xdf") +trainA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.0.xdf")) +testA_mrs <- RxXdfData(paste0(td, "/modelDataA.yr.1.xdf")) ## Set B, C & D: # Split Data. -rxSplit(inData = finalDataLag_mrs, outFilesBase = "tmp/modelDataLag", splitByFactor = "yr", +rxSplit(inData = finalDataLag_mrs, outFilesBase = paste0(td, "/modelDataLag"), splitByFactor = "yr", overwrite = TRUE, reportProgress = 0, verbose = 0) # Point to the .xdf files for the training and test set. 
-train_mrs <- RxXdfData("tmp/modelDataLag.yr.0.xdf") -test_mrs <- RxXdfData("tmp/modelDataLag.yr.1.xdf") +train_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.0.xdf")) +test_mrs <- RxXdfData(paste0(td, "/modelDataLag.yr.1.xdf")) #---------------------------Step 4: Choose and apply a learning algorithm (Decision Forest Regression)--------------------------- newDayFeatures <- paste("demand", ".", seq(12), "day", sep = "") diff --git a/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL.R b/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL.R index db38c83..57f4154 100644 --- a/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL.R +++ b/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL.R @@ -1,11 +1,12 @@ -# Import packages. -(if (!require("RCurl")) install.packages("RCurl")) -library("RCurl") -(if (!require("foreign")) install.packages("foreign")) -library("foreign") +# Install a package if it's not already installed. +(if (!require("RCurl", quietly = TRUE)) install.packages("RCurl")) + +# Load packages. +library("RCurl", quietly = TRUE) # A URL contains the raw data. -inputDataURL <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/Flight_Delays_Sample.csv" +github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/" +inputDataURL <- paste0(github, "Flight_Delays_Sample.csv") # Download data from the URL. inputData <- getURL(inputDataURL) diff --git a/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL_to_xdf.R b/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL_to_xdf.R index 62f6a11..958109d 100644 --- a/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL_to_xdf.R +++ b/examples/MRS_and_Machine_Learning/Data_Exploration/Import_Data_from_URL_to_xdf.R @@ -2,29 +2,30 @@ # to download data from a URL. 
# Check whether the "RevoScaleR" package is loaded in the current environment. -if (require("RevoScaleR")) { - library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server. - message("RevoScaleR package is succesfully loaded.") -} else { - message("Can't find RevoScaleR package...") - message("If you have Microsoft R Server installed,") - message("please switch the R engine") - message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.") - message("If Microsoft R Server is not installed,") - message("please download it from here:") - message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.") -} +if (!require("RevoScaleR")) { + cat("RevoScaleR package does not seem to exist. + \nThis means that the functions starting with 'rx' will not run. + \nIf you have Microsoft R Server installed, please switch the R engine. + \nFor example, in R Tools for Visual Studio: + \nR Tools -> Options -> R Engine. + \nIf Microsoft R Server is not installed, you can download it from: + \nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/ + \n") + + quit() +} # A URL contains the raw data. -inputDataURL <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/Flight_Delays_Sample.csv" +github <- "https://raw.githubusercontent.com/mezmicrosoft/RTVS-docs/master/examples/MRS_and_Machine_Learning/Datasets/" +inputDataURL <- paste0(github, "Flight_Delays_Sample.csv") -# Create a temporary folder to store the .xdf files. -(if (!exists("tmp")) dir.create("tmp", showWarnings = FALSE)) +# Create a temporary directory to store the intermediate .xdf files. +td <- tempdir() # Read a downloaded .csv file into a RxXdfData object. -outFile <- 'tmp/data.xdf' +outFile <- paste0(td, "/data.xdf") df_xdf <- rxImport(inData = inputDataURL, outFile = outFile, missingValueString = "M", stringsAsFactors = FALSE) # Review the first 6 rows of data. 
-head(df_xdf) +head(df_xdf) \ No newline at end of file diff --git a/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays.R b/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays.R index 72d0f86..1759611 100644 --- a/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays.R +++ b/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays.R @@ -1,43 +1,52 @@ -# -# Flight Delay Prediction with Microsoft R Server -# +#------------------------------------------------------------------------------------------------------------------------------------ +#------------------------------------- Flight Delay Prediction with Microsoft R Server ---------------------------------------------- +#------------------------------------------------------------------------------------------------------------------------------------ -# In this example, we use historical on-time performance and weather data to predict whether the arrival of a scheduled passenger flight will be delayed by more than 15 minutes. +# In this example, we use historical on-time performance and weather data to predict whether the arrival of a scheduled passenger +# flight will be delayed by more than 15 minutes. -# We approach this problem as a classification problem, predicting two classes -- whether the flight will be delayed, or whether it will be on time. Broadly speaking, in machine learning and statistics, classification is the task of identifying the class or category to which a new observation belongs, on the basis of a training set of data containing observations with known categories. Classification is generally a supervised learning problem. Since this is a binary classification task, there are only two classes. +# We approach this problem as a classification problem, predicting two classes -- whether the flight will be delayed, or whether it +# will be on time. 
Broadly speaking, in machine learning and statistics, classification is the task of identifying the class or category +# to which a new observation belongs, on the basis of a training set of data containing observations with known categories. +# Classification is generally a supervised learning problem. Since this is a binary classification task, there are only two classes. -# In this example, we train a model using a large number of examples from historic flight data, along with an outcome measure that indicates the appropriate category or class for each example. The two classes are labeled 1 if a flight was delayed, and labeled 0 if the flight was on time. +# In this example, we train a model using a large number of examples from historic flight data, along with an outcome measure that +# indicates the appropriate category or class for each example. The two classes are labeled 1 if a flight was delayed, and labeled 0 +# if the flight was on time. # The following scripts include five basic steps of building this example using Microsoft R Server. -#---------------------------Step 0: Get Started--------------------------- +#---------------------------Step 0: Get Started------------------------------- # ---------------------------------------------------------------------------- -# check if Microsoft R Server (RRE 8.0) is installed +# Check if Microsoft R Server (RRE 8.0) is installed # ---------------------------------------------------------------------------- -if (require("RevoScaleR")) { - library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server. 
- message("RevoScaleR package is succesfully loaded.") -} else { - message("Can't find RevoScaleR package...") - message("If you have Microsoft R Server installed,") - message("please switch the R engine") - message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.") - message("If Microsoft R Server is not installed,") - message("please download it from here:") - message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.") +if (!require("RevoScaleR")) { + cat("RevoScaleR package does not seem to exist. + \nThis means that the functions starting with 'rx' will not run. + \nIf you have Microsoft R Server installed, please switch the R engine. + \nFor example, in R Tools for Visual Studio: + \nR Tools -> Options -> R Engine. + \nIf Microsoft R Server is not installed, you can download it from: + \nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/ + \n") + + quit() } # Initial some variables. github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/" inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv") inputFileWeatherURL <- paste0(github, "Weather_Sample.csv") -outFileFlight <- "flight.xdf" -outFileWeather <- "weather.xdf" -outFileOrigin <- "originData.xdf" -outFileDest <- "destData.xdf" -outFileFinal <- "finalData.xdf" + +# Create a temporary directory to store the intermediate .xdf files. +td <- tempdir() +outFileFlight <- paste0(td, "/flight.xdf") +outFileWeather <- paste0(td, "/weather.xdf") +outFileOrigin <- paste0(td, "/originData.xdf") +outFileDest <- paste0(td, "/destData.xdf") +outFileFinal <- paste0(td, "/finalData.xdf") #---------------------------Step 1: Import Data-------------------------------- @@ -140,7 +149,7 @@ rxFactors(inData = destData_mrs, outFile = outFileFinal, sortLevels = TRUE, # Randomly split 80% data as training set and the remaining 20% as test set. 
rxSplit(inData = outFileFinal, - outFilesBase = "tmp/modelData", + outFilesBase = paste0(td, "/modelData"), outFileSuffixes = c("Train", "Test"), splitByFactor = "splitVar", overwrite = TRUE, @@ -154,8 +163,8 @@ rxSplit(inData = outFileFinal, consoleOutput = TRUE) # Point to the .xdf files for the training and test set. -train <- RxXdfData("tmp/modelData.splitVar.Train.xdf") -test <- RxXdfData("tmp/modelData.splitVar.Test.xdf") +train <- RxXdfData(paste0(td, "/modelData.splitVar.Train.xdf")) +test <- RxXdfData(paste0(td, "/modelData.splitVar.Test.xdf")) #- Step 4A: Choose and apply a learning algorithm (Logistic Regression) ------- # Build the formula. @@ -176,7 +185,10 @@ rxPredict(logitModel_mrs, data = test, overwrite = TRUE) # Calculate Area Under the Curve (AUC). -rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Logit", test)) +paste0("AUC of Logistic Regression Model:", + rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Logit", test))) + +# Plot the ROC curve. rxRocCurve("ArrDel15", "ArrDel15_Pred_Logit", data = test, title = "ROC curve - Logistic regression") @@ -197,7 +209,10 @@ rxPredict(dTree2_mrs, data = test, overwrite = TRUE) # Calculate Area Under the Curve (AUC). -rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Tree", test)) +paste0("AUC of Decision Tree Model:", + rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Tree", test))) + +# Plot the ROC curve. 
rxRocCurve("ArrDel15", predVarNames = c("ArrDel15_Pred_Tree", "ArrDel15_Pred_Logit"), data = test, diff --git a/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays_with_R_comparison.R b/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays_with_R_comparison.R index fb36a83..51c5d32 100644 --- a/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays_with_R_comparison.R +++ b/examples/MRS_and_Machine_Learning/Flight_Delays_Prediction_with_MRS/MRS_flight_delays_with_R_comparison.R @@ -1,6 +1,6 @@ -##################################################################################################################################### -################################ Flight Delay Prediction with Microsoft R Server #################################################### -##################################################################################################################################### +#------------------------------------------------------------------------------------------------------------------------------------ +#------------------------------------- Flight Delay Prediction with Microsoft R Server ---------------------------------------------- +#------------------------------------------------------------------------------------------------------------------------------------ # # # This example demostrates a step-by-step comparison of solving a Machine Learning use case using open @@ -25,36 +25,40 @@ # The following scripts include five basic steps of building this example using Microsoft R Server. 
# # -##################################################################################################################################### +#------------------------------------------------------------------------------------------------------------------------------------ -#---------------------------Step 0: Get Started--------------------------- -# Check whether the "RevoScaleR" package is loaded in the current environment. -if (require("RevoScaleR")) { - library("RevoScaleR") # Load RevoScaleR package from Microsoft R Server. - message("RevoScaleR package is succesfully loaded.") -} else { - message("Can't find RevoScaleR package...") - message("If you have Microsoft R Server installed,") - message("please switch the R engine") - message("in R Tools for Visual Studio: R Tools -> Options -> R Engine.") - message("If Microsoft R Server is not installed,") - message("please download it from here:") - message("https://www.microsoft.com/en-us/server-cloud/products/r-server/.") +#---------------------------Step 0: Get Started------------------------------- +# ---------------------------------------------------------------------------- +# Check if Microsoft R Server (RRE 8.0) is installed +# ---------------------------------------------------------------------------- +if (!require("RevoScaleR")) { + cat("RevoScaleR package does not seem to exist. + \nThis means that the functions starting with 'rx' will not run. + \nIf you have Microsoft R Server installed, please switch the R engine. + \nFor example, in R Tools for Visual Studio: + \nR Tools -> Options -> R Engine. + \nIf Microsoft R Server is not installed, you can download it from: + \nhttps://www.microsoft.com/en-us/server-cloud/products/r-server/ + \n") + + quit() } # Initial some variables. 
github <- "https://raw.githubusercontent.com/Microsoft/RTVS-docs/master/examples/Datasets/" inputFileFlightURL <- paste0(github, "Flight_Delays_Sample.csv") inputFileWeatherURL <- paste0(github, "Weather_Sample.csv") -(if (!exists("tmp")) dir.create("tmp", showWarnings = FALSE)) # create a temporary folder to store the .xdf files. -outFileFlight <- 'tmp/flight2.xdf' -outFileFlight2 <- 'tmp/flight2_2.xdf' -outFileWeather <- 'tmp/weather2.xdf' -outFileWeather2 <- 'tmp/weather2_2.xdf' -outFileOrigin <- 'tmp/originData2.xdf' -outFileDest <- 'tmp/DestData2.xdf' -outFileFinal <- 'tmp/finalData2.xdf' + +# Create a temporary directory to store the intermediate .xdf files. +td <- tempdir() +outFileFlight <- paste0(td, "/flight.xdf") +outFileFlight2 <- paste0(td, "/flight2.xdf") +outFileWeather <- paste0(td, "/weather.xdf") +outFileWeather2 <- paste0(td, "/weather2.xdf") +outFileOrigin <- paste0(td, "/originData.xdf") +outFileDest <- paste0(td, "/DestData.xdf") +outFileFinal <- paste0(td, "/finalData.xdf") #---------------------------Step 1: Import Data--------------------------- # Import the flight data. @@ -141,7 +145,7 @@ finalData_mrs <- rxDataStep(inData = destData_mrs, outFile = outFileFinal, #---------------------------Step 3: Prepare Training and Test Datasets--------------------------- # Randomly split 80% data as training set and the remaining 20% as test set. rxSplit(inData = outFileFinal, - outFilesBase = "tmp/modelData", + outFilesBase = paste0(td, "/modelData"), outFileSuffixes = c("Train", "Test"), splitByFactor = "splitVar", overwrite = TRUE, @@ -151,8 +155,8 @@ rxSplit(inData = outFileFinal, consoleOutput = TRUE) # Point to the .xdf files for the training and test set. 
-train <- RxXdfData("tmp/modelData.splitVar.Train.xdf") -test <- RxXdfData("tmp/modelData.splitVar.Test.xdf") +train <- RxXdfData(paste0(td, "/modelData.splitVar.Train.xdf")) +test <- RxXdfData(paste0(td, "/modelData.splitVar.Test.xdf")) #---------------------------Step 4A: Choose and apply a learning algorithm (Logistic Regression)--------------------------- # Build the formula. @@ -169,9 +173,12 @@ summary(logitModel_mrs) predictLogit_mrs <- rxPredict(logitModel_mrs, data = test, type = "response", predVarNames = "ArrDel15_Pred_Logit", overwrite = TRUE) # Calculate Area Under the Curve (AUC). -rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Logit", predictLogit_mrs)) +paste0("AUC of Logistic Regression Model:", + rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Logit", test))) + +# Plot the ROC curve. rxRocCurve("ArrDel15", "ArrDel15_Pred_Logit", data = test, - title = "ROC curve - Logistic regression") + title = "ROC curve - Logistic regression") #---------------------------Step 4B: Choose and apply a learning algorithm (Decision Tree)--------------------------- # Build a decision tree model. @@ -188,11 +195,11 @@ dTree2_mrs <- prune.rxDTree(dTree1_mrs, cp = treeCp_mrs) predictTree_mrs <- rxPredict(dTree2_mrs, data = test, predVarNames = "ArrDel15_Pred_Tree", overwrite = TRUE) # Calculate Area Under the Curve (AUC). -rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Tree", predictTree_mrs)) -<<<<<<< HEAD +paste0("AUC of Decision Tree Model:", + rxAuc(rxRoc("ArrDel15", "ArrDel15_Pred_Tree", test))) + +# Plot the ROC curve. rxRocCurve("ArrDel15", - predVarNames = c("ArrDel15_Pred_Tree", "ArrDel15_Pred_Logit"), - data = test, - title = "ROC curve - Logistic regression") -======= ->>>>>>> 5677fd41eb8011dae8e1c0ecf626df3d11cd3dc4 + predVarNames = c("ArrDel15_Pred_Tree", "ArrDel15_Pred_Logit"), + data = test, + title = "ROC curve - Logistic regression")