From 841c50f4546b11a4f077a76d9eec01b658050b1e Mon Sep 17 00:00:00 2001
From: Qiong Wei
Date: Tue, 20 Jun 2017 12:18:30 -0700
Subject: [PATCH] add column branch

---
 HDI/RSparkCluster/step2_feature_engineering.R | 4 ++--
 HDI/RSparkCluster/step3_training_evaluation.R | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/HDI/RSparkCluster/step2_feature_engineering.R b/HDI/RSparkCluster/step2_feature_engineering.R
index 8f83de9..9c007e4 100644
--- a/HDI/RSparkCluster/step2_feature_engineering.R
+++ b/HDI/RSparkCluster/step2_feature_engineering.R
@@ -48,7 +48,7 @@ feature_engineer <- function(trainingSet,
 
   for (i in featuresNum) {
     rxSetComputeContext(sparkContext)
-    mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")),
+    mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
                     selectFeatures(modelFormula, mode = mutualInformation(numFeaturesToKeep = i)))
     candinateModel <- rxLogisticRegression(modelFormula, data = trainingSet, mlTransforms = mlTrans)
     predictedScore <- rxPredict(candinateModel, testingSet, extraVarsToWrite = c("charge_off"))
@@ -74,7 +74,7 @@ feature_engineer <- function(trainingSet,
   print("selecting features...")
   rxSetComputeContext(sparkContext)
   modelFormula <- as.formula(paste(paste("charge_off~"), paste(featuresName, collapse = "+")))
-  mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")),
+  mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
                   selectFeatures(modelFormula, mode = mutualInformation(numFeaturesToKeep = numFeaturesToKeep)))
   model <- rxLogisticRegression(modelFormula, data = trainingSet, mlTransforms = mlTrans)
   selectedFeaturesName <- names(summary(model)$summary)
diff --git a/HDI/RSparkCluster/step3_training_evaluation.R b/HDI/RSparkCluster/step3_training_evaluation.R
index 35c18e4..5def108 100644
--- a/HDI/RSparkCluster/step3_training_evaluation.R
+++ b/HDI/RSparkCluster/step3_training_evaluation.R
@@ -64,7 +64,7 @@ training_evaluation <- function(HDFSWorkDir,
   ###################################################################
   # get the formula for modeling
   modelFormula <- as.formula(paste(paste("charge_off~"), paste(selectedFeaturesName, collapse = "+")))
-  mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")))
+  mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")))
   # Train the Random Forest.
   print("Training RF model...")
   forest_model <- rxFastForest(modelFormula,
@@ -85,6 +85,7 @@ training_evaluation <- function(HDFSWorkDir,
   rxSetComputeContext(sparkContext)
   logistic_model <- rxLogisticRegression(formula = modelFormula,
                                          data = trainingSet,
+                                         mlTransforms = mlTrans,
                                          reportProgress = 0)
 
   # save the fitted model to local edge node.
@@ -99,6 +100,7 @@ training_evaluation <- function(HDFSWorkDir,
   rxSetComputeContext(sparkContext)
   tree_model <- rxFastTrees(formula = modelFormula,
                             data = trainingSet,
+                            mlTransforms = mlTrans,
                             reportProgress = 0)
 
   # save the fitted model to local edge node.
@@ -113,6 +115,7 @@ training_evaluation <- function(HDFSWorkDir,
   rxSetComputeContext(sparkContext)
   linear_model <- rxFastLinear(formula = modelFormula,
                                data = trainingSet,
+                               mlTransforms = mlTrans,
                                reportProgress = 0)
 
   # save the fitted model to local edge node.
@@ -129,6 +132,7 @@ training_evaluation <- function(HDFSWorkDir,
                           data = trainingSet,
                           numIterations = 42,
                           optimizer = adaDeltaSgd(),
+                          mlTransforms = mlTrans,
                           reportProgress = 0)
 
   # save the fitted model to local edge node.
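
For context on the pattern this patch extends: string-valued columns (now including the new "branch" column) are declared with categorical() and the resulting list is handed to each learner through mlTransforms, so the column is encoded at training time instead of being passed through as a raw string. Below is a minimal sketch of that pattern; the loans data frame and its values are hypothetical stand-ins for the HDFS-backed training set used by the scripts, while categorical(), mlTransforms, and rxLogisticRegression() are the same MicrosoftML calls that appear in the diff.

library(MicrosoftML)

# Hypothetical in-memory stand-in for the loan training set; the real scripts
# read the data from HDFS under an RxSpark compute context.
loans <- data.frame(
  charge_off       = factor(c(1, 0, 0, 1)),
  loanAmount       = c(12000, 8000, 15000, 9500),
  purpose          = c("debtconsolidation", "homeimprovement", "debtconsolidation", "other"),
  residentialState = c("WA", "CA", "TX", "WA"),
  branch           = c("Seattle", "LosAngeles", "Austin", "Seattle"),  # the new column
  homeOwnership    = c("rent", "own", "mortgage", "rent"),
  yearsEmployment  = c("< 1 year", "5 years", "10+ years", "2 years"),
  stringsAsFactors = FALSE
)

# Same pattern as the patch: declare the string columns (now including "branch")
# as categorical transforms and pass the list to the learner via mlTransforms.
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch",
                                     "homeOwnership", "yearsEmployment")))

model <- rxLogisticRegression(charge_off ~ loanAmount + purpose + residentialState +
                                branch + homeOwnership + yearsEmployment,
                              data = loans,
                              mlTransforms = mlTrans,
                              reportProgress = 0)

Because every call in step3 that now receives mlTransforms is given the same mlTrans list, those models all see an identical categorical encoding of the new branch column.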