This commit is contained in:
Qiong Wei 2017-06-20 12:18:30 -07:00
Родитель d76a473db0
Коммит 841c50f454
2 изменённых файла: 7 добавлений и 3 удаления

Просмотреть файл

@@ -48,7 +48,7 @@ feature_engineer <- function(trainingSet,
for (i in featuresNum)
{
rxSetComputeContext(sparkContext)
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")),
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
selectFeatures(modelFormula, mode = mutualInformation(numFeaturesToKeep = i)))
candinateModel <- rxLogisticRegression(modelFormula, data = trainingSet, mlTransforms = mlTrans)
predictedScore <- rxPredict(candinateModel, testingSet, extraVarsToWrite = c("charge_off"))
@@ -74,7 +74,7 @@ feature_engineer <- function(trainingSet,
print("selecting features...")
rxSetComputeContext(sparkContext)
modelFormula <- as.formula(paste(paste("charge_off~"), paste(featuresName, collapse = "+")))
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")),
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
selectFeatures(modelFormula, mode = mutualInformation(numFeaturesToKeep = numFeaturesToKeep)))
model <- rxLogisticRegression(modelFormula, data = trainingSet, mlTransforms = mlTrans)
selectedFeaturesName <- names(summary(model)$summary)

Просмотреть файл

@@ -64,7 +64,7 @@ training_evaluation <- function(HDFSWorkDir,
###################################################################
# get the formula for modeling
modelFormula <- as.formula(paste(paste("charge_off~"), paste(selectedFeaturesName, collapse = "+")))
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "homeOwnership", "yearsEmployment")))
mlTrans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")))
# Train the Random Forest.
print("Training RF model...")
forest_model <- rxFastForest(modelFormula,
@@ -85,6 +85,7 @@ training_evaluation <- function(HDFSWorkDir,
rxSetComputeContext(sparkContext)
logistic_model <- rxLogisticRegression(formula = modelFormula,
data = trainingSet,
mlTransforms = mlTrans,
reportProgress = 0)
# save the fitted model to local edge node.
@@ -99,6 +100,7 @@ training_evaluation <- function(HDFSWorkDir,
rxSetComputeContext(sparkContext)
tree_model <- rxFastTrees(formula = modelFormula,
data = trainingSet,
mlTransforms = mlTrans,
reportProgress = 0)
# save the fitted model to local edge node.
@@ -113,6 +115,7 @@ training_evaluation <- function(HDFSWorkDir,
rxSetComputeContext(sparkContext)
linear_model <- rxFastLinear(formula = modelFormula,
data = trainingSet,
mlTransforms = mlTrans,
reportProgress = 0)
# save the fitted model to local edge node.
@@ -129,6 +132,7 @@ training_evaluation <- function(HDFSWorkDir,
data = trainingSet,
numIterations = 42,
optimizer = adaDeltaSgd(),
mlTransforms = mlTrans,
reportProgress = 0)
# save the fitted model to local edge node.