2017-06-17 02:23:21 +03:00
|
|
|
/*
|
|
|
|
* SQLR script to create stored procedure for training.
|
|
|
|
*/
|
2017-06-15 22:49:50 +03:00
|
|
|
SET ANSI_NULLS ON
|
|
|
|
GO
|
|
|
|
SET QUOTED_IDENTIFIER ON
|
|
|
|
GO
|
|
|
|
|
|
|
|
DROP PROCEDURE IF EXISTS [dbo].[train_model];
|
|
|
|
GO
|
|
|
|
|
2017-06-17 02:23:21 +03:00
|
|
|
/*
|
|
|
|
* Stored Procedure for training of models using MicrosoftML algorithms. This also evaluates the models and stores
|
|
|
|
* the following stats along with serialized model binary, accuracy, auc, precision, recall, f1score.
|
|
|
|
* The parameters can be tuned for various algorithms based on performance on your data.
|
|
|
|
* Parameters:
|
|
|
|
* @training_set_table - training data table name
|
|
|
|
* @test_set_table - test data table name for model evaluation
|
|
|
|
* @scored_table - table to store scores in when doing model evaluation
|
|
|
|
* @model_table - table to store model in serialized binary format along with evaluation stats
|
2017-06-20 21:59:39 +03:00
|
|
|
* @model_alg - the algorithm to use for training the model.
|
|
|
|
* Can be one of 'logistic_reg', 'fast_trees', 'fast_forest', 'fast_linear', 'neural_net'
|
2017-06-17 02:23:21 +03:00
|
|
|
* @connectionString - connection string to connect to the database for use in the R script
|
|
|
|
*/
|
2017-06-20 21:59:39 +03:00
|
|
|
CREATE PROCEDURE [train_model] @training_set_table nvarchar(100), @test_set_table nvarchar(100), @scored_table nvarchar(100), @model_table nvarchar(100), @model_alg nvarchar(50), @connectionString nvarchar(300)
|
2017-06-15 22:49:50 +03:00
|
|
|
AS
|
|
|
|
BEGIN
|
|
|
|
|
2017-06-22 23:14:34 +03:00
|
|
|
DECLARE @payload varbinary(max), @selected_features nvarchar(1000), @auc real, @accuracy real, @precision real, @recall real, @f1score real;
|
|
|
|
DECLARE @del_cmd nvarchar(300), @ins_cmd nvarchar(300), @param_def nvarchar(300);
|
|
|
|
EXECUTE sp_execute_external_script @language = N'R',
|
|
|
|
@script = N'
|
2017-06-15 22:49:50 +03:00
|
|
|
library(RevoScaleR)
|
|
|
|
library(MicrosoftML)
|
|
|
|
# model evaluation functions
|
|
|
|
model_eval_stats <- function(scored_data, label="charge_off", predicted_prob="Probability", predicted_label="PredictedLabel")
|
|
|
|
{
|
|
|
|
roc <- rxRoc(label, grep(predicted_prob, names(scored_data), value=T), scored_data)
|
|
|
|
auc <- rxAuc(roc)
|
|
|
|
crosstab_formula <- as.formula(paste("~as.factor(", label, "):as.factor(", predicted_label, ")"))
|
|
|
|
cross_tab <- rxCrossTabs(crosstab_formula, scored_data)
|
|
|
|
conf_matrix <- cross_tab$counts[[1]]
|
|
|
|
tn <- conf_matrix[1,1]
|
|
|
|
fp <- conf_matrix[1,2]
|
|
|
|
fn <- conf_matrix[2,1]
|
|
|
|
tp <- conf_matrix[2,2]
|
|
|
|
accuracy <- (tp + tn) / (tp + fn + fp + tn)
|
|
|
|
precision <- tp/(tp+fp)
|
|
|
|
recall <- tp / (tp+fn)
|
|
|
|
f1score <- 2 * (precision * recall) / (precision + recall)
|
|
|
|
return(list(auc=auc, accuracy=accuracy, precision = precision, recall=recall, f1score=f1score))
|
|
|
|
}
|
|
|
|
cc <- RxInSqlServer(connectionString = connection_string)
|
|
|
|
rxSetComputeContext(cc)
|
|
|
|
training_set <- RxSqlServerData(table=train_set, connectionString = connection_string)
|
|
|
|
testing_set <- RxSqlServerData(table=test_set, connectionString = connection_string)
|
|
|
|
scoring_set <- RxSqlServerData(table=score_set, connectionString = connection_string, overwrite=TRUE)
|
|
|
|
##########################################################################################################################################
|
|
|
|
## Training and evaluating model based on model selection
|
|
|
|
##########################################################################################################################################
|
|
|
|
features <- rxGetVarNames(training_set)
|
|
|
|
vars_to_remove <- c("memberId", "loanId", "payment_date", "loan_open_date", "charge_off")
|
|
|
|
feature_names <- features[!(features %in% vars_to_remove)]
|
|
|
|
model_formula <- as.formula(paste(paste("charge_off~"), paste(feature_names, collapse = "+")))
|
2017-06-21 06:15:23 +03:00
|
|
|
ml_trans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
|
2017-06-22 23:14:34 +03:00
|
|
|
selectFeatures(model_formula, mode = mutualInformation(numFeaturesToKeep = 100)))
|
2017-06-15 22:49:50 +03:00
|
|
|
|
|
|
|
if (model_name == "logistic_reg") {
|
2017-06-22 23:14:34 +03:00
|
|
|
model <- rxLogisticRegression(formula = model_formula,
|
|
|
|
data = training_set,
|
|
|
|
mlTransforms = ml_trans)
|
2017-06-15 22:49:50 +03:00
|
|
|
} else if (model_name == "fast_trees") {
|
2017-06-22 23:14:34 +03:00
|
|
|
model <- rxFastTrees(formula = model_formula,
|
|
|
|
data = training_set,
|
|
|
|
mlTransforms = ml_trans)
|
2017-06-15 22:49:50 +03:00
|
|
|
} else if (model_name == "fast_forest") {
|
2017-06-22 23:14:34 +03:00
|
|
|
model <- rxFastForest(formula = model_formula,
|
|
|
|
data = training_set,
|
|
|
|
mlTransforms = ml_trans)
|
2017-06-15 22:49:50 +03:00
|
|
|
} else if (model_name == "fast_linear") {
|
2017-06-22 23:14:34 +03:00
|
|
|
model <- rxFastLinear(formula = model_formula,
|
|
|
|
data = training_set,
|
|
|
|
mlTransforms = ml_trans)
|
2017-06-15 22:49:50 +03:00
|
|
|
} else if (model_name == "neural_net") {
|
2017-06-22 23:14:34 +03:00
|
|
|
model <- rxNeuralNet(formula = model_formula,
|
|
|
|
data = training_set,
|
|
|
|
numIterations = 42,
|
2017-06-15 22:49:50 +03:00
|
|
|
optimizer = adaDeltaSgd(),
|
2017-06-22 23:14:34 +03:00
|
|
|
mlTransforms = ml_trans)
|
2017-06-15 22:49:50 +03:00
|
|
|
}
|
|
|
|
print("Done training.")
|
2017-06-22 23:14:34 +03:00
|
|
|
|
|
|
|
# selected features
|
|
|
|
features_to_remove <- c("(Bias)")
|
|
|
|
selected_features <- rxGetVarInfo(summary(model)$summary)
|
|
|
|
selected_feature_names <- names(selected_features)
|
|
|
|
selected_feature_filtered <- selected_feature_names[!(selected_feature_names %in% features_to_remove)]
|
|
|
|
selected_features_str <- paste(selected_feature_filtered, collapse=",")
|
|
|
|
|
2017-06-15 22:49:50 +03:00
|
|
|
# evaluate model
|
2017-06-21 22:42:34 +03:00
|
|
|
rxPredict(model, testing_set, outData = scoring_set, extraVarsToWrite = c("loanId", "payment_date", "charge_off"), overwrite=TRUE)
|
2017-06-15 22:49:50 +03:00
|
|
|
print("Done writing predictions for evaluation of model.")
|
|
|
|
model_stats <- model_eval_stats(scoring_set)
|
|
|
|
print(model_stats)
|
|
|
|
modelbin <- serialize(model, connection=NULL)
|
|
|
|
stat_auc <- model_stats[[1]]
|
|
|
|
|
|
|
|
stat_accuracy <- model_stats[[2]]
|
|
|
|
stat_precision <- model_stats[[3]]
|
|
|
|
stat_recall <- model_stats[[4]]
|
|
|
|
stat_f1score <- model_stats[[5]]
|
|
|
|
'
|
2017-06-21 21:29:02 +03:00
|
|
|
, @params = N'@model_name nvarchar(50), @connection_string nvarchar(300), @train_set nvarchar(100), @test_set nvarchar(100), @score_set nvarchar(100),
|
2017-06-22 23:14:34 +03:00
|
|
|
@modelbin varbinary(max) OUTPUT, @selected_features_str nvarchar(1000) OUTPUT, @stat_auc real OUTPUT, @stat_accuracy real OUTPUT, @stat_precision real OUTPUT, @stat_recall real OUTPUT, @stat_f1score real OUTPUT'
|
2017-06-17 02:23:21 +03:00
|
|
|
, @model_name = @model_alg
|
2017-06-15 22:49:50 +03:00
|
|
|
, @connection_string = @connectionString
|
|
|
|
, @train_set = @training_set_table
|
|
|
|
, @test_set = @test_set_table
|
|
|
|
, @score_set = @scored_table
|
|
|
|
, @modelbin = @payload OUTPUT
|
2017-06-22 23:14:34 +03:00
|
|
|
, @selected_features_str = @selected_features OUTPUT
|
2017-06-15 22:49:50 +03:00
|
|
|
, @stat_auc = @auc OUTPUT
|
|
|
|
, @stat_accuracy = @accuracy OUTPUT
|
|
|
|
, @stat_precision = @precision OUTPUT
|
|
|
|
, @stat_recall = @recall OUTPUT
|
|
|
|
, @stat_f1score = @f1score OUTPUT;
|
|
|
|
|
2017-06-17 02:23:21 +03:00
|
|
|
SET @del_cmd = N'DELETE FROM ' + @model_table + N' WHERE model_name = ''' + @model_alg + ''''
|
2017-06-15 22:49:50 +03:00
|
|
|
EXEC sp_executesql @del_cmd;
|
2017-06-22 23:14:34 +03:00
|
|
|
SET @ins_cmd = N'INSERT INTO ' + @model_table + N' (model_name, model, selected_features, auc, accuracy, precision, recall, f1score) VALUES (''' + @model_alg + ''', @p_payload, @p_selected_features, @p_auc, @p_accuracy, @p_precision, @p_recall, @p_f1score)'
|
2017-06-15 22:49:50 +03:00
|
|
|
SET @param_def = N'@p_payload varbinary(max),
|
2017-06-22 23:14:34 +03:00
|
|
|
@p_selected_features nvarchar(1000),
|
|
|
|
@p_auc real,
|
|
|
|
@p_accuracy real,
|
|
|
|
@p_precision real,
|
|
|
|
@p_recall real,
|
|
|
|
@p_f1score real'
|
2017-06-15 22:49:50 +03:00
|
|
|
EXEC sp_executesql @ins_cmd, @param_def,
|
2017-06-22 23:14:34 +03:00
|
|
|
@p_payload=@payload,
|
|
|
|
@p_selected_features=@selected_features,
|
|
|
|
@p_auc=@auc,
|
|
|
|
@p_accuracy=@accuracy,
|
|
|
|
@p_precision=@precision,
|
|
|
|
@p_recall=@recall,
|
|
|
|
@p_f1score=@f1score;
|
2017-06-15 22:49:50 +03:00
|
|
|
|
|
|
|
;
|
|
|
|
END
|
|
|
|
GO
|