r-server-loan-chargeoff/SQLR/step2a_optional_feature_sel...

75 строки
3.2 KiB
Transact-SQL

/*
* SQLR script to demonstrate feature selection available in MicrosoftML package.
* We use this same mechanism during training so this step is optional to run, but
* serves as an example of an approach for feature selection, i.e., preselect features
* and store in database table for later use in training of models.
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
DROP TABLE IF EXISTS [selected_features_$(datasize)];
CREATE TABLE [selected_features_$(datasize)](
[feature_id] [int] IDENTITY(1,1) NOT NULL,
[feature_name] [nvarchar](500) NOT NULL
);
GO
DROP PROCEDURE IF EXISTS [dbo].[select_features];
GO
/*
* Stored procedure for feature selection.
* Parameters:
* @training_set_table - table with training data
* @test_set_table - table with test data
* @selected_features_table - table to store selected features in
* @connectionString - connection string to connect to the database for use in the R script
*/
CREATE PROCEDURE [select_features] @training_set_table nvarchar(100), @test_set_table nvarchar(100), @selected_features_table nvarchar(100), @connectionString nvarchar(300)
AS
BEGIN
DECLARE @testing_set_query nvarchar(400), @del_cmd nvarchar(100), @ins_cmd nvarchar(max)
/* select features using MicrosotML */
SET @del_cmd = 'DELETE FROM ' + @selected_features_table
EXEC sp_executesql @del_cmd
SET @ins_cmd = 'INSERT INTO ' + @selected_features_table + ' (feature_name)
EXECUTE sp_execute_external_script @language = N''R'',
@script = N''
library(RevoScaleR)
library(MicrosoftML)
testing_set <- RxSqlServerData(table=test_set, connectionString = connection_string)
training_set <- RxSqlServerData(table=train_set, connectionString = connection_string)
features <- rxGetVarNames(testing_set)
variables_to_remove <- c("memberId", "loanId", "payment_date", "loan_open_date", "charge_off")
feature_names <- features[!(features %in% variables_to_remove)]
model_formula <- as.formula(paste(paste("charge_off~"), paste(feature_names, collapse = "+")))
selected_count <- 0
ml_trans <- list(categorical(vars = c("purpose", "residentialState", "branch", "homeOwnership", "yearsEmployment")),
selectFeatures(model_formula, mode = mutualInformation(numFeaturesToKeep = 100)))
candidate_model <- rxLogisticRegression(model_formula, data = training_set, mlTransforms = ml_trans)
predicted_score <- rxPredict(candidate_model, testing_set, extraVarsToWrite = c("charge_off"))
predicted_roc <- rxRoc("charge_off", grep("Probability", names(predicted_score), value = T), predicted_score)
auc <- rxAuc(predicted_roc)
features_to_remove <- c("(Bias)")
selected_features <- rxGetVarInfo(summary(candidate_model)$summary)
selected_feature_names <- names(selected_features)
selected_feature_filtered <- selected_feature_names[!(selected_feature_names %in% features_to_remove)]
selected_features_final <- data.frame(selected_feature_filtered)''
, @output_data_1_name = N''selected_features_final''
, @params = N''@connection_string nvarchar(300), @test_set nvarchar(100), @train_set nvarchar(100)''
, @connection_string = N''' + @connectionString + '''' +
', @train_set = N''' + @training_set_table + '''' +
', @test_set = N''' + @test_set_table + ''';'
EXEC sp_executesql @ins_cmd
END
GO