389 строки
14 KiB
PowerShell
389 строки
14 KiB
PowerShell
<#
|
|
.SYNOPSIS
|
|
Script to provide Loan ChargeOff predictions, using SQL Server R Services using MicrosoftML and RevoScaleR packages.
|
|
|
|
.DESCRIPTION
|
|
This script will show the E2E work flow of loan chargeoff prediction machine learning
|
|
templates with Microsoft SQL Server 2016 and SQL Servevr R Services.
|
|
|
|
.PARAMETER ServerName
|
|
SQL Server instance
|
|
|
|
.PARAMETER DBName
|
|
Name of the database
|
|
|
|
.PARAMETER sqlUsername
|
|
Database user name
|
|
|
|
.PARAMETER sqlPassword
|
|
Database password
|
|
|
|
.PARAMETER uninterrupted
|
|
Whether to run the whole workflow uninterrupted or in interactive mode (y/n)
|
|
|
|
.PARAMETER dataPath
|
|
Folder path with raw csv data files to import into SQL Server database
|
|
|
|
.PARAMETER datasize
|
|
size of the dataset to use (10k, 100k, 1m)
|
|
|
|
.EXAMPLE
|
|
.\Loan_ChargeOff.ps1 -ServerName $env:COMPUTERNAME -DBName LoanChargeOff -sqlUsername sqluser -sqlPassword "MyP#assword1" -uninterrupted y -dataPath ..\..\Data -dataSize "100k"
|
|
#>
|
|
[CmdletBinding()]
|
|
param(
|
|
# SQL server address
|
|
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[String]
|
|
$ServerName = "",
|
|
|
|
# SQL server database name
|
|
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[String]
|
|
$DBName = "",
|
|
|
|
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[String]
|
|
$sqlUsername ="",
|
|
|
|
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[String]
|
|
$sqlPassword ="",
|
|
|
|
[parameter(Mandatory=$false,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[ValidateSet("y", "n", "yes", "no", IgnoreCase = $true)]
|
|
[String]
|
|
$uninterrupted="y",
|
|
|
|
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
|
|
[ValidateNotNullOrEmpty()]
|
|
[String]
|
|
$dataPath = "",
|
|
|
|
[parameter(Mandatory=$false,ParameterSetName = "LCR")]
|
|
[ValidateSet("10k", "100k", "1m")]
|
|
[String]
|
|
$dataSize = "10k"
|
|
)
|
|
|
|
$scriptPath = Get-Location
|
|
$filePath = $scriptPath.Path+ "\"
|
|
$dataFilePath = $dataPath + "\"
|
|
|
|
##########################################################################
|
|
# Script level variables
|
|
##########################################################################
|
|
$yesArray = "y", "yes"
|
|
$noArray = "n", "no"
|
|
$table_suffix = "_" + $dataSize
|
|
|
|
$trainingTable = "loan_chargeoff_train" + $table_suffix
|
|
$testTable = "loan_chargeoff_test" + $table_suffix
|
|
$evalScoreTable = "loan_chargeoff_eval_score" + $table_suffix
|
|
$scoreTable = "loan_chargeoff_score" + $table_suffix
|
|
$modelTable = "loan_chargeoff_models" + $table_suffix
|
|
$predictionTable = "loan_chargeoff_prediction" + $table_suffix
|
|
$selectedFeaturesTable = "selected_features" + $table_suffix
|
|
$modelNames = @{"logistic_reg" = "Logistic Regression model with rxLogisticRegression";"fast_linear" = "Linear binary classification model with rxFastLinear";"fast_trees" = "Fast Decision Trees model with rxFastTrees";"fast_forest" = "Random Forest with rxFastForest";"neural_net" = "Neural Network with rxNeuralNet"}
|
|
|
|
##########################################################################
|
|
# Function wrapper to invoke SQL command
|
|
##########################################################################
|
|
function ExecuteSQL
|
|
{
|
|
param(
|
|
[String]
|
|
$sqlscript,
|
|
[String]
|
|
$VariableArray=""
|
|
)
|
|
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $sqlUsername -Password "$sqlPassword" -InputFile $sqlscript -Variable $VariableArray -QueryTimeout 200000
|
|
}
|
|
##########################################################################
|
|
# Function wrapper to invoke SQL query
|
|
##########################################################################
|
|
function ExecuteSQLQuery
|
|
{
|
|
param(
|
|
[String]
|
|
$sqlquery
|
|
)
|
|
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $sqlUsername -Password "$sqlPassword" -Query $sqlquery -QueryTimeout 200000
|
|
}
|
|
|
|
##########################################################################
|
|
# Construct the SQL connection strings
|
|
##########################################################################
|
|
$connectionString = "Driver=SQL Server;Server=$ServerName;Database=$DBName;UID=$sqlUsername;PWD=$sqlPassword"
|
|
$ServerName2="localhost"
|
|
$connectionString2 = "Driver=SQL Server;Server=$ServerName2;Database=$DBName;UID=$sqlUsername;PWD=$sqlPassword"
|
|
|
|
##########################################################################
|
|
# Check if the SQL server or database exists
|
|
##########################################################################
|
|
$query = "SELECT database_id FROM sys.databases WHERE NAME = '$DBName'"
|
|
$DB_ID = Invoke-Sqlcmd -ServerInstance $ServerName -Username $sqlUsername -Password "$sqlPassword" -Query $query -ErrorAction SilentlyContinue
|
|
if (!$db_id)
|
|
{
|
|
Write-Host -ForegroundColor Red "Failed the test to connect to SQL server: $ServerName database: $DBName !"
|
|
Write-Host -ForegroundColor Red "Please make sure: `n`t 1. SQL Server: $ServerName exists;
|
|
`n`t 2. SQL database: $DBName exists;
|
|
`n`t 3. SQL user: $sqlUsername has the right credential for SQL server access."
|
|
exit
|
|
}
|
|
|
|
$query = "USE $DBName;"
|
|
Invoke-Sqlcmd -ServerInstance $ServerName -Username $sqlUsername -Password "$sqlPassword" -Query $query
|
|
|
|
##########################################################################
|
|
# Common functions
|
|
##########################################################################
|
|
function CreateTablesAndLoadData
|
|
{
|
|
# create training and test tables
|
|
Write-Host -ForeGroundColor 'green' ("Create SQL tables: member_info, loan_info, payments_info")
|
|
$script = $filePath + "step1_create_tables.sql"
|
|
ExecuteSQL $script "datasize = $dataSize"
|
|
|
|
Write-Host -ForeGroundColor 'green' ("Populate SQL tables: member_info, loan_info, payments_info")
|
|
$dataList = "member_info", "loan_info", "payments_info"
|
|
|
|
# upload csv files into SQL tables
|
|
foreach ($dataFile in $dataList)
|
|
{
|
|
$destination = $dataFilePath + $dataFile + $table_suffix + ".csv"
|
|
$error_file = $dataFilePath + $dataFile + $table_suffix + ".error"
|
|
Write-Host -ForeGroundColor 'magenta'(" Populate SQL table: {0}... from {1}" -f $dataFile, $destination)
|
|
$tableName = $DBName + ".dbo." + $dataFile + $table_suffix
|
|
$tableSchema = $dataFilePath + $dataFile + $table_suffix + ".xml"
|
|
bcp $tableName format nul -c -x -f $tableSchema -U $sqlUsername -S $ServerName -P "{$sqlPassword}" -t '|'
|
|
Write-Host -ForeGroundColor 'magenta'(" Loading {0} to SQL table..." -f $dataFile)
|
|
$bcpStart = Get-Date
|
|
bcp $tableName in $destination -t '|' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $sqlUsername -P "{$sqlPassword}" -e $error_file
|
|
if (!$?)
|
|
{
|
|
Write-Host -ForegroundColor Red "Error in BCP. Check any SQL error messages."
|
|
throw
|
|
}
|
|
$bcpEnd = Get-Date
|
|
$bcpTotal = ($bcpEnd - $bcpStart).ToString()
|
|
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table {1} Total time: {2}" -f $dataFile, $tableName, $bcpTotal)
|
|
}
|
|
}
|
|
|
|
function CreateViewsAndPersist
|
|
{
|
|
# create the views for features and label with training, test and scoring split
|
|
Write-Host -ForeGroundColor 'magenta'(" Creating features label view and persisting...")
|
|
$script = $filepath + "step2_features_label_view.sql"
|
|
$persistStart = Get-Date
|
|
ExecuteSQL $script "datasize=$dataSize"
|
|
$persistEnd = Get-Date
|
|
$persistTime = ($persistEnd - $persistStart).ToString()
|
|
Write-Host -ForeGroundColor 'magenta'(" Done creating features label view and persisting. Total time: {0}" -f $persistTime)
|
|
}
|
|
|
|
function TrainEvalModels
|
|
{
|
|
# create the stored procedure for training
|
|
$script = $filepath + "step3_train_test_model.sql"
|
|
ExecuteSQL $script "datasize=$dataSize"
|
|
Write-Host -ForeGroundColor 'magenta'(" Done creating training and eval stored proc...")
|
|
|
|
# execute the training
|
|
Write-Host -ForeGroundColor 'magenta'(" Starting training and evaluation of models...")
|
|
foreach ($modelName in $modelNames.GetEnumerator())
|
|
{
|
|
Write-Host -ForeGroundColor 'Cyan' (" Training $($modelName.Value)...")
|
|
$query = "EXEC train_model $trainingTable, $testTable, $evalScoreTable, $modelTable, $($modelName.Name), '$connectionString2'"
|
|
$startModelling = Get-Date
|
|
ExecuteSQLQuery $query
|
|
$endModelling = Get-Date
|
|
$totalModellingTime = ($endModelling-$startModelling).ToString()
|
|
Write-Host -ForeGroundColor 'Cyan' (" Finished Training $($modelName.Value) Total time: {0}..." -f $totalModellingTime)
|
|
}
|
|
|
|
Write-Host -ForeGroundColor 'Cyan' (" Done with training and evaluation of models. Evaluation stats stored in $modelTable...")
|
|
}
|
|
|
|
function ScoreData
|
|
{
|
|
# create the stored procedure for recommendations
|
|
$script = $filepath + "step4_chargeoff_batch_prediction.sql"
|
|
ExecuteSQL $script "datasize=$dataSize"
|
|
Write-Host -ForeGroundColor 'magenta'(" Done creating batch scoring stored proc...")
|
|
|
|
#score on the data
|
|
$best_model = ExecuteSQLQuery "select top 1 model_name from $modelTable where f1score in (select max(f1score) from $modelTable)"
|
|
Write-Host -ForeGroundColor 'Cyan' ("Scoring based on best performing model '$($modelNames.Get_Item($best_model.model_name))' score table = $scoreTable, prediction table = $predictionTable...")
|
|
$scoring_query = "EXEC predict_chargeoff $scoreTable, $predictionTable, $modelTable"
|
|
$scoringStart = Get-Date
|
|
ExecuteSQLQuery $scoring_query
|
|
$scoringEnd = Get-Date
|
|
$scoringTotal = ($scoringEnd - $scoringStart).ToString()
|
|
Write-Host -ForeGroundColor 'Cyan' ("Done batch scoring. Total time: {0}" -f $scoringTotal)
|
|
}
|
|
|
|
function OnDemandPrediction
|
|
{
|
|
# create the stored procedure for recommendations
|
|
$script = $filepath + "step4a_chargeoff_ondemand_prediction.sql"
|
|
ExecuteSQL $script "datasize=$dataSize"
|
|
Write-Host -ForeGroundColor 'magenta'(" Done creating on demand scoring stored proc [predict_chargeoff_ondemand]...")
|
|
}
|
|
|
|
function FeatureSelection
|
|
{
|
|
# create the stored procedure for feature engineering
|
|
$script = $filepath + "step2a_optional_feature_selection.sql"
|
|
ExecuteSQL $script "datasize=$dataSize"
|
|
|
|
# execute the feature engineering
|
|
Write-Host -ForeGroundColor 'Cyan' (" selecting features using MicrosoftML selectFeatures mlTransform with Logistic Regression...")
|
|
$query = "EXEC select_features $trainingTable, $testTable, $selectedFeaturesTable, '$connectionString2'"
|
|
ExecuteSQLQuery $query
|
|
}
|
|
##########################################################################
|
|
# Running without interruption
|
|
##########################################################################
|
|
$startTime= Get-Date
|
|
Write-Host "Start time is:" $startTime
|
|
|
|
if ($uninterrupted -iIn $yesArray)
|
|
{
|
|
try
|
|
{
|
|
# step 1. create tables and load data
|
|
CreateTablesAndLoadData
|
|
|
|
# step 2. create the views for features and label with training, test and scoring split
|
|
CreateViewsAndPersist
|
|
|
|
# step 3. Train and evaluate models
|
|
TrainEvalModels
|
|
|
|
# step 4. Data scoring
|
|
ScoreData
|
|
|
|
# step 4a. Create on demand prediction stored proc
|
|
OnDemandPrediction
|
|
|
|
}
|
|
catch
|
|
{
|
|
Write-Host -ForegroundColor Yellow "Exception executing Data Science pipeline..."
|
|
Write-Host -ForegroundColor Red $Error[0].Exception
|
|
throw
|
|
}
|
|
|
|
Write-Host -foregroundcolor 'green'("Loan ChargeOff Workflow Finished Successfully!")
|
|
}
|
|
|
|
if ($uninterrupted -iIn $noArray)
|
|
{
|
|
|
|
##########################################################################
|
|
# Create input tables and populate with data from csv files.
|
|
##########################################################################
|
|
Write-Host -foregroundcolor 'green' ("Step 1: Create and populate tables in Database" -f $dbname)
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
try
|
|
{
|
|
CreateTablesAndLoadData
|
|
}
|
|
catch
|
|
{
|
|
Write-Host -ForegroundColor DarkYellow "Exception in populating database tables:"
|
|
Write-Host -ForegroundColor Red $Error[0].Exception
|
|
throw
|
|
}
|
|
}
|
|
|
|
##########################################################################
|
|
# Create and execute the scripts for data processing
|
|
##########################################################################
|
|
Write-Host -foregroundcolor 'green' ("Step 2: Data Processing/Create feature and label views and tables")
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
CreateViewsAndPersist
|
|
}
|
|
|
|
##########################################################################
|
|
# Create and execute the stored procedure for feature selection (optional)
|
|
##########################################################################
|
|
Write-Host -foregroundcolor 'green' ("Step 2a: Feature Engineering (for demo purpose only, training step does it's own feature selection)")
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
FeatureSelection
|
|
}
|
|
|
|
##########################################################################
|
|
# Create and execute the stored procedure for Training and evaluation
|
|
##########################################################################
|
|
|
|
Write-Host -foregroundcolor 'green' ("Step 3: Models Training and Evaluation")
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
TrainEvalModels
|
|
}
|
|
|
|
##########################################################################
|
|
# Create and execute the stored procedure for charge_off predictions
|
|
##########################################################################
|
|
|
|
Write-Host -foregroundcolor 'green' ("Step 4: ChargeOff predictions")
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
ScoreData
|
|
}
|
|
|
|
Write-Host -foregroundcolor 'green' ("Step 4a: Create on demand ChargeOff prediction stored proc")
|
|
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
|
|
if ($ans -eq 'E' -or $ans -eq 'e')
|
|
{
|
|
return
|
|
}
|
|
if ($ans -eq 'y' -or $ans -eq 'Y')
|
|
{
|
|
OnDemandPrediction
|
|
}
|
|
|
|
|
|
Write-Host -foregroundcolor 'green'("Loan Chargeoff Prediction Workflow Finished Successfully!")
|
|
}
|
|
|
|
$endTime =Get-Date
|
|
$totalTime = ($endTime-$startTime).ToString()
|
|
Write-Host "Finished running Loan_ChargeOff.ps1 at:" $endTime
|
|
Write-Host "Total time used: " -foregroundcolor 'green' $totalTime.ToString() |