r-server-loan-chargeoff/SQLR/Loan_ChargeOff.ps1

389 строки
14 KiB
PowerShell

<#
.SYNOPSIS
Script to provide Loan ChargeOff predictions, using SQL Server R Services using MicrosoftML and RevoScaleR packages.
.DESCRIPTION
This script will show the E2E work flow of loan chargeoff prediction machine learning
templates with Microsoft SQL Server 2016 and SQL Servevr R Services.
.PARAMETER ServerName
SQL Server instance
.PARAMETER DBName
Name of the database
.PARAMETER sqlUsername
Database user name
.PARAMETER sqlPassword
Database password
.PARAMETER uninterrupted
Whether to run the whole workflow uninterrupted or in interactive mode (y/n)
.PARAMETER dataPath
Folder path with raw csv data files to import into SQL Server database
.PARAMETER datasize
size of the dataset to use (10k, 100k, 1m)
.EXAMPLE
.\Loan_ChargeOff.ps1 -ServerName $env:COMPUTERNAME -DBName LoanChargeOff -sqlUsername sqluser -sqlPassword "MyP#assword1" -uninterrupted y -dataPath ..\..\Data -dataSize "100k"
#>
[CmdletBinding()]
param(
# SQL server address
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[String]
$ServerName = "",
# SQL server database name
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[String]
$DBName = "",
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[String]
$sqlUsername ="",
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[String]
$sqlPassword ="",
[parameter(Mandatory=$false,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[ValidateSet("y", "n", "yes", "no", IgnoreCase = $true)]
[String]
$uninterrupted="y",
[parameter(Mandatory=$true,ParameterSetName = "LCR")]
[ValidateNotNullOrEmpty()]
[String]
$dataPath = "",
[parameter(Mandatory=$false,ParameterSetName = "LCR")]
[ValidateSet("10k", "100k", "1m")]
[String]
$dataSize = "10k"
)
$scriptPath = Get-Location
$filePath = $scriptPath.Path+ "\"
$dataFilePath = $dataPath + "\"
##########################################################################
# Script level variables
##########################################################################
$yesArray = "y", "yes"
$noArray = "n", "no"
$table_suffix = "_" + $dataSize
$trainingTable = "loan_chargeoff_train" + $table_suffix
$testTable = "loan_chargeoff_test" + $table_suffix
$evalScoreTable = "loan_chargeoff_eval_score" + $table_suffix
$scoreTable = "loan_chargeoff_score" + $table_suffix
$modelTable = "loan_chargeoff_models" + $table_suffix
$predictionTable = "loan_chargeoff_prediction" + $table_suffix
$selectedFeaturesTable = "selected_features" + $table_suffix
$modelNames = @{"logistic_reg" = "Logistic Regression model with rxLogisticRegression";"fast_linear" = "Linear binary classification model with rxFastLinear";"fast_trees" = "Fast Decision Trees model with rxFastTrees";"fast_forest" = "Random Forest with rxFastForest";"neural_net" = "Neural Network with rxNeuralNet"}
##########################################################################
# Function wrapper to invoke SQL command
##########################################################################
function ExecuteSQL
{
param(
[String]
$sqlscript,
[String]
$VariableArray=""
)
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $sqlUsername -Password "$sqlPassword" -InputFile $sqlscript -Variable $VariableArray -QueryTimeout 200000
}
##########################################################################
# Function wrapper to invoke SQL query
##########################################################################
function ExecuteSQLQuery
{
param(
[String]
$sqlquery
)
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $sqlUsername -Password "$sqlPassword" -Query $sqlquery -QueryTimeout 200000
}
##########################################################################
# Construct the SQL connection strings
##########################################################################
$connectionString = "Driver=SQL Server;Server=$ServerName;Database=$DBName;UID=$sqlUsername;PWD=$sqlPassword"
$ServerName2="localhost"
$connectionString2 = "Driver=SQL Server;Server=$ServerName2;Database=$DBName;UID=$sqlUsername;PWD=$sqlPassword"
##########################################################################
# Check if the SQL server or database exists
##########################################################################
$query = "SELECT database_id FROM sys.databases WHERE NAME = '$DBName'"
$DB_ID = Invoke-Sqlcmd -ServerInstance $ServerName -Username $sqlUsername -Password "$sqlPassword" -Query $query -ErrorAction SilentlyContinue
if (!$db_id)
{
Write-Host -ForegroundColor Red "Failed the test to connect to SQL server: $ServerName database: $DBName !"
Write-Host -ForegroundColor Red "Please make sure: `n`t 1. SQL Server: $ServerName exists;
`n`t 2. SQL database: $DBName exists;
`n`t 3. SQL user: $sqlUsername has the right credential for SQL server access."
exit
}
$query = "USE $DBName;"
Invoke-Sqlcmd -ServerInstance $ServerName -Username $sqlUsername -Password "$sqlPassword" -Query $query
##########################################################################
# Common functions
##########################################################################
function CreateTablesAndLoadData
{
# create training and test tables
Write-Host -ForeGroundColor 'green' ("Create SQL tables: member_info, loan_info, payments_info")
$script = $filePath + "step1_create_tables.sql"
ExecuteSQL $script "datasize = $dataSize"
Write-Host -ForeGroundColor 'green' ("Populate SQL tables: member_info, loan_info, payments_info")
$dataList = "member_info", "loan_info", "payments_info"
# upload csv files into SQL tables
foreach ($dataFile in $dataList)
{
$destination = $dataFilePath + $dataFile + $table_suffix + ".csv"
$error_file = $dataFilePath + $dataFile + $table_suffix + ".error"
Write-Host -ForeGroundColor 'magenta'(" Populate SQL table: {0}... from {1}" -f $dataFile, $destination)
$tableName = $DBName + ".dbo." + $dataFile + $table_suffix
$tableSchema = $dataFilePath + $dataFile + $table_suffix + ".xml"
bcp $tableName format nul -c -x -f $tableSchema -U $sqlUsername -S $ServerName -P "{$sqlPassword}" -t '|'
Write-Host -ForeGroundColor 'magenta'(" Loading {0} to SQL table..." -f $dataFile)
$bcpStart = Get-Date
bcp $tableName in $destination -t '|' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $sqlUsername -P "{$sqlPassword}" -e $error_file
if (!$?)
{
Write-Host -ForegroundColor Red "Error in BCP. Check any SQL error messages."
throw
}
$bcpEnd = Get-Date
$bcpTotal = ($bcpEnd - $bcpStart).ToString()
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table {1} Total time: {2}" -f $dataFile, $tableName, $bcpTotal)
}
}
function CreateViewsAndPersist
{
# create the views for features and label with training, test and scoring split
Write-Host -ForeGroundColor 'magenta'(" Creating features label view and persisting...")
$script = $filepath + "step2_features_label_view.sql"
$persistStart = Get-Date
ExecuteSQL $script "datasize=$dataSize"
$persistEnd = Get-Date
$persistTime = ($persistEnd - $persistStart).ToString()
Write-Host -ForeGroundColor 'magenta'(" Done creating features label view and persisting. Total time: {0}" -f $persistTime)
}
function TrainEvalModels
{
# create the stored procedure for training
$script = $filepath + "step3_train_test_model.sql"
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating training and eval stored proc...")
# execute the training
Write-Host -ForeGroundColor 'magenta'(" Starting training and evaluation of models...")
foreach ($modelName in $modelNames.GetEnumerator())
{
Write-Host -ForeGroundColor 'Cyan' (" Training $($modelName.Value)...")
$query = "EXEC train_model $trainingTable, $testTable, $evalScoreTable, $modelTable, $($modelName.Name), '$connectionString2'"
$startModelling = Get-Date
ExecuteSQLQuery $query
$endModelling = Get-Date
$totalModellingTime = ($endModelling-$startModelling).ToString()
Write-Host -ForeGroundColor 'Cyan' (" Finished Training $($modelName.Value) Total time: {0}..." -f $totalModellingTime)
}
Write-Host -ForeGroundColor 'Cyan' (" Done with training and evaluation of models. Evaluation stats stored in $modelTable...")
}
function ScoreData
{
# create the stored procedure for recommendations
$script = $filepath + "step4_chargeoff_batch_prediction.sql"
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating batch scoring stored proc...")
#score on the data
$best_model = ExecuteSQLQuery "select top 1 model_name from $modelTable where f1score in (select max(f1score) from $modelTable)"
Write-Host -ForeGroundColor 'Cyan' ("Scoring based on best performing model '$($modelNames.Get_Item($best_model.model_name))' score table = $scoreTable, prediction table = $predictionTable...")
$scoring_query = "EXEC predict_chargeoff $scoreTable, $predictionTable, $modelTable"
$scoringStart = Get-Date
ExecuteSQLQuery $scoring_query
$scoringEnd = Get-Date
$scoringTotal = ($scoringEnd - $scoringStart).ToString()
Write-Host -ForeGroundColor 'Cyan' ("Done batch scoring. Total time: {0}" -f $scoringTotal)
}
function OnDemandPrediction
{
# create the stored procedure for recommendations
$script = $filepath + "step4a_chargeoff_ondemand_prediction.sql"
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating on demand scoring stored proc [predict_chargeoff_ondemand]...")
}
function FeatureSelection
{
# create the stored procedure for feature engineering
$script = $filepath + "step2a_optional_feature_selection.sql"
ExecuteSQL $script "datasize=$dataSize"
# execute the feature engineering
Write-Host -ForeGroundColor 'Cyan' (" selecting features using MicrosoftML selectFeatures mlTransform with Logistic Regression...")
$query = "EXEC select_features $trainingTable, $testTable, $selectedFeaturesTable, '$connectionString2'"
ExecuteSQLQuery $query
}
##########################################################################
# Running without interruption
##########################################################################
$startTime= Get-Date
Write-Host "Start time is:" $startTime
if ($uninterrupted -iIn $yesArray)
{
try
{
# step 1. create tables and load data
CreateTablesAndLoadData
# step 2. create the views for features and label with training, test and scoring split
CreateViewsAndPersist
# step 3. Train and evaluate models
TrainEvalModels
# step 4. Data scoring
ScoreData
# step 4a. Create on demand prediction stored proc
OnDemandPrediction
}
catch
{
Write-Host -ForegroundColor Yellow "Exception executing Data Science pipeline..."
Write-Host -ForegroundColor Red $Error[0].Exception
throw
}
Write-Host -foregroundcolor 'green'("Loan ChargeOff Workflow Finished Successfully!")
}
if ($uninterrupted -iIn $noArray)
{
##########################################################################
# Create input tables and populate with data from csv files.
##########################################################################
Write-Host -foregroundcolor 'green' ("Step 1: Create and populate tables in Database" -f $dbname)
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
try
{
CreateTablesAndLoadData
}
catch
{
Write-Host -ForegroundColor DarkYellow "Exception in populating database tables:"
Write-Host -ForegroundColor Red $Error[0].Exception
throw
}
}
##########################################################################
# Create and execute the scripts for data processing
##########################################################################
Write-Host -foregroundcolor 'green' ("Step 2: Data Processing/Create feature and label views and tables")
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
CreateViewsAndPersist
}
##########################################################################
# Create and execute the stored procedure for feature selection (optional)
##########################################################################
Write-Host -foregroundcolor 'green' ("Step 2a: Feature Engineering (for demo purpose only, training step does it's own feature selection)")
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
FeatureSelection
}
##########################################################################
# Create and execute the stored procedure for Training and evaluation
##########################################################################
Write-Host -foregroundcolor 'green' ("Step 3: Models Training and Evaluation")
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
TrainEvalModels
}
##########################################################################
# Create and execute the stored procedure for charge_off predictions
##########################################################################
Write-Host -foregroundcolor 'green' ("Step 4: ChargeOff predictions")
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
ScoreData
}
Write-Host -foregroundcolor 'green' ("Step 4a: Create on demand ChargeOff prediction stored proc")
$ans = Read-Host 'Continue [y|Y], Exit [e|E], Skip [s|S]?'
if ($ans -eq 'E' -or $ans -eq 'e')
{
return
}
if ($ans -eq 'y' -or $ans -eq 'Y')
{
OnDemandPrediction
}
Write-Host -foregroundcolor 'green'("Loan Chargeoff Prediction Workflow Finished Successfully!")
}
$endTime =Get-Date
$totalTime = ($endTime-$startTime).ToString()
Write-Host "Finished running Loan_ChargeOff.ps1 at:" $endTime
Write-Host "Total time used: " -foregroundcolor 'green' $totalTime.ToString()