This commit is contained in:
Qiong Wei 2017-06-16 23:26:54 -07:00
Родитель 2738e68eb1 59fbcc7939
Коммит b49f7e121d
30 изменённых файлов: 434 добавлений и 625 удалений

Просмотреть файл

@ -39,9 +39,9 @@ loan_prediction <- function(LocalWorkDir,
importedModel <- function(bestModelName) {
switch(as.character(bestModelName),
forest = {import_model <- model_obj$forest_model},
linear = {import_model <- model_obj$linear_model},
logistic = {import_model <- model_obj$logistic_model},
tree = {import_model <- model_obj$tree_model},
linear = {import_model <- model_obj$linear_model},
NN = {import_model <- model_obj$NN_model})
}
@ -81,4 +81,4 @@ loan_prediction <- function(LocalWorkDir,
print("The prediction results are also stored in hive table loanchargeoff_predictions")
return(finalResult)
}
}

Просмотреть файл

@ -1 +0,0 @@
Code for Tiger Team Solution Templates

Просмотреть файл

@ -1,80 +0,0 @@
##############################################################################################
# Script to invoke the LoanChargeOff data science workflow, by default with a smaller dataset
# of 10,000 loans (see the datasize parameter).
# Unless createuser is $false, it also creates a SQL Server user and stores the encrypted
# password in 'ExportedSqlPassword.txt'. Users can retrieve the password from the file and
# decrypt it using the ConvertTo-SecureString cmdlet in PowerShell.
#
# Parameters:
# dbuser - (Optional) username for database LoanChargeOff
# dbpass - (Optional) database password
# createuser - (Optional) whether to create a database user (defaults to $true)
# datadir - directory where raw csv data has been downloaded
# scriptdir - directory to change into before the workflow scripts are invoked
# datasize - size of the data to train on (10k, 100k, 1m)
##############################################################################################
Param([string]$datadir, [string]$scriptdir, [string]$dbuser, [string]$dbpass, [bool]$createuser = $true, [ValidateSet("10k", "100k", "1m")][string]$datasize="10k")
# The relative paths used further down (.\createuser.sql, .\Loan_ChargeOff.ps1) are resolved
# against the current location, so switch to the script directory first. When no directory was
# supplied the switch is skipped: Set-Location with an empty path would only raise an error.
if ($scriptdir)
{
    Set-Location $scriptdir
}
# Builds a temporary SQL Server password by concatenating $length randomly drawn
# entries of $sourcedata.
#   length     - number of entries to draw (default 10)
#   sourcedata - candidate strings, one of which is appended per draw
Function Get-TempPassword()
{
    Param
    (
        [int]$length=10,
        [string[]]$sourcedata
    )
    # One random entry is appended per pass; nothing is appended when $length is below 1.
    $drawn = 0
    while ($drawn -lt $length)
    {
        $generated += ($sourcedata | Get-Random)
        $drawn++
    }
    return $generated
}
# Defaults; the user name is replaced by -dbuser below when one was supplied.
$dbpassword = ""
$dbusername = "rdemo"
$passwordFile = "ExportedSqlPassword.txt"
# Alphabet for generated passwords: the ASCII characters with codes 33 through 126.
$passwordSource = 33..126 | ForEach-Object { [char][byte]$_ }
if ($dbuser)
{
    $dbusername = $dbuser
}
if (!$createuser)
{
    # No user is created: use the -dbpass value when given, otherwise fall back to the
    # encrypted password stored in $passwordFile.
    if (!$dbpass)
    {
        if (Test-Path $passwordFile)
        {
            $secureTxtFromFile = Get-Content $passwordFile
            $securePasswordObj = $secureTxtFromFile | ConvertTo-SecureString
            # Get back the original unencrypted password. The BSTR is an unmanaged copy of the
            # plain text, so it is zeroed and released as soon as the string has been read.
            $PasswordBSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($securePasswordObj)
            try
            {
                $dbpassword = [System.Runtime.InteropServices.Marshal]::PtrToStringBSTR($PasswordBSTR)
            }
            finally
            {
                [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($PasswordBSTR)
            }
        }
        else
        {
            Write-Host -ForegroundColor DarkYellow "Either ExportedSqlPassword.txt must exist with encrypted database password or must provide password using dbpass parameter."
            throw
        }
    }
    else
    {
        $dbpassword = $dbpass
    }
}
else
{
    Write-Host -ForegroundColor Cyan "Creating database user"
    $dbpassword = Get-TempPassword -length 15 -sourcedata $passwordSource
    # Persist the password in encrypted form so that later runs (and the user) can recover it.
    $securePassword = $dbpassword | ConvertTo-SecureString -AsPlainText -Force
    $secureTxt = $securePassword | ConvertFrom-SecureString
    Set-Content $passwordFile $secureTxt
    # NOTE(review): the generated password can contain quote characters; confirm that
    # createuser.sql and the sqlcmd variable substitution handle them.
    sqlcmd -S $env:COMPUTERNAME -v username="$dbusername" -v password="$dbpassword" -i .\createuser.sql
}
# Run the workflow against the local SQL Server instance with the resolved credentials.
.\Loan_ChargeOff.ps1 -ServerName $env:COMPUTERNAME -DBName LoanChargeOff -username $dbusername -password $dbpassword -uninterrupted y -dataPath $datadir -dataSize $datasize

Просмотреть файл

@ -1,77 +0,0 @@
##############################################################################################
# Script to invoke the LoanChargeOff data science workflow with a larger dataset of 1,000,000
# loans.
# It can also optionally create a SQL Server user and store the encrypted password in
# 'ExportedSqlPassword.txt'. Users can retrieve the password from the file and decrypt it using
# the ConvertTo-SecureString cmdlet in PowerShell.
#
# Parameters:
# dbuser - (Optional) username for database LoanChargeOff
# dbpass - (Optional) database password
# createuser - (Optional) whether to create a database user (defaults to $true)
# datadir - data directory, passed to Loan_ChargeOff.ps1 as -dataPath
##############################################################################################
# Script parameters; createuser defaults to $true, so a database user is created unless it is
# explicitly set to $false.
Param([string]$dbuser, [string]$dbpass, [bool]$createuser = $true, [string]$datadir)
# Builds a temporary SQL Server password by concatenating $length randomly drawn
# entries of $sourcedata.
#   length     - number of entries to draw (default 10)
#   sourcedata - candidate strings, one of which is appended per draw
Function Get-TempPassword()
{
    Param
    (
        [int]$length=10,
        [string[]]$sourcedata
    )
    # One random entry is appended per pass; nothing is appended when $length is below 1.
    $drawn = 0
    while ($drawn -lt $length)
    {
        $generated += ($sourcedata | Get-Random)
        $drawn++
    }
    return $generated
}
# Defaults; the user name is replaced by -dbuser below when one was supplied.
$dbpassword = ""
$dbusername = "rdemo"
$passwordFile = "ExportedSqlPassword.txt"
# Alphabet for generated passwords: the ASCII characters with codes 33 through 126.
$passwordSource = 33..126 | ForEach-Object { [char][byte]$_ }
if ($dbuser)
{
    $dbusername = $dbuser
}
if (!$createuser)
{
    # No user is created: use the -dbpass value when given, otherwise fall back to the
    # encrypted password stored in $passwordFile.
    if (!$dbpass)
    {
        if (Test-Path $passwordFile)
        {
            $secureTxtFromFile = Get-Content $passwordFile
            $securePasswordObj = $secureTxtFromFile | ConvertTo-SecureString
            # Get back the original unencrypted password. The BSTR is an unmanaged copy of the
            # plain text, so it is zeroed and released as soon as the string has been read.
            $PasswordBSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($securePasswordObj)
            try
            {
                $dbpassword = [System.Runtime.InteropServices.Marshal]::PtrToStringBSTR($PasswordBSTR)
            }
            finally
            {
                [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($PasswordBSTR)
            }
        }
        else
        {
            Write-Host -ForegroundColor DarkYellow "Either ExportedSqlPassword.txt must exist with encrypted database password or must provide password using dbpass parameter."
            throw
        }
    }
    else
    {
        $dbpassword = $dbpass
    }
}
else
{
    Write-Host -ForegroundColor Cyan "Creating database user"
    $dbpassword = Get-TempPassword -length 15 -sourcedata $passwordSource
    # Persist the password in encrypted form so that later runs (and the user) can recover it.
    $securePassword = $dbpassword | ConvertTo-SecureString -AsPlainText -Force
    $secureTxt = $securePassword | ConvertFrom-SecureString
    Set-Content $passwordFile $secureTxt
    # NOTE(review): the generated password can contain quote characters; confirm that
    # createuser.sql and the sqlcmd variable substitution handle them.
    sqlcmd -S $env:COMPUTERNAME -v username="$dbusername" -v password="$dbpassword" -i .\createuser.sql
}
# Run the workflow against the local SQL Server instance with the resolved credentials.
# NOTE(review): the sibling launcher passes 10k/100k/1m as -dataSize; confirm that
# Loan_ChargeOff.ps1 still accepts the value L.
.\Loan_ChargeOff.ps1 -ServerName $env:COMPUTERNAME -DBName LoanChargeOff -username $dbusername -password $dbpassword -uninterrupted y -dataPath $datadir -dataSize L

Просмотреть файл

@ -1,23 +0,0 @@
param( [string]$helpfile)
# Copies the given help file to the current user's desktop and to the user's
# Start Menu startup folder.
$desktop = [Environment]::GetFolderPath("Desktop")
$desktop = $desktop + '\'
# Locate the startup folder below the user's Start Menu.
$startmenu = [Environment]::GetFolderPath("StartMenu")
$startupfolder = $startmenu + '\Programs\Startup\'
# The user startup folder is only created after the first login, so create it here when it is
# missing. (An alternative would be to use the all-users startup folder.) The existence check
# keeps mkdir from raising an error when the folder is already there.
if (!(Test-Path $startupfolder))
{
    mkdir $startupfolder
}
# Log the source and target locations before copying.
$down = $helpfile
Write-Host $down
Write-Host $startmenu
Get-ChildItem $startmenu
Write-Host $startupfolder
Get-ChildItem $startupfolder
Copy-Item -Verbose $down $startupfolder
Copy-Item -Verbose $down $desktop

Просмотреть файл

@ -1,103 +0,0 @@
SET ansi_nulls on
GO
SET quoted_identifier on
GO
/* (Re)create the member_info table for the large (1m) data set. */
DROP TABLE IF EXISTS [member_info_1m];
CREATE TABLE [member_info_1m]
(
    [memberId]                 INT,
    [residentialState]         NVARCHAR(4),
    [annualIncome]             REAL,
    [yearsEmployment]          NVARCHAR(11),
    [homeOwnership]            NVARCHAR(10),
    [incomeVerified]           BIT,
    [creditScore]              INT,
    [dtiRatio]                 REAL,
    [revolvingBalance]         REAL,
    [revolvingUtilizationRate] REAL,
    [numDelinquency2Years]     INT,
    [numDerogatoryRec]         INT,
    [numInquiries6Mon]         INT,
    [lengthCreditHistory]      INT,
    [numOpenCreditLines]       INT,
    [numTotalCreditLines]      INT,
    [numChargeoff1year]        INT
);
/* Store the table as a clustered columnstore. */
CREATE CLUSTERED COLUMNSTORE INDEX [member_info_1m_cci]
    ON [member_info_1m]
    WITH (DROP_EXISTING = OFF);
GO
/* (Re)create the loan_info table for the large (1m) data set. */
DROP TABLE IF EXISTS [loan_info_1m];
CREATE TABLE [loan_info_1m]
(
    [loanId]             INT,
    [loan_open_date]     DATETIME,
    [memberId]           INT,
    [loanAmount]         REAL,
    [interestRate]       REAL,
    [grade]              INT,
    [term]               INT,
    [installment]        REAL,
    [isJointApplication] BIT,
    [purpose]            NVARCHAR(255)
);
/* Store the table as a clustered columnstore. */
CREATE CLUSTERED COLUMNSTORE INDEX [loan_info_1m_cci]
    ON [loan_info_1m]
    WITH (DROP_EXISTING = OFF);
GO
/* (Re)create the payments_info table for the large (1m) data set. */
DROP TABLE IF EXISTS [payments_info_1m];
CREATE TABLE [payments_info_1m]
(
    [loanId]         INT,
    [payment_date]   DATE,
    [payment]        REAL,
    [past_due]       REAL,
    [remain_balance] REAL,
    [closed]         BIT,
    [charged_off]    BIT
);
/* Store the table as a clustered columnstore. */
CREATE CLUSTERED COLUMNSTORE INDEX [payments_info_1m_cci]
    ON [payments_info_1m]
    WITH (DROP_EXISTING = OFF);
GO
/* (Re)create the table holding one serialized model per model name together with its
   evaluation metrics and the time the row was written. */
DROP TABLE IF EXISTS [loan_chargeoff_models_1m];
CREATE TABLE [loan_chargeoff_models_1m]
(
    [model_name]  VARCHAR(30)    NOT NULL DEFAULT ('default model') PRIMARY KEY,
    [model]       VARBINARY(MAX) NOT NULL,
    [auc]         REAL,
    [accuracy]    REAL,
    [precision]   REAL,
    [recall]      REAL,
    [f1score]     REAL,
    [training_ts] DATETIME       DEFAULT (GETDATE())
);
GO
/* (Re)create the list of selected feature names; feature_id is an auto-incrementing identity. */
DROP TABLE IF EXISTS [selected_features_1m];
CREATE TABLE [selected_features_1m]
(
    [feature_id]   INT           IDENTITY(1,1) NOT NULL,
    [feature_name] NVARCHAR(500) NOT NULL
);
GO
/* (Re)create the table that receives prediction output; prediction_date defaults to the
   date on which the row is inserted. */
DROP TABLE IF EXISTS [loan_chargeoff_prediction_1m];
CREATE TABLE [loan_chargeoff_prediction_1m]
(
    [memberId]        INT,
    [loanId]          INT,
    [payment_date]    DATE,
    [prediction_date] DATE DEFAULT (GETDATE()),
    [PredictedLabel]  NVARCHAR(255),
    [Score.1]         FLOAT,
    [Probability.1]   FLOAT
);
GO

Просмотреть файл

@ -1,137 +0,0 @@
-- Training view over the underlying tables with the features and label required.
-- Payment records are joined to their loan and member attributes.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
-- Rows kept: charge_off = 1 with payment_date from 2016-09-12 through 2016-12-12, plus all
-- rows dated 2017-01-12.
drop view if exists vw_loan_chargeoff_train_10k
go
create view vw_loan_chargeoff_train_10k
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_10k p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_10k p1
) AS t
inner join loan_info_10k l ON t.loanId = l.loanId
inner join member_info_10k m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and ((t.payment_date between '2016-09-12' and '2016-12-12' and t.charge_off = 1) or (t.payment_date = '2017-01-12'));
go
-- Test view: same columns as the training view, restricted to payment records dated 2017-02-12.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
drop view if exists vw_loan_chargeoff_test_10k
go
create view vw_loan_chargeoff_test_10k
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_10k p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_10k p1
) AS t
inner join loan_info_10k l ON t.loanId = l.loanId
inner join member_info_10k m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and t.payment_date = '2017-02-12';
go
-- Scoring view: same columns as the training view, restricted to payment records dated after
-- 2017-02-12.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
drop view if exists vw_loan_chargeoff_score_10k
go
create view vw_loan_chargeoff_score_10k
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_10k p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_10k p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_10k p1
) AS t
inner join loan_info_10k l ON t.loanId = l.loanId
inner join member_info_10k m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and t.payment_date > '2017-02-12';
go
-- Persist the views as tables so that large data sets can be queried faster.
-- Training data: rebuild the table from the training view, then store it as a
-- clustered columnstore.
DROP TABLE IF EXISTS [loan_chargeoff_train_10k];
GO
SELECT *
INTO [loan_chargeoff_train_10k]
FROM [vw_loan_chargeoff_train_10k];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_train_10k]
    ON [loan_chargeoff_train_10k];
GO
-- Test data: rebuild the table from the test view, then store it as a clustered columnstore.
DROP TABLE IF EXISTS [loan_chargeoff_test_10k];
GO
SELECT *
INTO [loan_chargeoff_test_10k]
FROM [vw_loan_chargeoff_test_10k];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_test_10k]
    ON [loan_chargeoff_test_10k];
GO
-- Scoring data: rebuild the table from the scoring view, then store it as a clustered columnstore.
DROP TABLE IF EXISTS [loan_chargeoff_score_10k];
GO
SELECT *
INTO [loan_chargeoff_score_10k]
FROM [vw_loan_chargeoff_score_10k];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_score_10k]
    ON [loan_chargeoff_score_10k];
GO

Просмотреть файл

@ -1,140 +0,0 @@
-- Training view (large data set) over the underlying tables with the features and label required.
-- Payment records are joined to their loan and member attributes.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
-- Rows kept: charge_off = 1 with payment_date from 2016-09-12 through 2016-12-12, plus all
-- rows dated 2017-01-12.
drop view if exists [dbo].[vw_loan_chargeoff_1m]
go
create view [dbo].[vw_loan_chargeoff_1m]
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_1m p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_1m p1
) AS t
inner join loan_info_1m l ON t.loanId = l.loanId
inner join member_info_1m m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and ((t.payment_date between '2016-09-12' and '2016-12-12' and t.charge_off = 1) or (t.payment_date = '2017-01-12'))
GO
-- Test view (large data set): same columns as the training view, restricted to payment
-- records dated 2017-02-12.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
drop view if exists [dbo].[vw_loan_chargeoff_test_1m]
go
create view [dbo].[vw_loan_chargeoff_test_1m]
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_1m p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_1m p1
) AS t
inner join loan_info_1m l ON t.loanId = l.loanId
inner join member_info_1m m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and t.payment_date = '2017-02-12'
GO
-- Scoring view (large data set): same columns as the training view, restricted to payment
-- records dated after 2017-02-12.
-- payment_k / past_due_k / remain_balance_k (k = 1..5) are taken from the same loan's payment
-- record dated k calendar months earlier and default to 0 when no such record exists.
-- charge_off is the largest charged_off flag among the same loan's records dated 1 to 3
-- calendar months later; rows without any such later record are excluded.
-- Every "top 1" lookup is ordered by payment_date so that the chosen record is the latest one
-- of that month instead of an arbitrary one when a loan has several records in a month.
drop view if exists [dbo].[vw_loan_chargeoff_score_1m]
go
create view [dbo].[vw_loan_chargeoff_score_1m]
as
select t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount,l.interestRate,l.grade,l.term,l.installment,l.isJointApplication,l.purpose,
    m.memberId,m.residentialState,m.annualIncome,m.yearsEmployment,m.homeOwnership,m.incomeVerified,m.creditScore,m.dtiRatio,m.revolvingBalance,m.revolvingUtilizationRate,m.numDelinquency2Years,m.numDerogatoryRec,m.numInquiries6Mon,m.lengthCreditHistory,m.numOpenCreditLines,m.numTotalCreditLines,m.numChargeoff1year,
    ISNULL(t.payment_1, 0) payment_1,ISNULL(t.payment_2, 0) payment_2,ISNULL(t.payment_3, 0) payment_3,ISNULL(t.payment_4, 0) payment_4,ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,ISNULL(t.past_due_2, 0) past_due_2,ISNULL(t.past_due_3, 0) past_due_3,ISNULL(t.past_due_4, 0) past_due_4,ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,ISNULL(t.remain_balance_2, 0) remain_balance_2,ISNULL(t.remain_balance_3, 0) remain_balance_3,ISNULL(t.remain_balance_4, 0) remain_balance_4,ISNULL(t.remain_balance_5, 0) remain_balance_5, t.charge_off
from
(
    select *,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_1,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_2,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_3,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_4,
        (select top 1 payment from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) payment_5,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_1,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_2,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_3,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_4,
        (select top 1 past_due from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) past_due_5,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 1 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_1,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 2 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_2,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 3 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_3,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 4 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_4,
        (select top 1 remain_balance from payments_info_1m p2 where DATEDIFF(month, p2.payment_date,p1.payment_date) = 5 AND p1.loanId = p2.loanId order by p2.payment_date desc) remain_balance_5,
        -- charged_off is a bit column; adding 0 turns it into an integer so MAX can be applied
        (select MAX(charged_off+0) from payments_info_1m p2 where DATEDIFF(month, p1.payment_date,p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_1m p1
) AS t
inner join loan_info_1m l ON t.loanId = l.loanId
inner join member_info_1m m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
and t.payment_date > '2017-02-12'
GO
-- Large dataset (1m): materialize the feature/label views into physical tables
-- so later reads get faster results than querying the views directly.
-- Each table is dropped if present, rebuilt with SELECT ... INTO, and then
-- given a clustered columnstore index.

/* Training rows */
-- NOTE(review): the source view name has no "_train_" infix, unlike the test
-- and score views below -- confirm [vw_loan_chargeoff_1m] is the intended view.
DROP TABLE IF EXISTS [loan_chargeoff_train_1m];
GO
SELECT *
INTO [loan_chargeoff_train_1m]
FROM [vw_loan_chargeoff_1m];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_train_1m]
    ON [loan_chargeoff_train_1m];
GO

/* Test rows */
DROP TABLE IF EXISTS [loan_chargeoff_test_1m];
GO
SELECT *
INTO [loan_chargeoff_test_1m]
FROM [vw_loan_chargeoff_test_1m];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_test_1m]
    ON [loan_chargeoff_test_1m];
GO

/* Scoring rows */
DROP TABLE IF EXISTS [loan_chargeoff_score_1m];
GO
SELECT *
INTO [loan_chargeoff_score_1m]
FROM [vw_loan_chargeoff_score_1m];
GO
CREATE CLUSTERED COLUMNSTORE INDEX [cci_loan_chargeoff_score_1m]
    ON [loan_chargeoff_score_1m];
GO

Просмотреть файл

@ -0,0 +1,15 @@
##############################################################################
# Helper script to retrieve the password for 'rdemo' user if needed. During
# deployment of the solution template a new user is created with a random
# password which is stored in encrypted form in a text file.
#
# Must be run as the same user as the Data Science VM user supplied during
# deployment.
#
# Output: the decrypted password is written to the pipeline as a plain string.
##############################################################################
$passwordFile = "ExportedSqlPassword.txt"
$secureTxtFromFile = Get-Content $passwordFile
$securePasswordObj = $secureTxtFromFile | ConvertTo-SecureString

# SecureStringToBSTR copies the plaintext into unmanaged memory that the
# garbage collector does not manage; it must be zeroed and freed explicitly.
$PasswordBSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($securePasswordObj)
try
{
    # get back the original unencrypted password
    [System.Runtime.InteropServices.Marshal]::PtrToStringBSTR($PasswordBSTR)
}
finally
{
    [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($PasswordBSTR)
}

Просмотреть файл

Просмотреть файл

@ -70,9 +70,11 @@ function ExecuteSQL
{
param(
[String]
$sqlscript
$sqlscript,
[String]
$VariableArray=""
)
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $username -Password $password -InputFile $sqlscript -QueryTimeout 200000
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $username -Password "$password" -InputFile $sqlscript -Variable $VariableArray -QueryTimeout 200000
}
##########################################################################
# Function wrapper to invoke SQL query
@ -83,7 +85,7 @@ param(
[String]
$sqlquery
)
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $username -Password $password -Query $sqlquery -QueryTimeout 200000
Invoke-Sqlcmd -ServerInstance $ServerName -Database $DBName -Username $username -Password "$password" -Query $sqlquery -QueryTimeout 200000
}
##########################################################################
@ -113,7 +115,7 @@ $connectionString2 = GetConnectionString2
# Check if the SQL server or database exists
##########################################################################
$query = "IF NOT EXISTS(SELECT * FROM sys.databases WHERE NAME = '$DBName') CREATE DATABASE $DBName"
Invoke-Sqlcmd -ServerInstance $ServerName -Username $username -Password $password -Query $query -ErrorAction SilentlyContinue
Invoke-Sqlcmd -ServerInstance $ServerName -Username $username -Password "$password" -Query $query -ErrorAction SilentlyContinue
if ($? -eq $false)
{
Write-Host -ForegroundColor Red "Failed the test to connect to SQL server: $ServerName database: $DBName !"
@ -124,7 +126,7 @@ if ($? -eq $false)
}
$query = "USE $DBName;"
Invoke-Sqlcmd -ServerInstance $ServerName -Username $username -Password $password -Query $query
Invoke-Sqlcmd -ServerInstance $ServerName -Username $username -Password "$password" -Query $query
##########################################################################
@ -139,8 +141,8 @@ if ($uninterrupted -eq 'y' -or $uninterrupted -eq 'Y')
{
# create training and test tables
Write-Host -ForeGroundColor 'green' ("Create SQL tables: member_info, loan_info, payments_info")
$script = $filePath + "step1_create_tables" + $table_suffix + ".sql"
ExecuteSQL $script
$script = $filePath + "step1_create_tables.sql"
ExecuteSQL $script "datasize = $dataSize"
Write-Host -ForeGroundColor 'green' ("Populate SQL tables: member_info, loan_info, payments_info")
$dataList = "member_info", "loan_info", "payments_info"
@ -153,23 +155,23 @@ if ($uninterrupted -eq 'y' -or $uninterrupted -eq 'Y')
Write-Host -ForeGroundColor 'magenta'(" Populate SQL table: {0}... from {1}" -f $dataFile, $destination)
$tableName = $DBName + ".dbo." + $dataFile + $table_suffix
$tableSchema = $dataFilePath + $dataFile + $table_suffix + ".xml"
bcp $tableName format nul -c -x -f $tableSchema -U $username -S $ServerName -P $password -t ','
bcp $tableName format nul -c -x -f $tableSchema -U $username -S $ServerName -P "$password" -t ','
Write-Host -ForeGroundColor 'magenta'(" Loading {0} to SQL table..." -f $dataFile)
bcp $tableName in $destination -t ',' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $username -P $password -e $error_file
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table..." -f $dataFile)
bcp $tableName in $destination -t ',' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $username -P "$password" -e $error_file
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table {1}..." -f $dataFile, $tableName)
}
# create the views for features and label with training, test and scoring split
Write-Host -ForeGroundColor 'magenta'(" Creating features label view and persisting...")
$script = $filepath + "step2_features_label_view" + $table_suffix + ".sql"
ExecuteSQL $script
$script = $filepath + "step2_features_label_view.sql"
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating features label view and persisting...")
# create the stored procedure for training
$script = $filepath + "step3_train_test_model.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating training and eval stored proc...")
# execute the training
@ -186,7 +188,7 @@ if ($uninterrupted -eq 'y' -or $uninterrupted -eq 'Y')
# create the stored procedure for recommendations
$script = $filepath + "step4_chargeoff_batch_prediction.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating batch scoring stored proc...")
#score on the data
@ -196,13 +198,13 @@ if ($uninterrupted -eq 'y' -or $uninterrupted -eq 'Y')
# create the stored procedure for recommendations
$script = $filepath + "step4a_chargeoff_ondemand_prediction.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Done creating on demand scoring stored proc [predict_chargeoff_ondemand]...")
}
catch
{
Write-Host -ForegroundColor DarkYellow "Exception in populating database tables:"
Write-Host -ForegroundColor Yellow "Exception executing Data Science pipeline..."
Write-Host -ForegroundColor Red $Error[0].Exception
throw
}
@ -228,8 +230,8 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create training and test tables
Write-Host -ForeGroundColor 'green' ("Create SQL tables: member_info, loan_info, payments_info")
$script = $filePath + "step1_create_tables" + $table_suffix + ".sql"
ExecuteSQL $script
$script = $filePath + "step1_create_tables.sql"
ExecuteSQL $script "datasize = $dataSize"
Write-Host -ForeGroundColor 'green' ("Populate SQL tables: member_info, loan_info, payments_info")
$dataList = "member_info", "loan_info", "payments_info"
@ -242,10 +244,10 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
Write-Host -ForeGroundColor 'magenta'(" Populate SQL table: {0} from {1}..." -f $dataFile, $destination)
$tableName = $DBName + ".dbo." + $dataFile + $table_suffix
$tableSchema = $dataFilePath + $dataFile + $table_suffix + ".xml"
bcp $tableName format nul -c -x -f $tableSchema -U $username -S $ServerName -P $password -t ','
bcp $tableName format nul -c -x -f $tableSchema -U $username -S $ServerName -P "$password" -t ','
Write-Host -ForeGroundColor 'magenta'(" Loading {0} to SQL table..." -f $dataFile)
bcp $tableName in $destination -t ',' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $username -P $password -e $error_file
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table..." -f $dataFile)
bcp $tableName in $destination -t ',' -S $ServerName -f $tableSchema -F 2 -C "RAW" -b 100000 -U $username -P "$password" -e $error_file
Write-Host -ForeGroundColor 'magenta'(" Done...Loading {0} to SQL table {1}..." -f $dataFile, $tableName)
}
}
catch
@ -269,8 +271,8 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create features, labels view
Write-Host -ForeGroundColor 'Cyan' (" Creating feature/label views...")
$script = $filepath + "step2_features_label_view" + $table_suffix + ".sql"
ExecuteSQL $script
$script = $filepath + "step2_features_label_view.sql"
ExecuteSQL $script "datasize = $dataSize"
}
##########################################################################
@ -286,7 +288,7 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create the stored procedure for feature engineering
$script = $filepath + "step2a_optional_feature_selection.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
# execute the feature engineering
Write-Host -ForeGroundColor 'Cyan' (" selecting features using MicrosoftML selectFeatures mlTransform with Logistic Regression...")
@ -308,7 +310,7 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create the stored procedure for training
$script = $filepath + "step3_train_test_model.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'magenta'(" Starting training and evaluation of models...")
$modelNames = 'logistic_reg','fast_linear','fast_trees','fast_forest','neural_net'
@ -334,7 +336,7 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create the stored procedure for recommendations
$script = $filepath + "step4_chargeoff_batch_prediction.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
# compute loan chargeoff predictions
Write-Host -ForeGroundColor 'Cyan' ("Scoring based on best performing model score table = $scoreTable, prediction table = $predictionTable...")
@ -352,7 +354,7 @@ if ($ans -eq 'y' -or $ans -eq 'Y')
{
# create the stored procedure for recommendations
$script = $filepath + "step4a_chargeoff_ondemand_prediction.sql"
ExecuteSQL $script
ExecuteSQL $script "datasize=$dataSize"
Write-Host -ForeGroundColor 'Cyan' ("Done creating on demand chargeoff prediction stored proc [predict_chargeoff_ondemand]...")
}
@ -363,5 +365,5 @@ Write-Host -foregroundcolor 'green'("Loan Chargeoff Prediction Workflow Finished
$endTime =Get-Date
$totalTime = ($endTime-$startTime).ToString()
Write-Host "Finished running at:" $endTime
Write-Host "Finished running Loan_ChargeOff.ps1 at:" $endTime
Write-Host "Total time used: " -foregroundcolor 'green' $totalTime.ToString()

Просмотреть файл

@ -10,13 +10,15 @@
################################################################################################
param([string]$serverName,[string]$baseurl,[string]$username,[string]$password)
$startTime= Get-Date
Write-Host "Start time for setup is:" $startTime
$originalLocation = Get-Location
# This is the directory for the data/code download
$solutionTemplateSetupDir = "LoanChargeOffSolution"
$solutionTemplateSetupPath = "D:\" + $solutionTemplateSetupDir
$dataDir = "Data"
$dataDirPath = $solutionTemplateSetupPath + "\" + $dataDir
$checkoutDir = "Code"
$checkoutDir = "Source"
New-Item -Path "D:\" -Name $solutionTemplateSetupDir -ItemType directory -force
New-Item -Path $solutionTemplateSetupPath -Name $dataDir -ItemType directory -force
@ -25,8 +27,6 @@ Start-Transcript -Path $setupLog -Append
cd $dataDirPath
$helpShortCutFilePath = $solutionTemplateSetupPath + "\LoanChargeOffHelp.url"
# List of files to be downloaded
$dataList = "loan_info_10k", "member_info_10k", "payments_info_10k", "loan_info_100k", "member_info_100k", "payments_info_100k", "loan_info_1m", "member_info_1m", "payments_info_1m"
$dataExtn = ".csv"
@ -34,41 +34,50 @@ $hashExtn = ".hash"
foreach ($dataFile in $dataList)
{
$down = $baseurl + '/' + $dataFile + $dataExtn
Write-Host $down
Write-Host -ForeGroundColor 'magenta' "Downloading file $down..."
Start-BitsTransfer -Source $down
}
#checkout setup scripts/code from github
cd $solutionTemplateSetupPath
Remove-Item $checkoutDir -Force -Recurse
if (Test-Path $checkoutDir)
{
Remove-Item $checkoutDir -Force -Recurse
}
git clone -n https://github.com/Microsoft/r-server-loan-chargeoff $checkoutDir
cd $checkoutDir
git config core.sparsecheckout true
echo "/*`r`n!HDI" | out-file -encoding ascii .git/info/sparse-checkout
git checkout master
$sqlsolutionCodePath = $solutionTemplateSetupPath + "\" + $checkoutDir + "\SQL"
$sqlsolutionCodePath = $solutionTemplateSetupPath + "\" + $checkoutDir + "\SQLR"
$helpShortCutFilePath = $sqlsolutionCodePath + "\LoanChargeOffHelp.url"
cd $sqlsolutionCodePath
# make sure the hashes match for data files
Write-Host -ForeGroundColor 'magenta' "Checking integrity of downloaded files..."
foreach ($dataFile in $dataList)
{
$dataFileHash = Get-FileHash ($dataDirPath + "\" + $dataFile + $dataExtn) -Algorithm SHA512
$storedHash = Get-Content ($dataFile + $hashExtn)
if ($dataFileHash.Hash -ne $storedHash)
{
Write-Host -ForeGroundColor 'Red' "Data file has been corrupted. Please try again."
Write-Error "Data file has been corrupted. Please try again."
throw
}
}
Write-Host -ForeGroundColor 'magenta' "File integrity check successful."
# making sure that the data files conform to windows style of line ending.
Write-Host -ForeGroundColor 'Cyan' "Converting data files from unix2dos"
Write-Host -ForeGroundColor 'magenta' "Converting data files from unix2dos..."
foreach ($dataFile in $dataList)
{
$csvfile = $dataDirPath + "\" + $dataFile + $dataExtn
unix2dos $csvfile
}
Write-Host -ForeGroundColor 'magenta' "Done with unix2dos conversion."
# Start the script for DB creation. Due to privilege issues with the SYSTEM user (the user that runs the
# extension script), we use PS remoting to log in as the admin user and run the DB creation scripts.
@ -79,9 +88,13 @@ $command2 ="setupHelp.ps1"
Enable-PSRemoting -Force
Invoke-Command -Credential $credential -ComputerName $serverName -FilePath $command1 -ArgumentList $dataDirPath, $sqlsolutionCodePath
Invoke-Command -Credential $credential -ComputerName $serverName -FilePath $command2 -ArgumentList $helpShortCutFilePath
Invoke-Command -Credential $credential -ComputerName $serverName -FilePath $command2 -ArgumentList $helpShortCutFilePath, $solutionTemplateSetupPath
Disable-PSRemoting -Force
cd $originalLocation.Path
$endTime= Get-Date
$totalTime = $endTime - $startTime
Write-Host "Finished running setup at " $endTime
Write-Host "Total time for setup:" $totalTime
Stop-Transcript

Просмотреть файл

@ -1,5 +1,8 @@
:on error exit
--
-- remove old $(username) user and login from master
-- remove old $(username) user and login from master.
-- $(username) and $(password) are substituted by Invoke-SqlCmd
-- through environment variables.
--
USE [master]
GO

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

Просмотреть файл

101
SQLR/runDB.ps1 Normal file
Просмотреть файл

@ -0,0 +1,101 @@
##############################################################################################
# Script to invoke the LoanChargeOff data science workflow. By default it uses the smaller
# dataset of 10,000 loans; use the datasize parameter to select another dataset.
# It also creates a SQL Server user and stores the password in 'ExportedSqlPassword.txt'.
# Users can retrieve the password from the file and decrypt it using the
# ConvertTo-SecureString cmdlet in PowerShell.
#
# Parameters:
# datadir - directory where raw csv data has been downloaded
# scriptdir - directory where scripts are checked out from github
# dbuser - (Optional) username for database LoanChargeOff
# dbpass - (Optional) database password
# createuser - (Optional) whether to create a database user
# datasize - size of the dataset (10k, 100k, 1m)
##############################################################################################
Param(
    [string]$datadir,
    [string]$scriptdir,
    [string]$dbuser,
    [string]$dbpass,
    [bool]$createuser = $true,
    [ValidateSet("10k", "100k", "1m")]
    [string]$datasize = "10k"
)

# Run from the script directory: createuser.sql, Loan_ChargeOff.ps1 and the
# password file are all referenced by relative path.
Set-Location $scriptdir

# Defaults, overridden further down when -dbuser / -dbpass are supplied or a
# password is generated or read back from the password file.
$passwordFile = "ExportedSqlPassword.txt"
$dbusername = "rdemo"
$dbpassword = ""
# Reads the encrypted password text stored in $file, converts it back to a
# SecureString and returns the decrypted password as a plain string.
#   file - path of the encrypted password file (defaults to $passwordFile).
function Retrieve-FilePassword([string]$file=$passwordFile)
{
    $secureTxtFromFile = Get-Content $file
    $securePasswordObj = $secureTxtFromFile | ConvertTo-SecureString

    # SecureStringToBSTR copies the plaintext into unmanaged memory that the
    # garbage collector does not manage; zero and free it once converted.
    $PasswordBSTR = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($securePasswordObj)
    try
    {
        # get back the original unencrypted password
        [System.Runtime.InteropServices.Marshal]::PtrToStringBSTR($PasswordBSTR)
    }
    finally
    {
        [System.Runtime.InteropServices.Marshal]::ZeroFreeBSTR($PasswordBSTR)
    }
}
# An explicitly supplied user name overrides the default one.
if ($dbuser)
{
    $dbusername = $dbuser
}

if (!$createuser)
{
    # No user creation requested: use -dbpass when given, otherwise fall back
    # to the encrypted password file.
    if (!$dbpass)
    {
        if (Test-Path $passwordFile)
        {
            $dbpassword = Retrieve-FilePassword $passwordFile
        }
        else
        {
            Write-Host -ForegroundColor Yellow "Either ExportedSqlPassword.txt must exist with encrypted database password or must provide password using dbpass parameter."
            throw
        }
    }
    else
    {
        $dbpassword = $dbpass
    }
}
else
{
    Write-Host -ForegroundColor 'Cyan' "Creating database user"
    # Add-Type loads System.Web without writing the assembly object to the
    # output stream (LoadWithPartialName is obsolete and emits it).
    Add-Type -AssemblyName System.Web
    $dbpassword = [System.Web.Security.Membership]::GeneratePassword(15,0)

    # Variables to pass to createuser.sql script
    # Cannot use -v option as sqlcmd does not like special characters which maybe part of the randomly generated password.
    $sqlcmdvars = @{"username" = "$dbusername"; "password" = "$dbpassword"}
    $old_env = @{}
    foreach ($var in $sqlcmdvars.GetEnumerator()) {
        # Save the current process-level value under the variable NAME so it
        # can be restored afterwards. "username" collides with the built-in
        # USERNAME variable on Windows (names are case-insensitive), so
        # saving the wrong value would wipe it for the rest of the session.
        $old_env.Add($var.Name, [Environment]::GetEnvironmentVariable($var.Name))
        [Environment]::SetEnvironmentVariable($var.Name, $var.Value)
    }
    try {
        # -ErrorAction Stop turns SQL errors into terminating errors so the
        # catch block below can fall back to the stored password.
        Invoke-Sqlcmd -ServerInstance $env:COMPUTERNAME -InputFile .\createuser.sql -ErrorAction Stop
        # save password securely for later retrieval
        $securePassword = $dbpassword | ConvertTo-SecureString -AsPlainText -Force
        $secureTxt = $securePassword | ConvertFrom-SecureString
        Set-Content $passwordFile $secureTxt
    } catch {
        Write-Host -ForegroundColor 'Yellow' "Error creating database user, see error message output"
        Write-Host -ForegroundColor 'Red' $Error[0].Exception
        #Try to read password from stored file
        if (Test-Path $passwordFile)
        {
            Write-Host -ForegroundColor 'Yellow' "Retrieving password from stored file."
            $dbpassword = Retrieve-FilePassword $passwordFile
        }
        else
        {
            Write-Host -ForegroundColor DarkYellow "Either ExportedSqlPassword.txt must exist with encrypted database password or must provide password using dbpass parameter."
            throw
        }
    } finally {
        # Restore Environment (a saved $null removes the variable again)
        foreach ($var in $old_env.GetEnumerator()) {
            [Environment]::SetEnvironmentVariable($var.Name, $var.Value)
        }
    }
    Write-Host -ForegroundColor 'Cyan' "Done creating database user"
}

# Run the full workflow non-interactively (-uninterrupted y) against the local
# SQL Server instance with the resolved credentials.
.\Loan_ChargeOff.ps1 -ServerName $env:COMPUTERNAME -DBName LoanChargeOff -username $dbusername -password "$dbpassword" -uninterrupted y -dataPath $datadir -dataSize $datasize

37
SQLR/setupHelp.ps1 Normal file
Просмотреть файл

@ -0,0 +1,37 @@
#######################################################################
# Script to create help short cut and solution folder shortcut.
#
# Copies the help url file into the user's Startup folder and onto the
# desktop, then creates a desktop shortcut to the solution folder.
#
# Parameters:
# helpfile - path to the help url file.
# solutionPath - path to the solution folder with data and
# source.
#######################################################################
param( [string]$helpfile, [string]$solutionPath)

$desktop = [Environment]::GetFolderPath("Desktop")
$desktop = $desktop + '\'

#create the help link in startup program
$startmenu = [Environment]::GetFolderPath("StartMenu")
$startupfolder = $startmenu + '\Programs\Startup\'
# We create this since the user startup folder is only created after first login.
# An alternative is to add the link to the all-users startup folder.
# -Force makes this a no-op (instead of an error) when the folder already
# exists, e.g. after the user has logged in or when setup is re-run.
New-Item -Path $startupfolder -ItemType Directory -Force

#copy
$down = $helpfile
# Diagnostic output of the paths involved and their current contents.
Write-Host $down
Write-Host $startmenu
Get-ChildItem $startmenu
Write-Host $startupfolder
Get-ChildItem $startupfolder
Copy-Item -Verbose $down $startupfolder
Copy-Item -Verbose $down $desktop

#create shortcut to solution folder on desktop
$WsShell = New-Object -ComObject WScript.Shell
$shortcut = $WsShell.CreateShortcut($desktop + "LoanChargeOff.lnk")
$shortcut.TargetPath = $solutionPath
$shortcut.Save()

Просмотреть файл

@ -1,3 +1,12 @@
/*
* SQL script to create the tables required for training, testing and scoring
* of models.
* It creates the member_info, loan_info and payments_info tables.
* It also creates tables to store models with evaluation stats, selected features
* and predictions.
* $(datasize) is substituted through Invoke-SqlCmd's Variable option
* (in powershell).
*/
SET ansi_nulls on
GO
SET quoted_identifier on
@ -5,9 +14,9 @@ GO
/* Create the member_info Table. */
DROP TABLE IF EXISTS member_info_10k
DROP TABLE IF EXISTS member_info_$(datasize)
CREATE TABLE [member_info_10k](
CREATE TABLE [member_info_$(datasize)](
[memberId] [int],
[residentialState] [nvarchar](4),
[annualIncome] [real],
@ -27,13 +36,13 @@ CREATE TABLE [member_info_10k](
[numChargeoff1year] [int]
);
CREATE CLUSTERED COLUMNSTORE INDEX member_info_10k_cci ON member_info_10k WITH (DROP_EXISTING = OFF);
CREATE CLUSTERED COLUMNSTORE INDEX member_info_$(datasize)_cci ON member_info_$(datasize) WITH (DROP_EXISTING = OFF);
GO
/* Create the loan_info Table. */
DROP TABLE IF EXISTS loan_info_10k
DROP TABLE IF EXISTS loan_info_$(datasize)
CREATE TABLE [loan_info_10k](
CREATE TABLE [loan_info_$(datasize)](
[loanId] [int],
[loan_open_date] [datetime],
[memberId] [int],
@ -46,13 +55,13 @@ CREATE TABLE [loan_info_10k](
[purpose] [nvarchar](255)
);
CREATE CLUSTERED COLUMNSTORE INDEX loan_info_10k_cci ON loan_info_10k WITH (DROP_EXISTING = OFF);
CREATE CLUSTERED COLUMNSTORE INDEX loan_info_$(datasize)_cci ON loan_info_$(datasize) WITH (DROP_EXISTING = OFF);
GO
/* Create the payments_info Table*/
DROP TABLE IF EXISTS payments_info_10k
DROP TABLE IF EXISTS payments_info_$(datasize)
CREATE TABLE [payments_info_10k](
CREATE TABLE [payments_info_$(datasize)](
[loanId] [int],
[payment_date] [datetime],
[payment] [real],
@ -62,12 +71,12 @@ CREATE TABLE [payments_info_10k](
[charged_off] [bit]
);
CREATE CLUSTERED COLUMNSTORE INDEX payments_info_10k_cci ON payments_info_10k WITH (DROP_EXISTING = OFF);
CREATE CLUSTERED COLUMNSTORE INDEX payments_info_$(datasize)_cci ON payments_info_$(datasize) WITH (DROP_EXISTING = OFF);
GO
DROP TABLE IF EXISTS [loan_chargeoff_models_10k];
DROP TABLE IF EXISTS [loan_chargeoff_models_$(datasize)];
CREATE TABLE [loan_chargeoff_models_10k]
CREATE TABLE [loan_chargeoff_models_$(datasize)]
(
[model_name] varchar(30) not null default('default model') primary key,
[model] varbinary(max) not null,
@ -80,17 +89,17 @@ CREATE TABLE [loan_chargeoff_models_10k]
);
GO
DROP TABLE IF EXISTS [selected_features_10k];
DROP TABLE IF EXISTS [selected_features_$(datasize)];
CREATE TABLE [selected_features_10k](
CREATE TABLE [selected_features_$(datasize)](
[feature_id] [int] IDENTITY(1,1) NOT NULL,
[feature_name] [nvarchar](500) NOT NULL
);
GO
DROP TABLE IF EXISTS [loan_chargeoff_prediction_10k]
DROP TABLE IF EXISTS [loan_chargeoff_prediction_$(datasize)]
CREATE TABLE [loan_chargeoff_prediction_10k](
CREATE TABLE [loan_chargeoff_prediction_$(datasize)](
[memberId] [int],
[loanId] [int],
[payment_date] [date],

Просмотреть файл

@ -0,0 +1,144 @@
/*
* SQL script to create views with feature and label columns for training, testing and prediction.
* We also persist these views to physical tables for faster training/scoring times.
* If there is not much data these views can be used directly.
* $(datasize) is substituted through Invoke-SqlCmd's Variable option
* (in powershell).
*/
-- Training view over the underlying tables with the required features and label.
-- One row per payment record, joined to its loan and member attributes, with:
--   * payment_N / past_due_N / remain_balance_N: the value from a payment row of
--     the same loan N months (DATEDIFF month) before the current row, 0 if none;
--   * charge_off: MAX(charged_off) over the same loan's payment rows 1 to 3
--     months after the current row; rows with no such later payment are dropped.
-- ISNULL is kept on purpose (rather than COALESCE) so result types and
-- nullability stay exactly as before.
-- NOTE(review): the TOP 1 lookups have no ORDER BY; if a loan can have more than
-- one payment row in the same month the row picked is arbitrary -- confirm the
-- data has a single payment per loan per month.
DROP VIEW IF EXISTS vw_loan_chargeoff_train_$(datasize)
GO
CREATE VIEW vw_loan_chargeoff_train_$(datasize)
AS
SELECT
    t.loanId,
    t.payment_date,
    t.payment,
    t.past_due,
    t.remain_balance,
    l.loan_open_date,
    l.loanAmount,
    l.interestRate,
    l.grade,
    l.term,
    l.installment,
    l.isJointApplication,
    l.purpose,
    m.memberId,
    m.residentialState,
    m.annualIncome,
    m.yearsEmployment,
    m.homeOwnership,
    m.incomeVerified,
    m.creditScore,
    m.dtiRatio,
    m.revolvingBalance,
    m.revolvingUtilizationRate,
    m.numDelinquency2Years,
    m.numDerogatoryRec,
    m.numInquiries6Mon,
    m.lengthCreditHistory,
    m.numOpenCreditLines,
    m.numTotalCreditLines,
    m.numChargeoff1year,
    ISNULL(t.payment_1, 0) AS payment_1,
    ISNULL(t.payment_2, 0) AS payment_2,
    ISNULL(t.payment_3, 0) AS payment_3,
    ISNULL(t.payment_4, 0) AS payment_4,
    ISNULL(t.payment_5, 0) AS payment_5,
    ISNULL(t.past_due_1, 0) AS past_due_1,
    ISNULL(t.past_due_2, 0) AS past_due_2,
    ISNULL(t.past_due_3, 0) AS past_due_3,
    ISNULL(t.past_due_4, 0) AS past_due_4,
    ISNULL(t.past_due_5, 0) AS past_due_5,
    ISNULL(t.remain_balance_1, 0) AS remain_balance_1,
    ISNULL(t.remain_balance_2, 0) AS remain_balance_2,
    ISNULL(t.remain_balance_3, 0) AS remain_balance_3,
    ISNULL(t.remain_balance_4, 0) AS remain_balance_4,
    ISNULL(t.remain_balance_5, 0) AS remain_balance_5,
    t.charge_off
FROM
(
    SELECT
        cur.loanId,
        cur.payment_date,
        cur.payment,
        cur.past_due,
        cur.remain_balance,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS payment_1,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS payment_2,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS payment_3,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS payment_4,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS payment_5,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS past_due_1,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS past_due_2,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS past_due_3,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS past_due_4,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS past_due_5,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS remain_balance_1,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS remain_balance_2,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS remain_balance_3,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS remain_balance_4,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS remain_balance_5,
        -- "+ 0" turns the bit column into an int so MAX can aggregate it.
        (SELECT MAX(nxt.charged_off + 0) FROM payments_info_$(datasize) AS nxt
          WHERE cur.loanId = nxt.loanId
            AND DATEDIFF(month, cur.payment_date, nxt.payment_date) IN (1, 2, 3)) AS charge_off
    FROM payments_info_$(datasize) AS cur
) AS t
INNER JOIN loan_info_$(datasize) AS l
    ON t.loanId = l.loanId
INNER JOIN member_info_$(datasize) AS m
    ON l.memberId = m.memberId
WHERE t.charge_off IS NOT NULL
  -- Training split: charged-off rows dated 2016-09-12 through 2016-12-12,
  -- plus every labelled row dated 2017-01-12.
  AND (
        (t.payment_date BETWEEN '2016-09-12' AND '2016-12-12' AND t.charge_off = 1)
        OR (t.payment_date = '2017-01-12')
      );
GO
-- Test view: same feature and label columns as the training view, restricted to
-- labelled payment rows dated 2017-02-12.
--   * payment_N / past_due_N / remain_balance_N: the value from a payment row of
--     the same loan N months (DATEDIFF month) before the current row, 0 if none;
--   * charge_off: MAX(charged_off) over the same loan's payment rows 1 to 3
--     months after the current row; rows with no such later payment are dropped.
-- ISNULL is kept on purpose (rather than COALESCE) so result types and
-- nullability stay exactly as before.
-- NOTE(review): the TOP 1 lookups have no ORDER BY; if a loan can have more than
-- one payment row in the same month the row picked is arbitrary -- confirm the
-- data has a single payment per loan per month.
DROP VIEW IF EXISTS vw_loan_chargeoff_test_$(datasize)
GO
CREATE VIEW vw_loan_chargeoff_test_$(datasize)
AS
SELECT
    t.loanId,
    t.payment_date,
    t.payment,
    t.past_due,
    t.remain_balance,
    l.loan_open_date,
    l.loanAmount,
    l.interestRate,
    l.grade,
    l.term,
    l.installment,
    l.isJointApplication,
    l.purpose,
    m.memberId,
    m.residentialState,
    m.annualIncome,
    m.yearsEmployment,
    m.homeOwnership,
    m.incomeVerified,
    m.creditScore,
    m.dtiRatio,
    m.revolvingBalance,
    m.revolvingUtilizationRate,
    m.numDelinquency2Years,
    m.numDerogatoryRec,
    m.numInquiries6Mon,
    m.lengthCreditHistory,
    m.numOpenCreditLines,
    m.numTotalCreditLines,
    m.numChargeoff1year,
    ISNULL(t.payment_1, 0) AS payment_1,
    ISNULL(t.payment_2, 0) AS payment_2,
    ISNULL(t.payment_3, 0) AS payment_3,
    ISNULL(t.payment_4, 0) AS payment_4,
    ISNULL(t.payment_5, 0) AS payment_5,
    ISNULL(t.past_due_1, 0) AS past_due_1,
    ISNULL(t.past_due_2, 0) AS past_due_2,
    ISNULL(t.past_due_3, 0) AS past_due_3,
    ISNULL(t.past_due_4, 0) AS past_due_4,
    ISNULL(t.past_due_5, 0) AS past_due_5,
    ISNULL(t.remain_balance_1, 0) AS remain_balance_1,
    ISNULL(t.remain_balance_2, 0) AS remain_balance_2,
    ISNULL(t.remain_balance_3, 0) AS remain_balance_3,
    ISNULL(t.remain_balance_4, 0) AS remain_balance_4,
    ISNULL(t.remain_balance_5, 0) AS remain_balance_5,
    t.charge_off
FROM
(
    SELECT
        cur.loanId,
        cur.payment_date,
        cur.payment,
        cur.past_due,
        cur.remain_balance,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS payment_1,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS payment_2,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS payment_3,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS payment_4,
        (SELECT TOP 1 prev.payment FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS payment_5,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS past_due_1,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS past_due_2,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS past_due_3,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS past_due_4,
        (SELECT TOP 1 prev.past_due FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS past_due_5,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 1) AS remain_balance_1,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 2) AS remain_balance_2,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 3) AS remain_balance_3,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 4) AS remain_balance_4,
        (SELECT TOP 1 prev.remain_balance FROM payments_info_$(datasize) AS prev
          WHERE cur.loanId = prev.loanId
            AND DATEDIFF(month, prev.payment_date, cur.payment_date) = 5) AS remain_balance_5,
        -- "+ 0" turns the bit column into an int so MAX can aggregate it.
        (SELECT MAX(nxt.charged_off + 0) FROM payments_info_$(datasize) AS nxt
          WHERE cur.loanId = nxt.loanId
            AND DATEDIFF(month, cur.payment_date, nxt.payment_date) IN (1, 2, 3)) AS charge_off
    FROM payments_info_$(datasize) AS cur
) AS t
INNER JOIN loan_info_$(datasize) AS l
    ON t.loanId = l.loanId
INNER JOIN member_info_$(datasize) AS m
    ON l.memberId = m.memberId
WHERE t.charge_off IS NOT NULL
  AND t.payment_date = '2017-02-12';
GO
-- Scoring view.
-- Produces one row per payment record dated after the 2017-02-12 cut-off, enriched with
--   * the loan attributes (joined on loanId),
--   * the member attributes (joined on memberId),
--   * payment / past_due / remain_balance values from the 1..5 calendar months before the row,
--   * charge_off: the MAX of charged_off over the 1..3 calendar months after the row.
-- Rows whose charge_off is NULL (no payment record in the following three months) are excluded.
-- The output column list and order are what the persisted score table is built from.
drop view if exists vw_loan_chargeoff_score_$(datasize)
go
create view vw_loan_chargeoff_score_$(datasize)
as
select
    t.loanId, t.payment_date, t.payment, t.past_due, t.remain_balance,
    l.loan_open_date, l.loanAmount, l.interestRate, l.grade, l.term, l.installment, l.isJointApplication, l.purpose,
    m.memberId, m.residentialState, m.annualIncome, m.yearsEmployment, m.homeOwnership, m.incomeVerified,
    m.creditScore, m.dtiRatio, m.revolvingBalance, m.revolvingUtilizationRate, m.numDelinquency2Years,
    m.numDerogatoryRec, m.numInquiries6Mon, m.lengthCreditHistory, m.numOpenCreditLines, m.numTotalCreditLines,
    m.numChargeoff1year,
    -- a missing history month is reported as 0 rather than NULL
    ISNULL(t.payment_1, 0) payment_1,
    ISNULL(t.payment_2, 0) payment_2,
    ISNULL(t.payment_3, 0) payment_3,
    ISNULL(t.payment_4, 0) payment_4,
    ISNULL(t.payment_5, 0) payment_5,
    ISNULL(t.past_due_1, 0) past_due_1,
    ISNULL(t.past_due_2, 0) past_due_2,
    ISNULL(t.past_due_3, 0) past_due_3,
    ISNULL(t.past_due_4, 0) past_due_4,
    ISNULL(t.past_due_5, 0) past_due_5,
    ISNULL(t.remain_balance_1, 0) remain_balance_1,
    ISNULL(t.remain_balance_2, 0) remain_balance_2,
    ISNULL(t.remain_balance_3, 0) remain_balance_3,
    ISNULL(t.remain_balance_4, 0) remain_balance_4,
    ISNULL(t.remain_balance_5, 0) remain_balance_5,
    t.charge_off
from
(
    -- Only the columns the outer query reads are listed (instead of select *), so that a
    -- later schema change to the payments table cannot leave the view metadata stale.
    select
        p1.loanId, p1.payment_date, p1.payment, p1.past_due, p1.remain_balance,
        -- NOTE(review): the top 1 lookups below have no ORDER BY; if a loan can have more than one
        -- payment record in the same calendar month the chosen row is arbitrary. Confirm the data
        -- guarantees one record per loan per month.
        (select top 1 p2.payment from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 1 AND p1.loanId = p2.loanId) payment_1,
        (select top 1 p2.payment from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 2 AND p1.loanId = p2.loanId) payment_2,
        (select top 1 p2.payment from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 3 AND p1.loanId = p2.loanId) payment_3,
        (select top 1 p2.payment from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 4 AND p1.loanId = p2.loanId) payment_4,
        (select top 1 p2.payment from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 5 AND p1.loanId = p2.loanId) payment_5,
        (select top 1 p2.past_due from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 1 AND p1.loanId = p2.loanId) past_due_1,
        (select top 1 p2.past_due from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 2 AND p1.loanId = p2.loanId) past_due_2,
        (select top 1 p2.past_due from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 3 AND p1.loanId = p2.loanId) past_due_3,
        (select top 1 p2.past_due from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 4 AND p1.loanId = p2.loanId) past_due_4,
        (select top 1 p2.past_due from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 5 AND p1.loanId = p2.loanId) past_due_5,
        (select top 1 p2.remain_balance from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 1 AND p1.loanId = p2.loanId) remain_balance_1,
        (select top 1 p2.remain_balance from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 2 AND p1.loanId = p2.loanId) remain_balance_2,
        (select top 1 p2.remain_balance from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 3 AND p1.loanId = p2.loanId) remain_balance_3,
        (select top 1 p2.remain_balance from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 4 AND p1.loanId = p2.loanId) remain_balance_4,
        (select top 1 p2.remain_balance from payments_info_$(datasize) p2 where DATEDIFF(month, p2.payment_date, p1.payment_date) = 5 AND p1.loanId = p2.loanId) remain_balance_5,
        -- Note the swapped DATEDIFF arguments: this one looks forward (1..3 months after the row).
        -- presumably charged_off is a bit column and "+ 0" promotes it so MAX can be applied - confirm
        (select MAX(p2.charged_off + 0) from payments_info_$(datasize) p2 where DATEDIFF(month, p1.payment_date, p2.payment_date) IN (1,2,3) AND p1.loanId = p2.loanId) charge_off
    from payments_info_$(datasize) p1
) AS t
inner join loan_info_$(datasize) l ON t.loanId = l.loanId
inner join member_info_$(datasize) m ON l.memberId = m.memberId
where t.charge_off IS NOT NULL
  -- qualified with t. so the predicate stays unambiguous if a joined table gains a payment_date column
  and t.payment_date > '2017-02-12';
go
-- Materialize the train / test / score views into physical tables so that, on large
-- datasets, later steps read stored rows instead of re-evaluating the views.
-- Each table is rebuilt from scratch and then given a clustered columnstore index.

-- training set
drop table if exists [loan_chargeoff_train_$(datasize)]
go
select * into [loan_chargeoff_train_$(datasize)] from [vw_loan_chargeoff_train_$(datasize)]
go
create clustered columnstore index [cci_loan_chargeoff_train_$(datasize)]
    on [loan_chargeoff_train_$(datasize)]
go

-- test set
drop table if exists [loan_chargeoff_test_$(datasize)]
go
select * into [loan_chargeoff_test_$(datasize)] from [vw_loan_chargeoff_test_$(datasize)]
go
create clustered columnstore index [cci_loan_chargeoff_test_$(datasize)]
    on [loan_chargeoff_test_$(datasize)]
go

-- scoring set
drop table if exists [loan_chargeoff_score_$(datasize)]
go
select * into [loan_chargeoff_score_$(datasize)] from [vw_loan_chargeoff_score_$(datasize)]
go
create clustered columnstore index [cci_loan_chargeoff_score_$(datasize)]
    on [loan_chargeoff_score_$(datasize)]
go

Просмотреть файл

@ -1,3 +1,9 @@
/*
* SQLR script to demonstrate feature selection available in MicrosoftML package.
* We use this same mechanism during training so this step is optional to run, but
* serves as an example of an approach for feature selection, i.e., preselect features
* and store in database table for later use in training of models.
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
@ -6,6 +12,14 @@ GO
DROP PROCEDURE IF EXISTS [dbo].[select_features];
GO
/*
* Stored procedure for feature selection.
* Parameters:
* @training_set_table - table with training data
* @test_set_table - table with test data
* @selected_features_table - table to store selected features in
* @connectionString - connection string to connect to the database for use in the R script
*/
CREATE PROCEDURE [select_features] @training_set_table varchar(100), @test_set_table varchar(100), @selected_features_table varchar(100), @connectionString varchar(300)
AS
BEGIN

Просмотреть файл

@ -1,4 +1,6 @@
/****** Stored Procedure to train models ******/
/*
* SQLR script to create stored procedure for training.
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
@ -7,7 +9,20 @@ GO
DROP PROCEDURE IF EXISTS [dbo].[train_model];
GO
CREATE PROCEDURE [train_model] @training_set_table varchar(100), @test_set_table varchar(100), @scored_table varchar(100), @model_table varchar(100), @model_name_param varchar(50), @connectionString varchar(300)
/*
* Stored Procedure for training of models using MicrosoftML algorithms. This also evaluates the models and stores
* the serialized model binary along with the following stats: accuracy, auc, precision, recall, f1score.
* The parameters can be tuned for various algorithms based on performance on your data.
* Parameters:
* @training_set_table - training data table name
* @test_set_table - test data table name for model evaluation
* @scored_table - table to store scores in when doing model evaluation
* @model_table - table to store model in serialized binary format along with evaluation stats
* @model_alg - the algorithm to use for training the model.
* Can be one of 'logistic_reg', 'fast_trees', 'fast_forest', 'fast_linear', 'neural_net'
* @connectionString - connection string to connect to the database for use in the R script
*/
CREATE PROCEDURE [train_model] @training_set_table varchar(100), @test_set_table varchar(100), @scored_table varchar(100), @model_table varchar(100), @model_alg varchar(50), @connectionString varchar(300)
AS
BEGIN
@ -89,7 +104,7 @@ stat_f1score <- model_stats[[5]]
'
, @params = N'@model_name varchar(20), @connection_string varchar(300), @train_set varchar(100), @test_set varchar(100), @score_set varchar(100),
@modelbin varbinary(max) OUTPUT, @stat_auc real OUTPUT, @stat_accuracy real OUTPUT, @stat_precision real OUTPUT, @stat_recall real OUTPUT, @stat_f1score real OUTPUT'
, @model_name = @model_name_param
, @model_name = @model_alg
, @connection_string = @connectionString
, @train_set = @training_set_table
, @test_set = @test_set_table
@ -101,9 +116,9 @@ stat_f1score <- model_stats[[5]]
, @stat_recall = @recall OUTPUT
, @stat_f1score = @f1score OUTPUT;
SET @del_cmd = N'DELETE FROM ' + @model_table + N' WHERE model_name = ''' + @model_name_param + ''''
SET @del_cmd = N'DELETE FROM ' + @model_table + N' WHERE model_name = ''' + @model_alg + ''''
EXEC sp_executesql @del_cmd;
SET @ins_cmd = N'INSERT INTO ' + @model_table + N' (model_name, model, auc, accuracy, precision, recall, f1score) VALUES (''' + @model_name_param + ''', @p_payload, @p_auc, @p_accuracy, @p_precision, @p_recall, @p_f1score)'
SET @ins_cmd = N'INSERT INTO ' + @model_table + N' (model_name, model, auc, accuracy, precision, recall, f1score) VALUES (''' + @model_alg + ''', @p_payload, @p_auc, @p_accuracy, @p_precision, @p_recall, @p_f1score)'
SET @param_def = N'@p_payload varbinary(max),
@p_auc real,
@p_accuracy real,

Просмотреть файл

@ -1,4 +1,6 @@
/*
* SQLR script to do batch scoring.
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
@ -7,6 +9,14 @@ GO
DROP PROCEDURE IF EXISTS [dbo].[predict_chargeoff]
GO
/*
* Stored Procedure to do batch scoring using the 'best model' based on f1score.
* Parameters:
* @score_table - Table with data to score/make prediction on
* @score_prediction_table - Table to store predictions
* @models_table - Table which has serialized binary models stored along with evaluation stats (during training step)
* @connectionString - connection string to connect to the database for use in the R script
*/
CREATE PROCEDURE [predict_chargeoff] @score_table varchar(100), @score_prediction_table varchar(100), @models_table varchar(100), @connectionString varchar(300)
AS

Просмотреть файл

@ -1,9 +1,16 @@
/*
* SQLR script to do on demand scoring/prediction of one record.
*/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
/*
* Stored Procedure for on demand scoring/prediction using the 'best model' based on f1score.
* Parameters:
* @models_table - Table which has serialized binary models stored along with evaluation stats (during training step)
* Rest of the parameters are the features used during training.
*/
DROP PROCEDURE IF EXISTS [dbo].[predict_chargeoff_ondemand]
GO