Update CreditRiskScale and CreditRiskDeploy
This commit is contained in:
Родитель
fb51d0db38
Коммит
6a7c90b8fb
|
@ -15,11 +15,17 @@ knitr::opts_chunk$set(echo = TRUE,
|
|||
|
||||
## 1 Introduction
|
||||
|
||||
This document will walk through you how to deploy a credit risk model as a web service,
|
||||
using the `mrsdeploy` package that ships with Microsoft R Client and R Server.
|
||||
The `mrsdeploy` package, delivered with Microsoft R Client and R Server, provides functions for:
|
||||
|
||||
It will start by creating the model locally, then publish it as a web service, and then share it
|
||||
with other authenticated users for consumption.
|
||||
**1** Establishing a remote session in an R console application for the purposes of executing code on that server
|
||||
|
||||
**2** Publishing and managing an R web service that is backed by the R code block or script you provided.
|
||||
|
||||
Each feature can be used independently, but the greatest value is achieved when you can leverage both.
|
||||
|
||||
This document will walk you through how to deploy a credit risk model as a web service, using the `mrsdeploy` package.
|
||||
|
||||
It starts by building the model locally, then publishes it as a web service, shares it with other authenticated users for consumption, and finally manages and updates the web service.
|
||||
|
||||
## 2 Automated Credit Risk Model Deployment
|
||||
|
||||
|
@ -116,9 +122,11 @@ model_rxtrees <- rxFastTrees(formula=form,
|
|||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10)
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0)
|
||||
|
||||
summary(model_rxtrees)
|
||||
model_rxtrees
|
||||
```
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=FALSE}
|
||||
|
@ -126,21 +134,8 @@ summary(model_rxtrees)
|
|||
|
||||
creditRiskPrediction <- function(account_id, amount_6, pur_6, avg_pur_amt_6, avg_interval_pur_6,
|
||||
credit_limit, marital_status, sex, education, income, age)
|
||||
{
|
||||
staticdata <- data.frame(account_id=c("a_1055521381828530", "a_1055521125532160", "a_914800337488587", "a_844428146542225", "a_844428047550192"),
|
||||
amount_6=c(106.99, 212.49, 118.00, 151.19, 148.39),
|
||||
pur_6=c(1, 1, 1, 1, 1),
|
||||
avg_pur_amt_6=c(106.99, 212.49, 118.00, 151.19, 148.39),
|
||||
avg_interval_pur_6=c(0, 0, 0, 0, 0),
|
||||
credit_limit=c(2.21, -0.96, 3.82, -0.39, -0.45),
|
||||
marital_status=c("single", "married", "single", "married", "single"),
|
||||
sex=c("female", "male", "male", "female", "female"),
|
||||
education=c("master", "high_school", "middle_school",
|
||||
"polytechnics", "undergraduate"),
|
||||
income=c(0, 6.27, 17.15, 9.94, 19.93),
|
||||
age=c(33, 47, 30, 51, 32))
|
||||
|
||||
inputdata <- data.frame(account_id=account_id,
|
||||
{
|
||||
newdata <- data.frame(account_id=account_id,
|
||||
amount_6=amount_6,
|
||||
pur_6=pur_6,
|
||||
avg_pur_amt_6=avg_pur_amt_6,
|
||||
|
@ -152,13 +147,10 @@ creditRiskPrediction <- function(account_id, amount_6, pur_6, avg_pur_amt_6, avg
|
|||
income=income,
|
||||
age=age)
|
||||
|
||||
newdata <- rbind(inputdata, staticdata)
|
||||
|
||||
pred <- rxPredict(model_rxtrees, data=newdata)[, c(1, 3)]
|
||||
pred <- rxPredict(modelObject=model_rxtrees, data=newdata)[, c(1, 3)]
|
||||
pred <- cbind(newdata$account_id, pred)
|
||||
names(pred) <- c("account_id", "scored_label", "scored_prob")
|
||||
pred <- pred[1, ]
|
||||
pred
|
||||
pred
|
||||
}
|
||||
|
||||
# Test function locally by printing results
|
||||
|
@ -229,7 +221,7 @@ api <- publishService(
|
|||
|
||||
### 2.3 Test the service by consuming it in R
|
||||
|
||||
Finally, we can consume the service in R directly after publishing it to verify that the results are as expected.
|
||||
After publishing it, we can consume the service in R directly to verify that the results are as expected.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=TRUE}
|
||||
# Get service and assign service to the variable `api`.
|
||||
|
@ -253,7 +245,48 @@ result <- api$creditRiskPrediction(account_id="a_1055521029582310",
|
|||
# Print response output named `pred`
|
||||
|
||||
print(result$output("pred"))
|
||||
```
|
||||
|
||||
### 2.4 Update the web service
|
||||
|
||||
In production, we can manage and update the web service in a timely manner.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=TRUE}
|
||||
# Load the pre-trained optimal model obtained from the template of CreditRiskScale.
|
||||
|
||||
load(file="model_rxtrees.RData")
|
||||
|
||||
model_rxtrees
|
||||
|
||||
api <- updateService(name="crpService",
|
||||
v="v1.0.0",
|
||||
model=model_rxtrees,
|
||||
descr="Update the model hyper-parameters")
|
||||
|
||||
# Re-test the updated service by consuming it
|
||||
|
||||
result <- api$creditRiskPrediction(account_id="a_1055521029582310",
|
||||
amount_6=173.22,
|
||||
pur_6=1,
|
||||
avg_pur_amt_6=173.22,
|
||||
avg_interval_pur_6=0,
|
||||
credit_limit=5.26,
|
||||
marital_status="married",
|
||||
sex="male",
|
||||
education="undergraduate",
|
||||
income=12.36,
|
||||
age=38)
|
||||
|
||||
# Print response output named `pred`
|
||||
|
||||
print(result$output("pred"))
|
||||
```
|
||||
|
||||
### 2.5 Application Integration
|
||||
|
||||
Last but not least, we can get the Swagger JSON file that is needed for application integration.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=TRUE}
|
||||
# Get this service's `swagger.json` file that is needed for web application integration
|
||||
|
||||
swagger <- api$swagger(json = FALSE)
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
<meta name="author" content="Fang Zhou, Data Scientist, Microsoft" />
|
||||
|
||||
<meta name="date" content="2017-05-17" />
|
||||
<meta name="date" content="2017-05-18" />
|
||||
|
||||
<title>Deploy a Credit Risk Model as a Web Service</title>
|
||||
|
||||
|
@ -119,15 +119,19 @@ $(document).ready(function () {
|
|||
|
||||
<h1 class="title toc-ignore">Deploy a Credit Risk Model as a Web Service</h1>
|
||||
<h4 class="author"><em>Fang Zhou, Data Scientist, Microsoft</em></h4>
|
||||
<h4 class="date"><em>2017-05-17</em></h4>
|
||||
<h4 class="date"><em>2017-05-18</em></h4>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div id="introduction" class="section level2">
|
||||
<h2>1 Introduction</h2>
|
||||
<p>This document will walk through you how to deploy a credit risk model as a web service, using the <code>mrsdeploy</code> package that ships with Microsoft R Client and R Server.</p>
|
||||
<p>It will start by creating the model locally, then publish it as a web service, and then share it with other authenticated users for consumption.</p>
|
||||
<p>The <code>mrsdeploy</code> package, delivered with Microsoft R Client and R Server, provides functions for:</p>
|
||||
<p><strong>1</strong> Establishing a remote session in a R console application for the purposes of executing code on that server</p>
|
||||
<p><strong>2</strong> Publishing and managing an R web service that is backed by the R code block or script you provided.</p>
|
||||
<p>Each feature can be used independently, but the greatest value is achieved when you can leverage both.</p>
|
||||
<p>This document will walk through you how to deploy a credit risk model as a web service, using the <code>mrsdeploy</code> package.</p>
|
||||
<p>It will start by modelling locally, then publish it as a web service, and then share it with other authenticated users for consumption, and finally manage and update the web service.</p>
|
||||
</div>
|
||||
<div id="automated-credit-risk-model-deployment" class="section level2">
|
||||
<h2>2 Automated Credit Risk Model Deployment</h2>
|
||||
|
@ -236,64 +240,26 @@ model_rxtrees <- rxFastTrees(formula=form,
|
|||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10)</code></pre>
|
||||
<pre><code>## Not adding a normalizer.
|
||||
## Making per-feature arrays
|
||||
## Changing data from row-wise to column-wise
|
||||
## Beginning processing data.
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0)</code></pre>
|
||||
<pre><code>## Beginning processing data.
|
||||
## Rows Read: 128977, Read Time: 0, Transform Time: 0
|
||||
## Beginning processing data.
|
||||
## Processed 128977 instances
|
||||
## Binning and forming Feature objects
|
||||
## Reserved memory for tree learner: 219024 bytes
|
||||
## Starting to train ...
|
||||
## Not training a calibrator because it is not needed.
|
||||
## Elapsed time: 00:00:02.1784776</code></pre>
|
||||
<pre class="r"><code>summary(model_rxtrees)</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.2568312</code></pre>
|
||||
## Beginning processing data.</code></pre>
|
||||
<pre class="r"><code>model_rxtrees</code></pre>
|
||||
<pre><code>## Call:
|
||||
## rxFastTrees(formula = form, data = data[train, c(target, vars)],
|
||||
## type = "binary", numTrees = 100, numLeaves = 20, learningRate = 0.2,
|
||||
## minSplit = 10)
|
||||
## minSplit = 10, unbalancedSets = FALSE, verbose = 0)
|
||||
##
|
||||
## Predictor for: bad_flag~amount_6+pur_6+avg_pur_amt_6+avg_interval_pur_6+credit_limit+age+income+sex+education+marital_status
|
||||
## Data: data[train, c(target, vars)]
|
||||
##
|
||||
## Per-feature gain:
|
||||
## credit_limit: 1
|
||||
## age: 0.8760216
|
||||
## education.undergraduate: 0.3149455
|
||||
## education.polytechnics: 0.3069959
|
||||
## education.master: 0.2611516
|
||||
## income: 0.2581172
|
||||
## sex.female: 0.2488511
|
||||
## marital_status.single: 0.2456198
|
||||
## marital_status.married: 0.2349416
|
||||
## education.high_school: 0.2142461
|
||||
## sex.male: 0.1726051
|
||||
## amount_6: 0.1578717
|
||||
## education.middle_school: 0.1311938
|
||||
## avg_pur_amt_6: 0.08475202
|
||||
## avg_interval_pur_6: 0.02046135</code></pre>
|
||||
## FastTreeBinaryClassification (BinaryClassifierTrainer) for: bad_flag~amount_6+pur_6+avg_pur_amt_6+avg_interval_pur_6+credit_limit+age+income+sex+education+marital_status
|
||||
## Data: data[train, c(target, vars)]</code></pre>
|
||||
<pre class="r"><code># Produce a prediction function that can use the model
|
||||
|
||||
creditRiskPrediction <- function(account_id, amount_6, pur_6, avg_pur_amt_6, avg_interval_pur_6,
|
||||
credit_limit, marital_status, sex, education, income, age)
|
||||
{
|
||||
staticdata <- data.frame(account_id=c("a_1055521381828530", "a_1055521125532160", "a_914800337488587", "a_844428146542225", "a_844428047550192"),
|
||||
amount_6=c(106.99, 212.49, 118.00, 151.19, 148.39),
|
||||
pur_6=c(1, 1, 1, 1, 1),
|
||||
avg_pur_amt_6=c(106.99, 212.49, 118.00, 151.19, 148.39),
|
||||
avg_interval_pur_6=c(0, 0, 0, 0, 0),
|
||||
credit_limit=c(2.21, -0.96, 3.82, -0.39, -0.45),
|
||||
marital_status=c("single", "married", "single", "married", "single"),
|
||||
sex=c("female", "male", "male", "female", "female"),
|
||||
education=c("master", "high_school", "middle_school",
|
||||
"polytechnics", "undergraduate"),
|
||||
income=c(0, 6.27, 17.15, 9.94, 19.93),
|
||||
age=c(33, 47, 30, 51, 32))
|
||||
|
||||
inputdata <- data.frame(account_id=account_id,
|
||||
{
|
||||
newdata <- data.frame(account_id=account_id,
|
||||
amount_6=amount_6,
|
||||
pur_6=pur_6,
|
||||
avg_pur_amt_6=avg_pur_amt_6,
|
||||
|
@ -305,13 +271,10 @@ creditRiskPrediction <- function(account_id, amount_6, pur_6, avg_pur_amt_6,
|
|||
income=income,
|
||||
age=age)
|
||||
|
||||
newdata <- rbind(inputdata, staticdata)
|
||||
|
||||
pred <- rxPredict(model_rxtrees, data=newdata)[, c(1, 3)]
|
||||
pred <- rxPredict(modelObject=model_rxtrees, data=newdata)[, c(1, 3)]
|
||||
pred <- cbind(newdata$account_id, pred)
|
||||
names(pred) <- c("account_id", "scored_label", "scored_prob")
|
||||
pred <- pred[1, ]
|
||||
pred
|
||||
pred
|
||||
}
|
||||
|
||||
# Test function locally by printing results
|
||||
|
@ -328,10 +291,10 @@ pred <- creditRiskPrediction(account_id="a_1055521029582310",
|
|||
income=12.36,
|
||||
age=38)</code></pre>
|
||||
<pre><code>## Beginning processing data.
|
||||
## Rows Read: 6, Read Time: 0, Transform Time: 0
|
||||
## Rows Read: 1, Read Time: 0.001, Transform Time: 0
|
||||
## Beginning processing data.
|
||||
## Elapsed time: 00:00:00.1521922
|
||||
## Finished writing 6 rows.
|
||||
## Elapsed time: 00:00:00.3347313
|
||||
## Finished writing 1 rows.
|
||||
## Writing completed.</code></pre>
|
||||
<pre class="r"><code>print(pred)</code></pre>
|
||||
<pre><code>## account_id scored_label scored_prob
|
||||
|
@ -376,7 +339,7 @@ api <- publishService(
|
|||
</div>
|
||||
<div id="test-the-service-by-consuming-it-in-r" class="section level3">
|
||||
<h3>2.3 Test the service by consuming it in R</h3>
|
||||
<p>Finally, we can consume the service in R directly after publishing it to verify that the results are as expected.</p>
|
||||
<p>After publishing it , we can consume the service in R directly to verify that the results are as expected.</p>
|
||||
<pre class="r"><code># Get service and assign service to the variable `api`.
|
||||
|
||||
api <- getService("crpService", "v1.0.0")
|
||||
|
@ -400,6 +363,52 @@ result <- api$creditRiskPrediction(account_id="a_1055521029582310",
|
|||
print(result$output("pred")) </code></pre>
|
||||
<pre><code>## account_id scored_label scored_prob
|
||||
## 1 a_1055521029582310 no 0.0250131245702505</code></pre>
|
||||
</div>
|
||||
<div id="update-the-web-service" class="section level3">
|
||||
<h3>2.4 Update the web service</h3>
|
||||
<p>In the process of production, we could manage and update the web service timely.</p>
|
||||
<pre class="r"><code># Load the pre-trained optimal model obtained from the template of CreditRiskScale.
|
||||
|
||||
load(file="model_rxtrees.RData")
|
||||
|
||||
model_rxtrees</code></pre>
|
||||
<pre><code>## Call:
|
||||
## rxFastTrees(formula = form, data = data_split[[1]], type = "binary",
|
||||
## numTrees = 500, numLeaves = 20, learningRate = 0.3, minSplit = 10,
|
||||
## unbalancedSets = FALSE, verbose = 0)
|
||||
##
|
||||
## FastTreeBinaryClassification (BinaryClassifierTrainer) for: bad_flag~amount_6+pur_6+avg_pur_amt_6+avg_interval_pur_6+credit_limit+age+income+sex+education+marital_status
|
||||
## Data: data_split[[1]] (RxXdfData Data Source)
|
||||
## File name:
|
||||
## C:\Demo\acceleratoRs-master\CreditRiskPrediction\Code\file4b9418067c23..train.1.xdf</code></pre>
|
||||
<pre class="r"><code>api <- updateService(name="crpService",
|
||||
v="v1.0.0",
|
||||
model=model_rxtrees,
|
||||
descr="Update the model hyper-parameters")
|
||||
|
||||
# Re-test the updated service by consuming it
|
||||
|
||||
result <- api$creditRiskPrediction(account_id="a_1055521029582310",
|
||||
amount_6=173.22,
|
||||
pur_6=1,
|
||||
avg_pur_amt_6=173.22,
|
||||
avg_interval_pur_6=0,
|
||||
credit_limit=5.26,
|
||||
marital_status="married",
|
||||
sex="male",
|
||||
education="undergraduate",
|
||||
income=12.36,
|
||||
age=38)
|
||||
|
||||
# Print response output named `answer`
|
||||
|
||||
print(result$output("pred")) </code></pre>
|
||||
<pre><code>## account_id scored_label scored_prob
|
||||
## 1 a_1055521029582310 no 0.0156644005328417</code></pre>
|
||||
</div>
|
||||
<div id="application-integration" class="section level3">
|
||||
<h3>2.5 Application Integration</h3>
|
||||
<p>Last but not least, we can get the json file that is needed for application integration.</p>
|
||||
<pre class="r"><code># Get this service's `swagger.json` file that is needed for web application integration
|
||||
|
||||
swagger <- api$swagger(json = FALSE)
|
||||
|
|
|
@ -1,449 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"title: \"Deploy a Credit Risk Model as a Web Service\"\n",
|
||||
"author: \"Fang Zhou, Data Scientist, Microsoft\"\n",
|
||||
"date: \"`r Sys.Date()`\"\n",
|
||||
"output: html_document\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"id": "",
|
||||
"include": "FALSE,",
|
||||
"purl": "FALSE"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"knitr::opts_chunk$set(echo = TRUE,\n",
|
||||
" fig.width = 8,\n",
|
||||
" fig.height = 5,\n",
|
||||
" fig.align='center',\n",
|
||||
" dev = \"png\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1 Introduction\n",
|
||||
"\n",
|
||||
"This document will walk through you how to deploy a credit risk model as a web service,\n",
|
||||
"using the `mrsdeploy` package that ships with Microsoft R Client and R Server. \n",
|
||||
"\n",
|
||||
"It will start by creating the model locally, then publish it as a web service, and then share it \n",
|
||||
"with other authenticated users for consumption. \n",
|
||||
"\n",
|
||||
"## 2 Automated Credit Risk Model Deployment\n",
|
||||
"\n",
|
||||
"### 2.1 Setup\n",
|
||||
"\n",
|
||||
"We load the required R packages."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"# Load the required packages into the R session.\n",
|
||||
"\n",
|
||||
"library(rattle) # Use normVarNames().\n",
|
||||
"library(dplyr) # Wrangling: tbl_df(), group_by(), print(), glimpse().\n",
|
||||
"library(magrittr) # Pipe operator %>% %<>% %T>% equals().\n",
|
||||
"library(scales) # Include commas in numbers.\n",
|
||||
"library(MicrosoftML) # Build models using Microsoft ML algortihms.\n",
|
||||
"library(mrsdeploy) # Publish an R model as a web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Then, the dataset processedSimu is ingested for demonstration. This dataset was created by the data preprocessing steps in the data science accelerator for credit risk prediction."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Data Ingestion\n",
|
||||
"\n",
|
||||
"# Identify the source location of the dataset.\n",
|
||||
"\n",
|
||||
"#DATA <- \"../../Data/\"\n",
|
||||
"#txn_fname <- file.path(DATA, \"Raw/processedSimu.csv\")\n",
|
||||
"\n",
|
||||
"wd <- getwd()\n",
|
||||
"\n",
|
||||
"dpath <- \"../Data\"\n",
|
||||
"data_fname <- file.path(wd, dpath, \"processedSimu.csv\")\n",
|
||||
"\n",
|
||||
"# Ingest the dataset.\n",
|
||||
"\n",
|
||||
"data <- read.csv(file=data_fname) %T>% \n",
|
||||
" {dim(.) %>% comma() %>% cat(\"\\n\")}\n",
|
||||
"\n",
|
||||
"# A glimpse into the data.\n",
|
||||
"\n",
|
||||
"glimpse(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2 Model Locally\n",
|
||||
"\n",
|
||||
"Now, let's get started to build an R model based web service. \n",
|
||||
"\n",
|
||||
"First of all, we create a machine learning fast tree model on the dataset processedSimu by using the function `rxFastTrees()` from the `MicrosoftML` package. This model could be used to predict whether an account will default or to predict its probability of default, given some transaction statistics and demographic & bank account information as inputs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Variable roles.\n",
|
||||
"\n",
|
||||
"# Target variable\n",
|
||||
"\n",
|
||||
"target <- \"bad_flag\"\n",
|
||||
"\n",
|
||||
"# Note any identifier.\n",
|
||||
"\n",
|
||||
"id <- c(\"account_id\") %T>% print() \n",
|
||||
"\n",
|
||||
"# Note the available variables as model inputs.\n",
|
||||
"\n",
|
||||
"vars <- setdiff(names(data), c(target, id))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Split Data\n",
|
||||
"\n",
|
||||
"set.seed(42)\n",
|
||||
"\n",
|
||||
"data <- data[order(runif(nrow(data))), ]\n",
|
||||
"\n",
|
||||
"train <- sample(nrow(data), 0.70 * nrow(data))\n",
|
||||
"test <- setdiff(seq_len(nrow(data)), train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prepare the formula\n",
|
||||
"\n",
|
||||
"top_vars <- c(\"amount_6\", \"pur_6\", \"avg_pur_amt_6\", \"avg_interval_pur_6\", \"credit_limit\", \"age\", \"income\", \"sex\", \"education\", \"marital_status\")\n",
|
||||
"\n",
|
||||
"form <- as.formula(paste(target, paste(top_vars, collapse=\"+\"), sep=\"~\"))\n",
|
||||
"form"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Train model: rxFastTrees\n",
|
||||
"\n",
|
||||
"model_rxtrees <- rxFastTrees(formula=form,\n",
|
||||
" data=data[train, c(target, vars)],\n",
|
||||
" type=\"binary\",\n",
|
||||
" numTrees=100,\n",
|
||||
" numLeaves=20,\n",
|
||||
" learningRate=0.2,\n",
|
||||
" minSplit=10)\n",
|
||||
"\n",
|
||||
"summary(model_rxtrees)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Produce a prediction function that can use the model\n",
|
||||
"\n",
|
||||
"creditRiskPrediction <- function(account_id, amount_6, pur_6, avg_pur_amt_6, avg_interval_pur_6, \n",
|
||||
" credit_limit, marital_status, sex, education, income, age)\n",
|
||||
"{\n",
|
||||
" staticdata <- data.frame(account_id=c(\"a_1055521381828530\", \"a_1055521125532160\", \"a_914800337488587\", \"a_844428146542225\", \"a_844428047550192\"),\n",
|
||||
" amount_6=c(106.99, 212.49, 118.00, 151.19, 148.39),\n",
|
||||
" pur_6=c(1, 1, 1, 1, 1),\n",
|
||||
" avg_pur_amt_6=c(106.99, 212.49, 118.00, 151.19, 148.39),\n",
|
||||
" avg_interval_pur_6=c(0, 0, 0, 0, 0),\n",
|
||||
" credit_limit=c(2.21, -0.96, 3.82, -0.39, -0.45),\n",
|
||||
" marital_status=c(\"single\", \"married\", \"single\", \"married\", \"single\"),\n",
|
||||
" sex=c(\"female\", \"male\", \"male\", \"female\", \"female\"),\n",
|
||||
" education=c(\"master\", \"high_school\", \"middle_school\", \n",
|
||||
" \"polytechnics\", \"undergraduate\"),\n",
|
||||
" income=c(0, 6.27, 17.15, 9.94, 19.93),\n",
|
||||
" age=c(33, 47, 30, 51, 32))\n",
|
||||
" \n",
|
||||
" inputdata <- data.frame(account_id=account_id,\n",
|
||||
" amount_6=amount_6, \n",
|
||||
" pur_6=pur_6, \n",
|
||||
" avg_pur_amt_6=avg_pur_amt_6, \n",
|
||||
" avg_interval_pur_6=avg_interval_pur_6, \n",
|
||||
" credit_limit=credit_limit, \n",
|
||||
" marital_status=marital_status, \n",
|
||||
" sex=sex, \n",
|
||||
" education=education, \n",
|
||||
" income=income, \n",
|
||||
" age=age)\n",
|
||||
" \n",
|
||||
" newdata <- rbind(inputdata, staticdata)\n",
|
||||
" \n",
|
||||
" pred <- rxPredict(model_rxtrees, data=newdata)[, c(1, 3)]\n",
|
||||
" pred <- cbind(newdata$account_id, pred)\n",
|
||||
" names(pred) <- c(\"account_id\", \"scored_label\", \"scored_prob\")\n",
|
||||
" pred <- pred[1, ]\n",
|
||||
" pred\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Test function locally by printing results\n",
|
||||
"\n",
|
||||
"pred <- creditRiskPrediction(account_id=\"a_1055521029582310\",\n",
|
||||
" amount_6=173.22, \n",
|
||||
" pur_6=1, \n",
|
||||
" avg_pur_amt_6=173.22, \n",
|
||||
" avg_interval_pur_6=0, \n",
|
||||
" credit_limit=5.26, \n",
|
||||
" marital_status=\"married\", \n",
|
||||
" sex=\"male\", \n",
|
||||
" education=\"undergraduate\", \n",
|
||||
" income=12.36, \n",
|
||||
" age=38)\n",
|
||||
"\n",
|
||||
"print(pred)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.2 Publish model as a web service\n",
|
||||
"\n",
|
||||
"The second procedure is to publish the model as a web service by following the below steps.\n",
|
||||
"\n",
|
||||
"Step 1: From your local R IDE, log into Microsoft R Server with your credentials using the appropriate authentication function from the `mrsdeploy` package (remoteLogin or remoteLoginAAD). \n",
|
||||
"\n",
|
||||
"For simplicity, the code below uses the basic local admin account for authentication with the remoteLogin function and `session = false` so that no remote R session is started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Use `remoteLogin` to authenticate with R Server using \n",
|
||||
"# the local admin account. Use session = false so no \n",
|
||||
"# remote R session started\n",
|
||||
"\n",
|
||||
"remoteLogin(\"http://localhost:12800\", \n",
|
||||
" username=\"admin\", \n",
|
||||
" password=\"P@ssw0rd\",\n",
|
||||
" session=FALSE)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, you are successfully connected to the remote R Server.\n",
|
||||
"\n",
|
||||
"Step 2: Publish the model as a web service to R Server using the `publishService()` function from the `mrsdeploy` package. \n",
|
||||
"\n",
|
||||
"In this example, you publish a web service called \"crpService\" using the model `model_rxtrees` and the function `creditRiskPrediction()`. As an input, the service takes a list of transaction statistics and demographic & bank account information represented as numerical or categorical. As an output, an R data frame including the account id, the predicted label of default, and the probability of default for the given individual account, has of being achieved with the pre-defined credit risk prediction function. \n",
|
||||
"\n",
|
||||
"When publishing, you must specify, among other parameters, a service name and version, the R code, the inputs, as well as the outputs that application developers will need to integrate in their applications."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "FALSE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Publish a web service\n",
|
||||
"\n",
|
||||
"api <- publishService(\n",
|
||||
" \"crpService\",\n",
|
||||
" code=creditRiskPrediction,\n",
|
||||
" model=model_rxtrees,\n",
|
||||
" inputs=list(account_id=\"character\",\n",
|
||||
" amount_6=\"numeric\", \n",
|
||||
" pur_6=\"numeric\", \n",
|
||||
" avg_pur_amt_6=\"numeric\", \n",
|
||||
" avg_interval_pur_6=\"numeric\", \n",
|
||||
" credit_limit=\"numeric\", \n",
|
||||
" marital_status=\"character\", \n",
|
||||
" sex=\"character\", \n",
|
||||
" education=\"character\", \n",
|
||||
" income=\"numeric\", \n",
|
||||
" age=\"numeric\"),\n",
|
||||
" outputs=list(pred=\"data.frame\"),\n",
|
||||
" v=\"v1.0.0\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 2.3 Test the service by consuming it in R\n",
|
||||
"\n",
|
||||
"Finally, we can consume the service in R directly after publishing it to verify that the results are as expected."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"attributes": {
|
||||
"classes": [],
|
||||
"error": "TRUE",
|
||||
"id": "",
|
||||
"message": "FALSE,",
|
||||
"warning": "FALSE,"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get service and assign service to the variable `api`.\n",
|
||||
"\n",
|
||||
"api <- getService(\"crpService\", \"v1.0.0\")\n",
|
||||
"\n",
|
||||
"# Consume service by calling function, `creditRiskPrediction` contained in this service\n",
|
||||
"\n",
|
||||
"result <- api$creditRiskPrediction(account_id=\"a_1055521029582310\",\n",
|
||||
" amount_6=173.22, \n",
|
||||
" pur_6=1, \n",
|
||||
" avg_pur_amt_6=173.22, \n",
|
||||
" avg_interval_pur_6=0, \n",
|
||||
" credit_limit=5.26, \n",
|
||||
" marital_status=\"married\", \n",
|
||||
" sex=\"male\", \n",
|
||||
" education=\"undergraduate\", \n",
|
||||
" income=12.36, \n",
|
||||
" age=38)\n",
|
||||
"\n",
|
||||
"# Print response output named `answer`\n",
|
||||
"\n",
|
||||
"print(result$output(\"pred\")) \n",
|
||||
"\n",
|
||||
"# Get this service's `swagger.json` file that is needed for web application integration\n",
|
||||
"\n",
|
||||
"swagger <- api$swagger(json = FALSE)\n",
|
||||
"\n",
|
||||
"# Delete the service to make the script re-runable\n",
|
||||
"\n",
|
||||
"deleteService(name=\"crpService\", v=\"v1.0.0\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
|
@ -77,7 +77,9 @@ rxGetVarInfo(data)
|
|||
|
||||
Now, let's get started building credit risk models by leveraging different machine learning algorithms from the `MicrosoftML` package.
|
||||
|
||||
First of all, we create individual machine learning models on the dataset processedSimu by using the functions `rxLogisticRegression()`, `rxFastTrees()`, and `rxFastForest()`.
|
||||
First of all, we create individual machine learning models on the dataset processedSimu.xdf by using the functions `rxLogisticRegression()`, `rxFastForest()`, and `rxFastTrees()`.
|
||||
|
||||
From the credit risk prediction template, we know that gradient boosting is the most suitable algorithm for this example, considering overall performance. Therefore, several `rxFastTrees()` models are trained, each with a different set of parameters.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=FALSE}
|
||||
## Variable roles.
|
||||
|
@ -100,9 +102,6 @@ vars <- setdiff(names(data), c(target, id))
|
|||
|
||||
set.seed(42)
|
||||
|
||||
train <- sample(nrow(data), 0.70 * nrow(data))
|
||||
test <- setdiff(seq_len(nrow(data)), train)
|
||||
|
||||
# Add training/testing flag to each observation.
|
||||
|
||||
data %<>%
|
||||
|
@ -138,21 +137,9 @@ time_rxlogit <- system.time(
|
|||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
l1Weight=1)
|
||||
)
|
||||
|
||||
# Train model: rxFastTrees
|
||||
|
||||
time_rxtrees <- system.time(
|
||||
|
||||
model_rxtrees <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10)
|
||||
l1Weight=1,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
# Train model: rxFastForest
|
||||
|
@ -165,11 +152,75 @@ time_rxforest <- system.time(
|
|||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
minSplit=10)
|
||||
minSplit=10,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
# Train model: rxFastTrees
|
||||
|
||||
time_rxtrees1 <- system.time(
|
||||
|
||||
model_rxtrees1 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees2 <- system.time(
|
||||
|
||||
model_rxtrees2 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees3 <- system.time(
|
||||
|
||||
model_rxtrees3 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.3,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees4 <- system.time(
|
||||
|
||||
model_rxtrees4 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.3,
|
||||
minSplit=10,
|
||||
unbalancedSets=TRUE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
Next, we build an ensemble of models by using the function `rxEnsemble()`.
|
||||
Next, we build an ensemble of fast tree models by using the function `rxEnsemble()`.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=FALSE}
|
||||
# Train an ensemble model.
|
||||
|
@ -180,20 +231,27 @@ time_ensemble <- system.time(
|
|||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
trainers=list(logisticRegression(), fastTrees(), fastForest()),
|
||||
replace=TRUE
|
||||
trainers=list(fastTrees(),
|
||||
fastTrees(numTrees=500),
|
||||
fastTrees(numTrees=500, learningRate=0.3),
|
||||
fastTrees(numTrees=500, learningRate=0.3, unbalancedSets=TRUE)),
|
||||
combineMethod="vote",
|
||||
replace=TRUE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### 2.3 Model Evaluation
|
||||
|
||||
Finally, we evaluate and compare the above built models.
|
||||
Finally, we evaluate and compare the models built above from various aspects.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=FALSE}
|
||||
# Predict
|
||||
|
||||
models <- list(model_rxlogit, model_rxtrees, model_rxforest, model_ensemble)
|
||||
models <- list(model_rxlogit, model_rxforest,
|
||||
model_rxtrees1, model_rxtrees2, model_rxtrees3, model_rxtrees4,
|
||||
model_ensemble)
|
||||
|
||||
# Predict class
|
||||
|
||||
|
@ -202,7 +260,7 @@ predictions <- lapply(models,
|
|||
data=data_split[[2]]) %>%
|
||||
lapply('[[', 1)
|
||||
|
||||
levels(predictions[[4]]) <- c("no", "yes")
|
||||
levels(predictions[[7]]) <- c("no", "yes")
|
||||
|
||||
# Confusion matrix evaluation results.
|
||||
|
||||
|
@ -237,7 +295,7 @@ pre_metrics <-
|
|||
|
||||
# Predict class probability
|
||||
|
||||
probs <- lapply(models[c(1, 2, 3)],
|
||||
probs <- lapply(models[c(1, 2, 3, 4, 5, 6)],
|
||||
rxPredict,
|
||||
data=data_split[[2]]) %>%
|
||||
lapply('[[', 3)
|
||||
|
@ -259,8 +317,18 @@ auc_metrics <- lapply(preds,
|
|||
|
||||
auc_metrics <- c(auc_metrics, NaN)
|
||||
|
||||
algo_list <- c("rxLogisticRegression", "rxFastTrees", "rxFastForest", "rxEnsemble")
|
||||
time_consumption <- c(time_rxlogit[3], time_rxtrees[3], time_rxforest[[3]], time_ensemble[3])
|
||||
algo_list <- c("rxLogisticRegression",
|
||||
"rxFastForest",
|
||||
"rxFastTrees",
|
||||
"rxFastTrees(500)",
|
||||
"rxFastTrees(500, 0.3)",
|
||||
"rxFastTrees(500, 0.3, ub)",
|
||||
"rxEnsemble")
|
||||
|
||||
time_consumption <- c(time_rxlogit[3], time_rxforest[[3]],
|
||||
time_rxtrees1[3], time_rxtrees2[[3]],
|
||||
time_rxtrees3[[3]], time_rxtrees4[[3]],
|
||||
time_ensemble[3])
|
||||
|
||||
df_comp <-
|
||||
data.frame(Models=algo_list,
|
||||
|
@ -269,6 +337,18 @@ df_comp <-
|
|||
Precision=pre_metrics,
|
||||
AUC=auc_metrics,
|
||||
Time=time_consumption) %T>%
|
||||
{head(.) %>% print()}
|
||||
print()
|
||||
```
|
||||
|
||||
### 2.4 Save Models for Deployment
|
||||
|
||||
Last but not least, we need to save the model objects in various formats (e.g., `.RData`, `SQLServerData`, etc.) for later use in deployment.
|
||||
|
||||
```{r, message=FALSE, warning=FALSE, error=FALSE}
|
||||
# Save model for deployment usage.
|
||||
|
||||
model_rxtrees <- model_rxtrees3
|
||||
|
||||
save(model_rxtrees, file="model_rxtrees.RData")
|
||||
```
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
<meta name="author" content="Fang Zhou, Data Scientist, Microsoft" />
|
||||
|
||||
<meta name="date" content="2017-05-17" />
|
||||
<meta name="date" content="2017-05-18" />
|
||||
|
||||
<title>Faster and Scalable Credit Risk Prediction</title>
|
||||
|
||||
|
@ -119,7 +119,7 @@ $(document).ready(function () {
|
|||
|
||||
<h1 class="title toc-ignore">Faster and Scalable Credit Risk Prediction</h1>
|
||||
<h4 class="author"><em>Fang Zhou, Data Scientist, Microsoft</em></h4>
|
||||
<h4 class="date"><em>2017-05-17</em></h4>
|
||||
<h4 class="date"><em>2017-05-18</em></h4>
|
||||
|
||||
</div>
|
||||
|
||||
|
@ -210,7 +210,8 @@ rxGetVarInfo(data)</code></pre>
|
|||
<div id="model-building" class="section level3">
|
||||
<h3>2.2 Model Building</h3>
|
||||
<p>Now, let’s start building credit risk models by leveraging different machine learning algorithms from the <code>MicrosoftML</code> package.</p>
|
||||
<p>First of all, we create individual machine learning models on the dataset processedSimu by using the functions <code>rxLogisticRegression()</code>, <code>rxFastTrees()</code>, and <code>rxFastForest()</code>.</p>
|
||||
<p>First of all, we create individual machine learning models on the dataset processedSimu.xdf by using the functions <code>rxLogisticRegression()</code>, <code>rxFastForest()</code>, and <code>rxFastTrees()</code>.</p>
|
||||
<p>From the credit risk prediction template, we know that gradient boosting is the most suitable algorithm for this example, considering the overall performance. Therefore, we train several models with the function <code>rxFastTrees()</code>, each using a different set of parameters.</p>
|
||||
<pre class="r"><code>## Variable roles.
|
||||
|
||||
# Target variable
|
||||
|
@ -228,9 +229,6 @@ vars <- setdiff(names(data), c(target, id))</code></pre>
|
|||
|
||||
set.seed(42)
|
||||
|
||||
train <- sample(nrow(data), 0.70 * nrow(data))
|
||||
test <- setdiff(seq_len(nrow(data)), train)
|
||||
|
||||
# Add training/testing flag to each observation.
|
||||
|
||||
data %<>%
|
||||
|
@ -262,7 +260,9 @@ time_rxlogit <- system.time(
|
|||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
l1Weight=1)
|
||||
l1Weight=1,
|
||||
verbose=0
|
||||
)
|
||||
)</code></pre>
|
||||
<pre><code>## Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
|
||||
## LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory issues, turn off multi-threading by setting trainThreads to 1.
|
||||
|
@ -271,30 +271,8 @@ time_rxlogit <- system.time(
|
|||
## improvement criterion: Mean Improvement
|
||||
## L1 regularization selected 13 of 17 weights.
|
||||
## Not training a calibrator because it is not needed.
|
||||
## Elapsed time: 00:00:01.9451805
|
||||
## Elapsed time: 00:00:00.2341484</code></pre>
|
||||
<pre class="r"><code># Train model: rxFastTrees
|
||||
|
||||
time_rxtrees <- system.time(
|
||||
|
||||
model_rxtrees <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10)
|
||||
)</code></pre>
|
||||
<pre><code>## Not adding a normalizer.
|
||||
## Making per-feature arrays
|
||||
## Changing data from row-wise to column-wise
|
||||
## Processed 129155 instances
|
||||
## Binning and forming Feature objects
|
||||
## Reserved memory for tree learner: 219024 bytes
|
||||
## Starting to train ...
|
||||
## Not training a calibrator because it is not needed.
|
||||
## Elapsed time: 00:00:01.4975066</code></pre>
|
||||
## Elapsed time: 00:00:01.8597467
|
||||
## Elapsed time: 00:00:00.2332643</code></pre>
|
||||
<pre class="r"><code># Train model: rxFastForest
|
||||
|
||||
time_rxforest <- system.time(
|
||||
|
@ -305,18 +283,73 @@ time_rxforest <- system.time(
|
|||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
minSplit=10)
|
||||
minSplit=10,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
# Train model: rxFastTrees
|
||||
|
||||
time_rxtrees1 <- system.time(
|
||||
|
||||
model_rxtrees1 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=100,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees2 <- system.time(
|
||||
|
||||
model_rxtrees2 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.2,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees3 <- system.time(
|
||||
|
||||
model_rxtrees3 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.3,
|
||||
minSplit=10,
|
||||
unbalancedSets=FALSE,
|
||||
verbose=0
|
||||
)
|
||||
)
|
||||
|
||||
time_rxtrees4 <- system.time(
|
||||
|
||||
model_rxtrees4 <- rxFastTrees(
|
||||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
numTrees=500,
|
||||
numLeaves=20,
|
||||
learningRate=0.3,
|
||||
minSplit=10,
|
||||
unbalancedSets=TRUE,
|
||||
verbose=0
|
||||
)
|
||||
)</code></pre>
|
||||
<pre><code>## Not adding a normalizer.
|
||||
## Making per-feature arrays
|
||||
## Changing data from row-wise to column-wise
|
||||
## Processed 129155 instances
|
||||
## Binning and forming Feature objects
|
||||
## Reserved memory for tree learner: 219024 bytes
|
||||
## Starting to train ...
|
||||
## Training calibrator.
|
||||
## Elapsed time: 00:00:02.2617353</code></pre>
|
||||
<p>Next, we build an ensemble of models by using the function <code>rxEnsemble()</code>.</p>
|
||||
<p>Next, we build an ensemble of fast tree models by using the function <code>rxEnsemble()</code>.</p>
|
||||
<pre class="r"><code># Train an ensemble model.
|
||||
|
||||
time_ensemble <- system.time(
|
||||
|
@ -325,18 +358,25 @@ time_ensemble <- system.time(
|
|||
formula=form,
|
||||
data=data_split[[1]],
|
||||
type="binary",
|
||||
trainers=list(logisticRegression(), fastTrees(), fastForest()),
|
||||
replace=TRUE
|
||||
trainers=list(fastTrees(),
|
||||
fastTrees(numTrees=500),
|
||||
fastTrees(numTrees=500, learningRate=0.3),
|
||||
fastTrees(numTrees=500, learningRate=0.3, unbalancedSets=TRUE)),
|
||||
combineMethod="vote",
|
||||
replace=TRUE,
|
||||
verbose=0
|
||||
)
|
||||
)</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.4024905</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.6069022</code></pre>
|
||||
</div>
|
||||
<div id="model-evaluation" class="section level3">
|
||||
<h3>2.3 Model Evaluation</h3>
|
||||
<p>Finally, we evaluate and compare the above built models.</p>
|
||||
<p>Finally, we evaluate and compare the models built above from various aspects.</p>
|
||||
<pre class="r"><code># Predict
|
||||
|
||||
models <- list(model_rxlogit, model_rxtrees, model_rxforest, model_ensemble)
|
||||
models <- list(model_rxlogit, model_rxforest,
|
||||
model_rxtrees1, model_rxtrees2, model_rxtrees3, model_rxtrees4,
|
||||
model_ensemble)
|
||||
|
||||
# Predict class
|
||||
|
||||
|
@ -344,11 +384,14 @@ predictions <- lapply(models,
|
|||
rxPredict,
|
||||
data=data_split[[2]]) %>%
|
||||
lapply('[[', 1)</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.3329582
|
||||
## Elapsed time: 00:00:00.5682058
|
||||
## Elapsed time: 00:00:00.5726077
|
||||
## Elapsed time: 00:00:00.9745613</code></pre>
|
||||
<pre class="r"><code>levels(predictions[[4]]) <- c("no", "yes")
|
||||
<pre><code>## Elapsed time: 00:00:00.3323721
|
||||
## Elapsed time: 00:00:00.6021822
|
||||
## Elapsed time: 00:00:00.5783706
|
||||
## Elapsed time: 00:00:03.2957191
|
||||
## Elapsed time: 00:00:03.1592642
|
||||
## Elapsed time: 00:00:03.3192340
|
||||
## Elapsed time: 00:00:05.7721651</code></pre>
|
||||
<pre class="r"><code>levels(predictions[[7]]) <- c("no", "yes")
|
||||
|
||||
# Confusion matrix evaluation results.
|
||||
|
||||
|
@ -383,13 +426,16 @@ pre_metrics <-
|
|||
|
||||
# Predict class probability
|
||||
|
||||
probs <- lapply(models[c(1, 2, 3)],
|
||||
probs <- lapply(models[c(1, 2, 3, 4, 5, 6)],
|
||||
rxPredict,
|
||||
data=data_split[[2]]) %>%
|
||||
lapply('[[', 3)</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.2420659
|
||||
## Elapsed time: 00:00:00.5683409
|
||||
## Elapsed time: 00:00:00.5884139</code></pre>
|
||||
<pre><code>## Elapsed time: 00:00:00.2500800
|
||||
## Elapsed time: 00:00:00.6540367
|
||||
## Elapsed time: 00:00:00.5695867
|
||||
## Elapsed time: 00:00:03.3178125
|
||||
## Elapsed time: 00:00:03.1851176
|
||||
## Elapsed time: 00:00:03.2898501</code></pre>
|
||||
<pre class="r"><code># Create prediction object
|
||||
|
||||
preds <- lapply(probs,
|
||||
|
@ -407,8 +453,18 @@ auc_metrics <- lapply(preds,
|
|||
|
||||
auc_metrics <- c(auc_metrics, NaN)
|
||||
|
||||
algo_list <- c("rxLogisticRegression", "rxFastTrees", "rxFastForest", "rxEnsemble")
|
||||
time_consumption <- c(time_rxlogit[3], time_rxtrees[3], time_rxforest[[3]], time_ensemble[3])
|
||||
algo_list <- c("rxLogisticRegression",
|
||||
"rxFastForest",
|
||||
"rxFastTrees",
|
||||
"rxFastTrees(500)",
|
||||
"rxFastTrees(500, 0.3)",
|
||||
"rxFastTrees(500, 0.3, ub)",
|
||||
"rxEnsemble")
|
||||
|
||||
time_consumption <- c(time_rxlogit[3], time_rxforest[[3]],
|
||||
time_rxtrees1[3], time_rxtrees2[[3]],
|
||||
time_rxtrees3[[3]], time_rxtrees4[[3]],
|
||||
time_ensemble[3])
|
||||
|
||||
df_comp <-
|
||||
data.frame(Models=algo_list,
|
||||
|
@ -417,12 +473,24 @@ df_comp <-
|
|||
Precision=pre_metrics,
|
||||
AUC=auc_metrics,
|
||||
Time=time_consumption) %T>%
|
||||
{head(.) %>% print()}</code></pre>
|
||||
<pre><code>## Models Accuracy Recall Precision AUC Time
|
||||
## 1 rxLogisticRegression 0.9712880 0.0000000 NaN 0.5700583 2.36
|
||||
## 2 rxFastTrees 0.9807982 0.4589128 0.7823276 0.9383639 1.61
|
||||
## 3 rxFastForest 0.9712880 0.0000000 NaN 0.6735370 2.36
|
||||
## 4 rxEnsemble 0.9712880 0.0000000 NaN NaN 6.28</code></pre>
|
||||
print()</code></pre>
|
||||
<pre><code>## Models Accuracy Recall Precision AUC Time
|
||||
## 1 rxLogisticRegression 0.9710578 0.0000000 NaN 0.5808235 2.27
|
||||
## 2 rxFastForest 0.9710578 0.0000000 NaN 0.6917648 2.57
|
||||
## 3 rxFastTrees 0.9805252 0.4527363 0.7827957 0.9392378 1.47
|
||||
## 4 rxFastTrees(500) 0.9877968 0.6915423 0.8593509 0.9833151 4.57
|
||||
## 5 rxFastTrees(500, 0.3) 0.9873108 0.7226368 0.8177340 0.9675594 4.42
|
||||
## 6 rxFastTrees(500, 0.3, ub) 0.9848989 0.9689055 0.6638262 0.9948528 4.23
|
||||
## 7 rxEnsemble 0.9833510 0.4471393 0.9523179 NaN 11.40</code></pre>
|
||||
</div>
|
||||
<div id="save-models-for-deployment" class="section level3">
|
||||
<h3>2.4 Save Models for Deployment</h3>
|
||||
<p>Last but not least, we need to save the model objects in various formats (e.g., <code>.RData</code>, <code>SQLServerData</code>, etc.) for later use in deployment.</p>
|
||||
<pre class="r"><code># Save model for deployment usage.
|
||||
|
||||
model_rxtrees <- model_rxtrees3
|
||||
|
||||
save(model_rxtrees, file="model_rxtrees.RData")</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
@ -9,9 +9,13 @@ Some other critical R packages for the analysis:
|
|||
* glmnet >= 2.0-5 Logistic regression model with L1 and L2 regularization.
|
||||
* xgboost >= 0.6-4 Extreme gradient boosting model.
|
||||
* randomForest >= 4.6-12 Random Forest model.
|
||||
* caret >= 6.0-73 Classification and regression training.
|
||||
* caretEnsemble >= 2.0.0 Ensemble of caret based models.
|
||||
* dplyrXdf Out-of-Memory Data wrangling.
|
||||
|
||||
* RevoScaleR >= 9.1 Parallel and chunked data processing and modeling.
|
||||
* dplyrXdf >= 0.9.2 Out-of-Memory Data wrangling.
|
||||
* MicrosoftML >= 9.1 Microsoft machine learning models.
|
||||
|
||||
* mrsdeploy >= 9.1 R Server Operationalization.
|
||||
|
||||
# Use of template
|
||||
|
|
Загрузка…
Ссылка в новой задаче