update student drop out template

This commit is contained in:
Zhou Fang 2017-08-11 11:22:22 +08:00
Родитель 6da8c65394
Коммит 63f3288b5d
6 изменённых файлов: 1392 добавлений и 1134 удалений

Просмотреть файл

@ -20,7 +20,7 @@ knitr::opts_chunk$set(fig.width = 6,
```
# Introducation
# Introduction
Welcome to the Data Science Design Pattern for Student Drop Out. This pattern provides a starting point for the data scientist exploring a new dataset. By no means is it the end point of the data science journey. The pattern is under regular revision and improvement and is provided as is.
@ -69,7 +69,8 @@ We use the studentDropIndia dataset simulated from student data in UCI repositor
# Identify the source location of the dataset.
dspath <- "../../Data/studentDropIndia_20161215.csv"
#dspath <- "../../Data/studentDropIndia_20161215.csv"
dspath <- file.path("C:/Demo/EducationAnalytics/Data/studentDropIndia_20161215.csv")
# Ingest the dataset.
@ -527,7 +528,7 @@ form <- formula(target %s+% " ~ .") %T>% print()
# Use correlation search to identify key variables.
# Could be useful to decide which variables to retain.
cfs(form, ds[vars])
#cfs(form, ds[vars])
```
```{r, message=FALSE, warning=FALSE, error=FALSE}
@ -541,9 +542,9 @@ Or, we can use the function information.gain() to identify varaible importance a
```{r, message=FALSE, warning=FALSE, error=FALSE}
# Use information gain to identify variable importance.
information.gain(form, ds[vars]) %>%
rownames_to_column() %>%
arrange(attr_importance)
#information.gain(form, ds[vars]) %>%
# rownames_to_column() %>%
# arrange(attr_importance)
# Any variables to remove because not useful?
@ -818,5 +819,6 @@ save(ds, dsname, dspath, dsdate, nobs,
vars, target, id, ignore, omit,
inputi, inputs, numi, numc, cati, catc,
file=dsrdata)
```

Просмотреть файл

@ -73,7 +73,7 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<div id="TOC">
<ul>
<li><a href="#introducation">Introducation</a></li>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#pre-configuration">Pre-configuration</a></li>
<li><a href="#step-1.1-load---dataset">Step 1.1: Load - Dataset</a></li>
<li><a href="#step-1.2-load---generic-variables">Step 1.2: Load - Generic Variables</a></li>
@ -105,8 +105,8 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
</ul>
</div>
<div id="introducation" class="section level1">
<h1>Introducation</h1>
<div id="introduction" class="section level1">
<h1>Introduction</h1>
<p>Welcome to the Data Science Design Pattern for Student Drop Out. This pattern provides a starting point for the data scientist exploring a new dataset. By no means is it the end point of the data science journey. The pattern is under regular revision and improvement and is provided as is.</p>
<p>We now begin with the task of preparing our data for building models using R.</p>
</div>
@ -126,15 +126,14 @@ code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Inf
<span class="kw">library</span>(stringi) <span class="co"># String concat operator %s+%.</span>
<span class="kw">library</span>(stringr) <span class="co"># String manipulation: str_replace().</span>
<span class="kw">library</span>(randomForest) <span class="co"># Impute missing values with na.roughfix()</span>
<span class="kw">library</span>(FSelector) <span class="co"># Feature selection: information.gain().</span>
<span class="kw">library</span>(ggplot2) <span class="co"># Visualise data.</span>
<span class="kw">library</span>(tibble) <span class="co"># Table data frame: rownames_to_column()</span></code></pre></div>
<p>Let's define some utility functions which simplify the coding.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Useful utility functions.</span>
echo &lt;-<span class="st"> </span>function(x, <span class="dt">big.mark=</span><span class="st">&quot;,&quot;</span>, ...)
echo &lt;-<span class="st"> </span><span class="cf">function</span>(x, <span class="dt">big.mark=</span><span class="st">&quot;,&quot;</span>, ...)
{
<span class="kw">format</span>(x, <span class="dt">big.mark=</span>big.mark, ...) %&gt;%<span class="st"> </span><span class="kw">cat</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">&quot;</span>)
<span class="kw">format</span>(x, <span class="dt">big.mark=</span>big.mark, ...) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">cat</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">&quot;</span>)
}</code></pre></div>
</div>
<div id="step-1.1-load---dataset" class="section level1">
@ -145,7 +144,8 @@ echo &lt;-<span class="st"> </span>function(x, <span class="dt">big.mark=</span>
<span class="co"># Identify the source location of the dataset.</span>
dspath &lt;-<span class="st"> &quot;C:/Users/zhouf/Documents/Revolution Analytics/Projects/Education2/Demo/studentDropIndia_20161215.csv&quot;</span>
<span class="co">#dspath &lt;- &quot;../../Data/studentDropIndia_20161215.csv&quot;</span>
dspath &lt;-<span class="st"> </span><span class="kw">file.path</span>(<span class="st">&quot;C:/Demo/EducationAnalytics/Data/studentDropIndia_20161215.csv&quot;</span>)
<span class="co"># Ingest the dataset.</span>
@ -182,8 +182,8 @@ ds &lt;-<span class="st"> </span><span class="kw">get</span>(dsname)
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Save the dataset to disk as a binary backup if needed.</span>
fpath &lt;-<span class="st"> &quot;data&quot;</span>
fname &lt;-<span class="st"> </span><span class="kw">file.path</span>(fpath, dsname %s+%<span class="st"> &quot;.RData&quot;</span>)
if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span class="kw">dir.create</span>(fpath)
fname &lt;-<span class="st"> </span><span class="kw">file.path</span>(fpath, dsname <span class="op">%s+%</span><span class="st"> &quot;.RData&quot;</span>)
<span class="cf">if</span> (<span class="op">!</span><span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span class="kw">dir.create</span>(fpath)
<span class="kw">save</span>(studentDropIndia, <span class="dt">file=</span>fname)
<span class="co"># Remove the original dataset to save on memory.</span>
@ -192,7 +192,7 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
<span class="co"># Test the loading of the saved dataset and then cleanup.</span>
<span class="kw">load</span>(fname) %&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<span class="kw">load</span>(fname) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;studentDropIndia&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">rm</span>(studentDropIndia)</code></pre></div>
</div>
@ -204,9 +204,9 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
<span class="co"># Basic size information.</span>
<span class="kw">nrow</span>(ds) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<span class="kw">nrow</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ncol</span>(ds) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ncol</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 15</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># A glimpse into the dataset.</span>
@ -245,7 +245,7 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
## [13] &quot;total_students&quot; &quot;total_toilets&quot; &quot;establishment_year&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Normalise the variable names.</span>
<span class="kw">names</span>(ds) %&lt;&gt;%<span class="st"> </span><span class="kw">normVarNames</span>() %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<span class="kw">names</span>(ds) <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">normVarNames</span>() <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;student_id&quot; &quot;gender&quot;
## [4] &quot;caste&quot; &quot;mathematics_marks&quot; &quot;english_marks&quot;
## [7] &quot;science_marks&quot; &quot;science_teacher&quot; &quot;languages_teacher&quot;
@ -277,7 +277,7 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
<p>Once we have normalized the variable names, the next step is to understand the shape of the dataset. A first look at the sample observations by using head(), tail(), and sample_n() gives us an initial understanding of the data.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Review the first few observations.</span>
<span class="kw">head</span>(ds) %&gt;%<span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<span class="kw">head</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<pre><code>## continue_drop student_id gender caste mathematics_marks english_marks
## 1 continue s00001 F SC 0.409 0.514
## 2 continue s00002 F BC 0.290 0.512
@ -301,7 +301,7 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
## 6 00333 335 43 1916</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Review the last few observations.</span>
<span class="kw">tail</span>(ds) %&gt;%<span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<span class="kw">tail</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<pre><code>## continue_drop student_id gender caste mathematics_marks english_marks
## 1 continue s19095 M OC 0.478 0.642
## 2 continue s19096 M OC 0.323 0.429
@ -325,28 +325,28 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
## 6 00343 353 15 1957</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Review a random sample of observations.</span>
<span class="kw">sample_n</span>(ds, <span class="dt">size=</span><span class="dv">6</span>) %&gt;%<span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<span class="kw">sample_n</span>(ds, <span class="dt">size=</span><span class="dv">6</span>) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print.data.frame</span>()</code></pre></div>
<pre><code>## continue_drop student_id gender caste mathematics_marks english_marks
## 1 continue s04595 F OC 0.757 0.664
## 2 drop s13959 M BC 0.438 0.495
## 3 continue s03314 M SC 0.523 0.522
## 4 continue s06682 M BC 0.562 0.530
## 5 continue s13285 M BC 0.737 0.706
## 6 continue s09281 F BC 0.500 0.544
## 1 continue s15250 F OC 0.490 0.806
## 2 continue s18947 M BC 0.632 0.445
## 3 continue s16154 F ST 0.389 0.582
## 4 continue s10010 F SC 0.550 0.679
## 5 continue s16149 F BC 0.347 0.643
## 6 continue s10393 F OC 0.670 0.793
## science_marks science_teacher languages_teacher guardian internet
## 1 0.757 8 6 mother TRUE
## 2 0.438 5 3 mother TRUE
## 3 0.523 7 10 mother TRUE
## 4 0.562 8 5 father TRUE
## 5 0.737 2 2 mother TRUE
## 6 0.500 1 5 mother TRUE
## 1 0.490 5 6 mother FALSE
## 2 0.632 9 4 mother TRUE
## 3 0.389 4 6 mother TRUE
## 4 0.550 7 0 mother TRUE
## 5 0.347 1 6 mother TRUE
## 6 0.670 4 5 mother TRUE
## school_id total_students total_toilets establishment_year
## 1 00378 121 28 1971
## 2 00361 287 15 1992
## 3 00361 287 15 1992
## 4 00313 399 14 1916
## 5 00340 134 15 1976
## 6 00396 129 21 2006</code></pre>
## 1 00331 516 15 1996
## 2 00322 179 8 1955
## 3 00378 121 28 1971
## 4 00311 209 14 1976
## 5 00370 272 1 1961
## 6 00323 344 28 1961</code></pre>
</div>
<div id="step-2.4-review---summary" class="section level1">
<h1>Step 2.4: Review - Summary</h1>
@ -395,13 +395,13 @@ if (!<span class="st"> </span><span class="kw">dir.exists</span>(fpath)) <span c
<span class="co"># How many schools are represented in the dataset? (Why we need to know this)</span>
ds$school_id %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">unique</span>() %&gt;%
ds<span class="op">$</span>school_id <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">unique</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">length</span>()</code></pre></div>
<pre><code>## [1] 100</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Check the class of all the variables.</span>
ds %&gt;%<span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
ds <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
<pre><code>## continue_drop student_id gender
## &quot;character&quot; &quot;character&quot; &quot;character&quot;
## caste mathematics_marks english_marks
@ -414,18 +414,18 @@ ds %&gt;%<span class="st"> </span><span class="kw">sapply</span>(class)</code></
## &quot;integer&quot; &quot;integer&quot; &quot;integer&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Select all the character variables.</span>
ds %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.character) %&gt;%
<span class="st"> </span><span class="kw">which</span>(<span class="dt">useNames=</span><span class="ot">TRUE</span>) %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%<span class="st"> </span>
ds <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.character) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>(<span class="dt">useNames=</span><span class="ot">TRUE</span>) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
vnames</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;student_id&quot; &quot;gender&quot; &quot;caste&quot;
## [5] &quot;guardian&quot; &quot;school_id&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Review the values of the variables.</span>
ds[vnames] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(as.factor) %&gt;%
ds[vnames] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(as.factor) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">summary</span>()</code></pre></div>
<pre><code>## continue_drop student_id gender caste guardian
## continue:18200 s00001 : 1 F:9900 BC:9700 father: 4400
@ -445,32 +445,32 @@ ds[vnames] %&gt;%
## (Other):16200</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Choose to convert these variables from character to factor.</span>
vnames %&lt;&gt;%<span class="st"> </span><span class="kw">setdiff</span>(<span class="kw">c</span>(<span class="st">&quot;student_id&quot;</span>)) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
vnames <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">setdiff</span>(<span class="kw">c</span>(<span class="st">&quot;student_id&quot;</span>)) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;gender&quot; &quot;caste&quot; &quot;guardian&quot;
## [5] &quot;school_id&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vnames] %&lt;&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">lapply</span>(factor) %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">data.frame</span>() %&gt;%<span class="st"> </span>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vnames] <span class="op">%&lt;&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">lapply</span>(factor) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">data.frame</span>() <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">tbl_df</span>()
<span class="co"># Confirm they are now factors.</span>
ds[vnames] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
ds[vnames] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
<pre><code>## continue_drop gender caste guardian school_id
## &quot;factor&quot; &quot;factor&quot; &quot;factor&quot; &quot;factor&quot; &quot;factor&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Select all the logical variables.</span>
ds %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.logical) %&gt;%
<span class="st"> </span><span class="kw">which</span>(<span class="dt">useNames=</span><span class="ot">TRUE</span>) %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%<span class="st"> </span>
ds <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.logical) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>(<span class="dt">useNames=</span><span class="ot">TRUE</span>) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
vnames</code></pre></div>
<pre><code>## [1] &quot;internet&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Decide to convert these variables from logical to factor.</span>
ds[vnames] %&gt;%<span class="st"> </span><span class="kw">head</span>()</code></pre></div>
<pre><code>## # A tibble: 6 x 1
ds[vnames] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">head</span>()</code></pre></div>
<pre><code>## # A tibble: 6 × 1
## internet
## &lt;lgl&gt;
## 1 TRUE
@ -479,12 +479,12 @@ ds[vnames] %&gt;%<span class="st"> </span><span class="kw">head</span>()</code><
## 4 TRUE
## 5 TRUE
## 6 TRUE</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vnames] %&lt;&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">lapply</span>(factor) %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">data.frame</span>() %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">tbl_df</span>() %T&gt;%
<span class="st"> </span>{<span class="kw">head</span>(.) %&gt;%<span class="st"> </span><span class="kw">print</span>()}</code></pre></div>
<pre><code>## # A tibble: 6 x 1
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vnames] <span class="op">%&lt;&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">lapply</span>(factor) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">data.frame</span>() <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">tbl_df</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span>{<span class="kw">head</span>(.) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print</span>()}</code></pre></div>
<pre><code>## # A tibble: 6 × 1
## internet
## &lt;fctr&gt;
## 1 TRUE
@ -495,7 +495,7 @@ ds[vnames] %&gt;%<span class="st"> </span><span class="kw">head</span>()</code><
## 6 TRUE</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Review the distribution of observations across levels.</span>
ds[, <span class="kw">sapply</span>(ds, is.factor)] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(table)</code></pre></div>
ds[, <span class="kw">sapply</span>(ds, is.factor)] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(table)</code></pre></div>
<pre><code>## $continue_drop
##
## continue drop
@ -559,7 +559,7 @@ ds[, <span class="kw">sapply</span>(ds, is.factor)] %&gt;%<span class="st"> </sp
<span class="co"># Review the values.</span>
<span class="kw">head</span>(ds[vnames])</code></pre></div>
<pre><code>## # A tibble: 6 x 3
<pre><code>## # A tibble: 6 × 3
## english_marks science_marks mathematics_marks
## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;
## 1 0.514 0.409 0.409
@ -569,18 +569,18 @@ ds[, <span class="kw">sapply</span>(ds, is.factor)] %&gt;%<span class="st"> </sp
## 5 0.614 0.536 0.536
## 6 0.519 0.594 0.594</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">sample_n</span>(ds[vnames], <span class="dv">6</span>)</code></pre></div>
<pre><code>## # A tibble: 6 x 3
<pre><code>## # A tibble: 6 × 3
## english_marks science_marks mathematics_marks
## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;
## 1 0.411 0.649 0.649
## 2 0.450 0.447 0.447
## 3 0.524 0.461 0.461
## 4 0.556 0.304 0.304
## 5 0.650 0.926 0.926
## 6 0.623 0.266 0.266</code></pre>
## 1 0.518 0.456 0.456
## 2 0.533 0.416 0.416
## 3 0.468 0.666 0.666
## 4 0.642 0.602 0.602
## 5 0.602 0.564 0.564
## 6 0.747 0.778 0.778</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Check the current class of the variables.</span>
ds[vnames] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
ds[vnames] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(class)</code></pre></div>
<pre><code>## english_marks science_marks mathematics_marks
## &quot;numeric&quot; &quot;numeric&quot; &quot;numeric&quot;</code></pre>
</div>
@ -592,7 +592,7 @@ ds[vnames] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(class)<
<span class="co"># Note the available variables.</span>
vars &lt;-<span class="st"> </span><span class="kw">names</span>(ds) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
vars &lt;-<span class="st"> </span><span class="kw">names</span>(ds) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;student_id&quot; &quot;gender&quot;
## [4] &quot;caste&quot; &quot;mathematics_marks&quot; &quot;english_marks&quot;
## [7] &quot;science_marks&quot; &quot;science_teacher&quot; &quot;languages_teacher&quot;
@ -604,7 +604,7 @@ target &lt;-<span class="st"> &quot;continue_drop&quot;</span>
<span class="co"># Place the target variable at the beginning of the vars.</span>
vars &lt;-<span class="st"> </span><span class="kw">c</span>(target, vars) %&gt;%<span class="st"> </span><span class="kw">unique</span>() %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
vars &lt;-<span class="st"> </span><span class="kw">c</span>(target, vars) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">unique</span>() <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;student_id&quot; &quot;gender&quot;
## [4] &quot;caste&quot; &quot;mathematics_marks&quot; &quot;english_marks&quot;
## [7] &quot;science_marks&quot; &quot;science_teacher&quot; &quot;languages_teacher&quot;
@ -624,22 +624,22 @@ id &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quo
<span class="co"># Initialise ignored variables: identifiers and risk.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(id, if (<span class="kw">exists</span>(<span class="st">&quot;risk&quot;</span>)) risk) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(id, <span class="cf">if</span> (<span class="kw">exists</span>(<span class="st">&quot;risk&quot;</span>)) risk) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
<p>We might also identify any variable that has a unique value for every observation. These are sometimes identifiers as well and if so are candidates for ignoring.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Heuristic for candidate identifiers to possibly ignore. </span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(function(x) x %&gt;%<span class="st"> </span><span class="kw">unique</span>() %&gt;%<span class="st"> </span><span class="kw">length</span>()) %&gt;%
<span class="st"> </span><span class="kw">equals</span>(<span class="kw">nrow</span>(ds)) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(<span class="cf">function</span>(x) x <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">unique</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">length</span>()) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">equals</span>(<span class="kw">nrow</span>(ds)) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
ids</code></pre></div>
<pre><code>## [1] &quot;student_id&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Add them if any to the variables to be ignored for modelling.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, ids) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, ids) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
<div id="all-missing" class="section level2">
@ -647,17 +647,17 @@ ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, ids)
<p>We then remove any variable where all of the values are missing.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify variables with only missing values.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(function(x) x %&gt;%<span class="st"> </span>is.na %&gt;%<span class="st"> </span>sum) %&gt;%
<span class="st"> </span><span class="kw">equals</span>(<span class="kw">nrow</span>(ds)) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(<span class="cf">function</span>(x) x <span class="op">%&gt;%</span><span class="st"> </span>is.na <span class="op">%&gt;%</span><span class="st"> </span>sum) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">equals</span>(<span class="kw">nrow</span>(ds)) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
missing</code></pre></div>
<pre><code>## character(0)</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Add them if any to the variables to be ignored for modelling.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, missing) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, missing) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
<div id="many-missing" class="section level2">
@ -669,17 +669,17 @@ missing.threshold &lt;-<span class="st"> </span><span class="fl">0.7</span>
<span class="co"># Identify variables that are mostly missing.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(function(x) x %&gt;%<span class="st"> </span><span class="kw">is.na</span>() %&gt;%<span class="st"> </span><span class="kw">sum</span>()) %&gt;%
<span class="st"> '&gt;'</span>(missing.threshold*<span class="kw">nrow</span>(ds)) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(<span class="cf">function</span>(x) x <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">is.na</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sum</span>()) <span class="op">%&gt;%</span>
<span class="st"> '&gt;'</span>(missing.threshold<span class="op">*</span><span class="kw">nrow</span>(ds)) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
mostly</code></pre></div>
<pre><code>## character(0)</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Add them if any to the variables to be ignored for modelling.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, mostly) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, mostly) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
</div>
@ -694,20 +694,20 @@ levels.threshold &lt;-<span class="st"> </span><span class="dv">20</span>
<span class="co"># Identify variables that have too many levels.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.factor) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(function(x) ds %&gt;%<span class="st"> </span><span class="kw">extract2</span>(x) %&gt;%<span class="st"> </span><span class="kw">levels</span>() %&gt;%<span class="st"> </span><span class="kw">length</span>()) %&gt;%
<span class="st"> '&gt;='</span>(levels.threshold) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.factor) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(<span class="cf">function</span>(x) ds <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">extract2</span>(x) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">levels</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">length</span>()) <span class="op">%&gt;%</span>
<span class="st"> '&gt;='</span>(levels.threshold) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
too.many</code></pre></div>
<pre><code>## [1] &quot;school_id&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Add them if any to the variables to be ignored for modelling.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, too.many) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, too.many) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
<div id="constants" class="section level2">
@ -715,16 +715,16 @@ ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, too.m
<p>We ignore variables with constant values as well.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify variables that have a single value.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(function(x) <span class="kw">all</span>(x ==<span class="st"> </span>x[1L])) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(<span class="cf">function</span>(x) <span class="kw">all</span>(x <span class="op">==</span><span class="st"> </span>x[1L])) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
constants </code></pre></div>
<pre><code>## character(0)</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Add them if any to the variables to be ignored for modelling.</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, constants) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, constants) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
</div>
@ -733,12 +733,12 @@ ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, const
<p>Within all the numeric variables, we can identify pairs where we want to keep one but not the other, because they are highly correlated. We will select them manually since it is a judgement call. Normally we might limit the removals to those correlations that are 0.95 or more. In this case, there does not exist a collinearity.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Note which variables are numeric.</span>
vars %&gt;%
<span class="st"> </span><span class="kw">setdiff</span>(ignore) %&gt;%
<span class="st"> '['</span>(ds, .) %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.numeric) %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
vars <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">setdiff</span>(ignore) <span class="op">%&gt;%</span>
<span class="st"> '['</span>(ds, .) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.numeric) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
numc</code></pre></div>
<pre><code>## [1] &quot;mathematics_marks&quot; &quot;english_marks&quot; &quot;science_marks&quot;
@ -746,20 +746,20 @@ numc</code></pre></div>
## [7] &quot;total_toilets&quot; &quot;establishment_year&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># For the numeric variables generate a table of correlations</span>
ds[numc] %&gt;%
<span class="st"> </span><span class="kw">cor</span>(<span class="dt">use=</span><span class="st">&quot;complete.obs&quot;</span>) %&gt;%
<span class="st"> </span><span class="kw">ifelse</span>(<span class="kw">upper.tri</span>(., <span class="dt">diag=</span><span class="ot">TRUE</span>), <span class="ot">NA</span>, .) %&gt;%<span class="st"> </span>
<span class="st"> </span>abs %&gt;%<span class="st"> </span>
<span class="st"> </span>data.frame %&gt;%
<span class="st"> </span>tbl_df %&gt;%
<span class="st"> </span><span class="kw">set_colnames</span>(numc) %&gt;%
<span class="st"> </span><span class="kw">mutate</span>(<span class="dt">var1=</span>numc) %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">gather</span>(var2, cor, -var1) %&gt;%<span class="st"> </span>
<span class="st"> </span>na.omit %&gt;%
<span class="st"> </span><span class="kw">arrange</span>(-<span class="kw">abs</span>(cor)) %T&gt;%
ds[numc] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">cor</span>(<span class="dt">use=</span><span class="st">&quot;complete.obs&quot;</span>) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">ifelse</span>(<span class="kw">upper.tri</span>(., <span class="dt">diag=</span><span class="ot">TRUE</span>), <span class="ot">NA</span>, .) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span>abs <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span>data.frame <span class="op">%&gt;%</span>
<span class="st"> </span>tbl_df <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">set_colnames</span>(numc) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">mutate</span>(<span class="dt">var1=</span>numc) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">gather</span>(var2, cor, <span class="op">-</span>var1) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span>na.omit <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">arrange</span>(<span class="op">-</span><span class="kw">abs</span>(cor)) <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
mc</code></pre></div>
<pre><code>## # A tibble: 28 x 3
<pre><code>## # A tibble: 28 × 3
## var1 var2 cor
## &lt;chr&gt; &lt;chr&gt; &lt;dbl&gt;
## 1 science_marks mathematics_marks 1.00000000
@ -775,7 +775,7 @@ mc</code></pre></div>
## # ... with 18 more rows</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Any variables could be removed because highly correlated?</span>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, <span class="ot">NULL</span>) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, <span class="ot">NULL</span>) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;student_id&quot; &quot;school_id&quot;</code></pre>
</div>
<div id="step-3.4-clean---remove-the-ignored-variables" class="section level1">
@ -787,7 +787,7 @@ ignore &lt;-<span class="st"> </span><span class="kw">union</span>(ignore, <span
<pre><code>## [1] 15</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Remove the variables to ignore.</span>
vars &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, ignore) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
vars &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, ignore) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;gender&quot; &quot;caste&quot;
## [4] &quot;mathematics_marks&quot; &quot;english_marks&quot; &quot;science_marks&quot;
## [7] &quot;science_teacher&quot; &quot;languages_teacher&quot; &quot;guardian&quot;
@ -806,16 +806,14 @@ vars &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, ignore)
<span class="co"># Formula for modelling.</span>
form &lt;-<span class="st"> </span><span class="kw">formula</span>(target %s+%<span class="st"> &quot; ~ .&quot;</span>) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## continue_drop ~ .</code></pre>
form &lt;-<span class="st"> </span><span class="kw">formula</span>(target <span class="op">%s+%</span><span class="st"> &quot; ~ .&quot;</span>) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Use correlation search to identify key variables.</span>
<span class="co"># Could be useful to decide which variables to retain.</span>
<span class="kw">cfs</span>(form, ds[vars])</code></pre></div>
<pre><code>## [1] &quot;mathematics_marks&quot; &quot;english_marks&quot;</code></pre>
<span class="co">#cfs(form, ds[vars])</span></code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Any variables to remove because not useful?</span>
vars %&lt;&gt;%<span class="st"> </span><span class="kw">setdiff</span>(<span class="ot">NULL</span>) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
vars <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">setdiff</span>(<span class="ot">NULL</span>) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;gender&quot; &quot;caste&quot;
## [4] &quot;mathematics_marks&quot; &quot;english_marks&quot; &quot;science_marks&quot;
## [7] &quot;science_teacher&quot; &quot;languages_teacher&quot; &quot;guardian&quot;
@ -824,25 +822,13 @@ vars %&lt;&gt;%<span class="st"> </span><span class="kw">setdiff</span>(<span cl
<p>Or, we can use the function information.gain() to identify variable importance and decide which variables to remove.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Use information gain to identify variable importance.</span>
<span class="kw">information.gain</span>(form, ds[vars]) %&gt;%
<span class="st"> </span><span class="kw">rownames_to_column</span>() %&gt;%
<span class="st"> </span><span class="kw">arrange</span>(attr_importance)</code></pre></div>
<pre><code>## rowname attr_importance
## 1 total_students 0.0000000000
## 2 total_toilets 0.0001336230
## 3 establishment_year 0.0001336230
## 4 gender 0.0008611377
## 5 guardian 0.0011585567
## 6 internet 0.0022360914
## 7 caste 0.0087435209
## 8 languages_teacher 0.0158443590
## 9 science_teacher 0.0248176746
## 10 mathematics_marks 0.1518071497
## 11 science_marks 0.1518071497
## 12 english_marks 0.1645442736</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Any variables to remove because not useful?</span>
<span class="co">#information.gain(form, ds[vars]) %&gt;%</span>
<span class="co"># rownames_to_column() %&gt;%</span>
<span class="co"># arrange(attr_importance)</span>
vars %&lt;&gt;%<span class="st"> </span><span class="kw">setdiff</span>(<span class="ot">NULL</span>)</code></pre></div>
<span class="co"># Any variables to remove because not useful?</span>
vars <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">setdiff</span>(<span class="ot">NULL</span>)</code></pre></div>
</div>
<div id="step-3.6-clean---remove-missing-target" class="section level1">
<h1>Step 3.6: Clean - Remove Missing Target</h1>
@ -852,23 +838,23 @@ vars %&lt;&gt;%<span class="st"> </span><span class="kw">setdiff</span>(<span cl
<span class="co"># Check the dimensions to start with.</span>
<span class="kw">dim</span>(ds) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<span class="kw">dim</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 15</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify observations with a missing target.</span>
ds %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">extract2</span>(target) %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">is.na</span>() %T&gt;%
<span class="st"> </span>{<span class="kw">sum</span>(.) %&gt;%<span class="st"> </span><span class="kw">print</span>()} -&gt;
ds <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">extract2</span>(target) <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">is.na</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span>{<span class="kw">sum</span>(.) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print</span>()} -&gt;
missing.target </code></pre></div>
<pre><code>## [1] 0</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Remove observations with a missing target.</span>
ds %&lt;&gt;%<span class="st"> </span><span class="kw">filter</span>(!missing.target)
ds <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">filter</span>(<span class="op">!</span>missing.target)
<span class="co"># Confirm the filter delivered the expected dataset.</span>
<span class="kw">dim</span>(ds) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<span class="kw">dim</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 15</code></pre>
</div>
<div id="step-3.7-clean---deal-with-missing-values" class="section level1">
@ -879,14 +865,14 @@ ds %&lt;&gt;%<span class="st"> </span><span class="kw">filter</span>(!missing.ta
<span class="co"># Count the number of missing values.</span>
ds[vars] %&gt;%<span class="st"> </span><span class="kw">is.na</span>() %&gt;%<span class="st"> </span><span class="kw">sum</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">is.na</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sum</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 676</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Impute missing values.</span>
ds[vars] %&lt;&gt;%<span class="st"> </span><span class="kw">na.roughfix</span>()</code></pre></div>
ds[vars] <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">na.roughfix</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Confirm that no missing values remain.</span>
ds[vars] %&gt;%<span class="st"> </span><span class="kw">is.na</span>() %&gt;%<span class="st"> </span><span class="kw">sum</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">is.na</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sum</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 0</code></pre>
<p>Alternatively, we can simply remove observations that have missing values. Here na.omit() identifies the rows to omit based on the vars to be included for modelling. This list of rows to omit is stored as the na.action attribute of the returned object. We then remove these observations from the dataset.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">####################################
@ -898,31 +884,31 @@ omit &lt;-<span class="st"> </span><span class="ot">NULL</span>
<span class="co"># Review the current dataset.</span>
ds[vars] %&gt;%<span class="st"> </span><span class="kw">nrow</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">nrow</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vars] %&gt;%<span class="st"> </span><span class="kw">is.na</span>() %&gt;%<span class="st"> </span><span class="kw">sum</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">is.na</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sum</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 0</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify any observations with missing values.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">na.omit</span>() %&gt;%
<span class="st"> </span><span class="kw">attr</span>(<span class="st">&quot;na.action&quot;</span>) %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">na.omit</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">attr</span>(<span class="st">&quot;na.action&quot;</span>) <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
mo</code></pre></div>
<pre><code>## NULL</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Record the observations to omit.</span>
omit &lt;-<span class="st"> </span><span class="kw">union</span>(omit, mo) %T&gt;%<span class="st"> </span>{<span class="kw">length</span>(.) %&gt;%<span class="st"> </span><span class="kw">print</span>()}</code></pre></div>
omit &lt;-<span class="st"> </span><span class="kw">union</span>(omit, mo) <span class="op">%T&gt;%</span><span class="st"> </span>{<span class="kw">length</span>(.) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">print</span>()}</code></pre></div>
<pre><code>## [1] 0</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># If there are observations to omit then remove them.</span>
if (<span class="kw">length</span>(omit)) ds &lt;-<span class="st"> </span>ds[-omit,]
<span class="cf">if</span> (<span class="kw">length</span>(omit)) ds &lt;-<span class="st"> </span>ds[<span class="op">-</span>omit,]
<span class="co"># Confirm the observations have been removed.</span>
ds[vars] %&gt;%<span class="st"> </span><span class="kw">nrow</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">nrow</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vars] %&gt;%<span class="st"> </span><span class="kw">is.na</span>() %&gt;%<span class="st"> </span><span class="kw">sum</span>() %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[vars] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">is.na</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sum</span>() <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 0</code></pre>
</div>
<div id="step-3.8-clean---normalise-factors" class="section level1">
@ -933,17 +919,17 @@ ds[vars] %&gt;%<span class="st"> </span><span class="kw">nrow</span>() %&gt;%<sp
<span class="co"># Note which variables are categoric.</span>
ds[vars] %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.factor) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">names</span>() %T&gt;%
ds[vars] <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.factor) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
catc</code></pre></div>
<pre><code>## [1] &quot;continue_drop&quot; &quot;gender&quot; &quot;caste&quot; &quot;guardian&quot;
## [5] &quot;internet&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Check the levels.</span>
ds[catc] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(levels)</code></pre></div>
ds[catc] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(levels)</code></pre></div>
<pre><code>## $continue_drop
## [1] &quot;continue&quot; &quot;drop&quot;
##
@ -960,12 +946,12 @@ ds[catc] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(levels)</
## [1] &quot;FALSE&quot; &quot;TRUE&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Normalise the levels of all categoric variables.</span>
for (v in catc)
<span class="kw">levels</span>(ds[[v]]) %&lt;&gt;%<span class="st"> </span><span class="kw">normVarNames</span>()
<span class="cf">for</span> (v <span class="cf">in</span> catc)
<span class="kw">levels</span>(ds[[v]]) <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">normVarNames</span>()
<span class="co"># Review the levels.</span>
ds[catc] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(levels)</code></pre></div>
ds[catc] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">sapply</span>(levels)</code></pre></div>
<pre><code>## $continue_drop
## [1] &quot;continue&quot; &quot;drop&quot;
##
@ -988,19 +974,19 @@ ds[catc] %&gt;%<span class="st"> </span><span class="kw">sapply</span>(levels)</
<span class="kw">class</span>(ds[[target]])</code></pre></div>
<pre><code>## [1] &quot;factor&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[[target]] %&lt;&gt;%<span class="st"> </span><span class="kw">as.factor</span>()
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ds[[target]] <span class="op">%&lt;&gt;%</span><span class="st"> </span><span class="kw">as.factor</span>()
<span class="co"># Confirm the distribution.</span>
ds[target] %&gt;%<span class="st"> </span><span class="kw">summary</span>()</code></pre></div>
ds[target] <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">summary</span>()</code></pre></div>
<pre><code>## continue_drop
## continue:18200
## drop : 900</code></pre>
<p>Here, we visualise the distribution of the target variable using ggplot2.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">p &lt;-<span class="st"> </span><span class="kw">ggplot</span>(ds, <span class="kw">aes_string</span>(<span class="dt">x=</span>target))
p &lt;-<span class="st"> </span>p +<span class="st"> </span><span class="kw">geom_bar</span>(<span class="dt">width=</span><span class="fl">0.2</span>)
p &lt;-<span class="st"> </span>p <span class="op">+</span><span class="st"> </span><span class="kw">geom_bar</span>(<span class="dt">width=</span><span class="fl">0.2</span>)
<span class="kw">print</span>(p)</code></pre></div>
<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAGACAMAAAByRC0tAAAAyVBMVEUAAAAAADoAAGYAOpAAZrYzMzM6AAA6ADo6AGY6kNtNTU1NTW5NTY5Nbo5NbqtNjshZWVlmAABmADpmtrZmtv9uTU1uTW5uTY5ubo5ubqtuq6tuq+SOTU2OTW6OTY6Obk2Ojm6OyP+QOgCQkDqQkGaQtpCQ2/+rbk2rbm6rbo6rjk2rq26ryKur5P+2ZgC22/+2///Ijk3I5KvI///bkDrb///kq27k///r6+v/tmb/yI7/25D/27b/5Kv//7b//8j//9v//+T///+qVdIjAAAACXBIWXMAAA7DAAAOwwHHb6hkAAALFUlEQVR4nO3cAVcbxxWGYdnBjr2mckmduNRAIpoap41cqyJQoEJo//+P6s5KgCSEjdi7c+8n3ufErEVknZnNy+ysjpVOCTTQ8R4AtBEQGiEgNEJAaISA0AgBoRGbgP4bRJiBtCfKFAlIVJQpEpCoKFMkIFFRpkhAoqJMkYBERZkiAYmKMkUCEhVligQkKsoUCUhUlCkSkKgoUyQgUVGmSECiokyRgERFmSIBiYoyRfWA/ryO/MNrDwHZICBnBCSKgGwQkDMCEkVANgjIGQGJIiAbBOSMgEQRkA0CckZAogjIBgE5sw0ov7UC8h7sJmMFErOZK1D+8ROQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjIFEEZIOAnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjIFEEZIOAnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMgZAYlSC2j0fliWJ0VRbA/L8V7RPS2XDgSUl1hA5ymcctBLv58c9cqTt0sHAspMK6DBm9+rFWjyqZ8ejA+GaUFaPBBQZloBTS9h1cWqKHrlaPe0HO/3Fw9l+aLyjRdpwVoB5R/e0/GQgEY/9dMqdN6tk1k8TJ+V/weAFcjZOgHVBr17ViACyko4IPZAESgGlK5Wk9+Gk6MP09uv+QMBZaYYUHof6E1/+Q0g3gdyoRbQw+QfPwE5IyBRBGSDgJwRkCgCskFAzghIFAHZICBnBCSKgGwQkDMCEkVANgjIGQGJIiAbBOSMgEQRkA0CckZAogjIBgE5IyBRBGSDgJwRkCgCskFAzghIFAHZICBnBCSKgGwQkDMCEkVANgjIGQGJIiAbBOSMgEQRkA0CckZAogjIBgE5IyBRBGSDgJzZBpTfWgF5D3aTsQKJ2cwVKP/4CcgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjIFEEZIOAnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjIFEEZIOAnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJydk9Alzvv0uHs+ZfZN0bvh2U53iu6p/ccCCgvsYDOi+1hOTnqlSdvVx8IKLPQAR13rm1NvzF483u1Ao0PhmklWnkgoMxCB3SzAt1KjYx2T8vxfn/loSxfVO5silq3VkD5h/d0fGsTnQI679atrDxMn5X/B4AVyNl9AV28rC9hC5vor69ABJRV8ICuDrfurkDsgQIJHtDKPdDk6MP0vmvFgYAyCx7Q1eGKgHgfKJDgAc29hbiO/OMnIGf3BHS501ncRBNQMMEDeqT84ycgZwQkKnhAXMKiCx7QLKMfPq7VDwHlIxFQefbdHwQUk0hAXMKi0gjoMytQVMEDmm2in7EHiip4QI+Uf/wE5IyARIUPqP5rra8JKKroAR2n+6/LnTULyj9+AnJ2T0B3PtZDQMEQkA0CcnZPQFzCooseEJvo4MIH9Cj5x09AzghIVPSArg5f3/1sDwHFE
T2gz1vlik+HEVAYwQPiNj46ArJBQM7uCYj3gaKLHlB5xvtAoYUP6FHyj5+AnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMiZbUD5rRWQ92A3GSuQmM1cgfKPn4CcEZAoArJBQM4ISBQB2SAgZwQkioBsEJAzAhJFQDYIyBkBiSIgGwTkjIBEEZANAnJGQKIIyAYBOSMgUQRkg4CcEZAoArJBQM4ISBQB2SAgZwQkioBsEJAzAhJFQDYIyBkBiSIgGwTkjIBEEZANAnJGQKIIyAYBOSMgUQRkg4CcEZAoArJBQM4ISBQB2SAgZ2sFdFIUxfawHO8V3dNy6UBAeUkGNOilr5OjXnnydulAQJkpBjT51E+H8cGwHL0fLh4IKDPFgKqLVVH0ytHuaTne7y8eyvJF5SHLmK21Aso/vKfjIQGNfuqnVei8WyezeJg+I/8PACuQs3UCqg1696xABJSVcEDsgSJQDChdrSa/DSdHH6a3X/MHAspMMaD0PtCb/vIbQLwP5EIyoG/KP34CckZAogjIBgE5IyBRBGSDgJwRkCgCskFAzghIFAHZICBnBCSKgGwQkDMCEkVANgjIGQGJIiAbBOSMgEQRkA0CckZAogjIBgE5IyBRBGSDgJwRkCgCskFAzghIFAHZICBnBCSKgGwQkDMCEkVANgjIGQGJIiAbBOSMgEQRkA0CckZAogjIBgE5sw0ov7UC8h7sJmMFErOZK1D+8ROQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjIFEEZIOAnBGQKAKyQUDOCEgUAdkgIGcEJIqAbBCQMwISRUA2CMgZAYkiIBsE5IyARBGQDQJyRkCiCMgGATkjoOjWmmH+KRJQdATUqthn1wIBtSr22bVAQK2KfXYtEFCrYp9dCwTUqthn1wIBtSr22bVAQK2KfXYtEFCrYp9dCwTUqthn1wIBtSr22bWw8QGN94ruKQG1ZtMDmhz1ypO3BNSaTQ9ofDAsR++HBNSWTQ9otHtajvf7Zfmi8tgXgbxHB3TenQWUZO//HmEG0p4oU2wc0M0KREBZRZli44Cc90CrhRlIe6JMsXFAk6MPnndhq4UZSHuiTLFxQM7vA60WZiDtiTLF5gHN857NtTADaU+UKRKQqChTJCBRUaZIQKKiTJGAREWZIgGJijJFAhIVZYoEJCrKFAlIVJQp2gYUxRP4eyXRpkhAYqJNkYDERJsiAYmJNsXNCgjZERAaISA0QkBoZHMCGr0f3vwd7c2TPscZ0kYF5D2EFhGQrfFe8aafvm4Py9HuP4uil37/n2oRqh/UNaVf02eoq2bxp597o7/+vD2cTmh88K80/wg0A0o/j+fd08GH9HX04/RQX8JuH9QBDeY+OqIrzbPojX7sTX/bPR3vdU/PY/xkaAY0vVylj6aN9/uzWG7+KW8P6ZOP6Vna0gyqH5l6RZ1OebzXKyefQixBogHtpg8UpTNancavBbRXFFHW+serf1wGvetVtZpy/ZHgQYhdkWhAD12B5FefZMUKVAXECtRA2gNV53Mwv99ZCCid4ZPt4WzL4D3cpmZ7oHohmu2B3kaZl2ZAi3dh02YmR/Vd2GydPymKvx0MZ89TNzmq78JSQLO7sP1fosxLNKAn7uZ/i+KPgBQREDYFAaERAkIjBIRGCAiNENCD/e/f5cX3Hx/01LPnX1oeTBgE9FAPjSchINxBQCsR0B1Xh53O1u3x4vt/vOx03l1UX15XEc0eTntKX9LT5nu53Ok8+/X5l4tXf6++ffMav3Y63/3hNaM2EdCyq8OtKoJ39TH9unhZ/Zc/roqoYkkBzT1Mj9NTyuPbOC53Xle/qn//cquce43U0pbntNpCQMuuL1X1ZegslfCu/uZNQLcP07F+Wgpupn58PPtjy6/hNqn2ENCy6/3LWVpV5kq5DmjuYToed2qvr/94vRhdvJquUAuvMVfZBiGgZesGtLS1IaCn7uYS9uzj9PLz9YDqp
825uWrVAV2/Rn0Je7WJt2YEtOx643uzAZ6VkhaQuXDSZvnq8Fm1ia5WmbmKLne2ppvo9Dw20U/Rndv42f36587WXEBluq//yw/T2/j5Vej6Nr5eyOZv4zeyHwLKYjNvwGoElAMB4RvSBS15trIUAgJWIyA0QkBohIDQCAGhEQJCIwSERv4Psu2UGO3jCMwAAAAASUVORK5CYII=" title alt style="display: block; margin: auto;" /></p>
<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAkAAAAGACAMAAAByRC0tAAAAyVBMVEUAAAAAADoAAGYAOpAAZrYzMzM6AAA6ADo6AGY6kNtNTU1NTW5NTY5Nbo5NbqtNjshZWVlmAABmADpmtrZmtv9uTU1uTW5uTY5ubo5ubqtuq6tuq+SOTU2OTW6OTY6Obk2Ojm6OyP+QOgCQkDqQkGaQtpCQ2/+rbk2rbm6rbo6rjk2rq26ryKur5P+2ZgC22/+2///Ijk3I5KvI///bkDrb///kq27k///r6+v/tmb/yI7/25D/27b/5Kv//7b//8j//9v//+T///+qVdIjAAAACXBIWXMAAA7DAAAOwwHHb6hkAAALBklEQVR4nO3cAVsaVxaHcZKaNJm4ZO2mzbpRW9xuTHdLNixWVl1E5vt/qJ07gALBKM7hnvPH930aJxjqc4/9eWeGJ7RVEjWo5b0A0g5A1CgAUaMARI0CEDUKQNQoE0D/DVKYhWywKDMCSLQoMwJItCgzAki0KDMCSLQoMwJItCgzAki0KDMCSLQoMwJItCgzAki0KDMCSLQoMwJItCgzAki0KDMCSLQoMwJItCgzagP681plX94mA5BFAHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO4BSDQAWQQg9wAk2lYCyt56gLxXu9WxA4m1lTtQ9tUDyD0AiQYgiwDkHoBEA5BFAHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO4BSDQAWQQg9wAkGoAsApB7ABINQBYByD0AiQYgiwDkHoBEA5BFAHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO4BSDQAWQQg9wAkGoAsApB7ABINQBYByD0AiQYgiwDkHoBEA5BFAHLvYYCG7/tleVYUxW6/HB0U7UG5dABQ7qQAXSQ4Za+Tfj8+6ZRnb5cOAMqeEqDem9+rHWj8qZsejI76aUNaPAAoe0qAJqew6mRVFJ1yuD8oR4fdxUP1lBdV3/wam2g9QNmX95S6H9Dwp27ahS7aNZnFw/Rp2fmzA7n3cEB1vc5dOxCA8iYLiGugGOkBSmer8W/98cmHye3X/AFA2dMDlF4HetNdfgGI14Gc0gL0wLKvHkDuAUg0AFkEIPcAJBqALAKQewASDUAWAcg9AIkGIIsA5B6ARAOQRQByD0CiAcgiALkHINEAZBGA3AOQaACyCEDuAUg0AFkEIPcAJBqALAKQewASDUAWAcg9AIkGIIsA5B6ARAOQRQByD0CiAcgiALkHINEAZBGA3AOQaACyCEDuAUg0AFkEIPdMAWVvPUDeq93q2IHE2sodKPvqAeQegEQDkEUAcg9AogHIIgC5ByDRAGQRgNwDkGgAsghA7gFINABZBCD3ACQagCwCkHsAEg1AFgHIPQCJBiCLAOQegEQDkEUAcg9AogHIIgC5ByDRAGQRgNwDkGgAsghA7gFINABZBCD3ACQagCwCkHsAEg1AFgHIPQCJBiCLAOQegEQDkEUAcg9AokUGdLX3Lh3On3+ZfWb4vl+Wo4OiPbjjAKDcSQG6KHb75fikU569XX0AUPbiAjptzdqZfqb35vdqBxod9dNOtPIAoOzFBXSzA82VjAz3B+XosLvyUD3lRdXKC6NNth6g7Mt7Sn37IjoBumjXVlYepk/Lzp8dyL2VgC5f1qewxYvoe3YgAOUtMqDr451ysSHXQNGKDGj1NdD45MPkvmvFAUDZiwzo+ngVIF4HClVkQPMvIa5V9tUDyL1VgK72WksX0QAKV2RAjy776gHkHoBEiwyIU5hAkQFNGf3wkR0obvEBleff/QGgsCkA4hQWOAFAn9mBAhcZ0PQi+hnXQIGLDOjRZV89gNwDkGixAdV/rfU1gAIXGtBpuv+62ltbUPbVA8i9VYC+flsPgMIFIIsA5N4qQJzCBAoNiIvo+MUG9Miyrx5A7gFItNCAro9fr3hvD4AiFRrQ551y1bvDA
BSoyIC4jRcIQBYByL1VgHgdSKDQgMpzXgeKXmxAjyz76gHkHoBEA5BFAHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO6ZAsreeoC8V7vVsQOJtZU7UPbVA8g9AIkGIIsA5B6ARAOQRQByD0CiAcgiALkHINEAZBGA3AOQaACyCEDuAUg0AFkEIPcAJBqALAKQewASDUAWAcg9AIkGIIsA5B6ARAOQRQByD0CiAcgiALkHINEAZBGA3AOQaACyCEDuAUg0AFkEIPcAJBqALAKQewASDUAWAcg9AIkGIIsA5B6ARAOQRQBybw1AZ0VR7PbL0UHRHpRLBwDlThBQr5M+jk865dnbpQOAsqcHaPypmw6jo345fN9fPAAoe3qAqpNVUXTK4f6gHB12Fw/VH7+oun8XM249QNmX95S6H9Dwp27ahS7aNZnFw/Qp2fmzA7n3cEB1vc5dOxCA8iYLiGugGOkBSmer8W/98cmHye3X/AFA2dMDlF4HetNdfgGI14GcEgR0f9lXDyD3ACQagCwCkHsAEg1AFgHIPQCJBiCLAOQegEQDkEUAcg9AogHIIgC5ByDRAGQRgNwDkGgAsghA7gFINABZBCD3ACQagCwCkHsAEg1AFgHIPQCJBiCLAOQegEQDkEUAcg9AogHIIgC5ByDRAGQRgNwDkGgAsghA7gFINABZBCD3ACQagCwCkHumgLK3HiDv1W517EBibeUOlH31AHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO4BSDQAWQQg9wAkGoAsApB7ABINQBYByD0AiQYgiwDkHoBEA5BFAHIPQKIByCIAuQcg0QBkEYDcA5BoALIIQO4BSDQAWfQUAAUfEUDRCz4igKIXfEQARS/4iACKXvARARS94CMCKHrBRwRQ9IKPCKDoBR8RQNELPiKAohd8RABFL/iITQGNDor2AEAbLPiIDQGNTzrl2VsAbbDgIzYENDrql8P3fQBtruAjNgQ03B+Uo8Nu9bsXVY/7GrQNPRLQRXsGKJXf/+rCLGSDRZmxIaDbHQhAeYsyY0NAztdAdxRmIRssyowNAY1PPnjehd1RmIVssCgzNgTk/DrQHYVZyAaLMmNTQAt5DzMrzEI2WJQZASRalBkBJFqUGQEkWpQZASRalBkBJFqUGQEkWpQZASRalBkBJFqUGU0BRekp/L2SaDMCSKxoMwJIrGgzAkisaDNuFSDKH4CoUQCiRgGIGgUgatTWABq+79/+Lf/tK70VOGTbBMh7CZsMQKaNDoo33fRxt18O9/9ZFJ30+/9Um1D9oNaUfk2eIV81xp9+7gz/+vNufzLR6Ohf6RsQIUlA6cfxoj3ofUgfhz9ODvUp7PZBDag3/+Yj3dKgRWf4Y2fy2/ZgdNAeXMT40ZAENDldpTc3jg67Uyw3/5S3h/Te2fQs8dII1c9MvaVOZh4ddMrxpxBbkCag/fSWtPQNrb6L3wJ0UBRRtvoG1T8vvc5sW61mrt9V3gtxVaQJ6KE7kP7uk1qxA1WA2IEeX7oGqr6dvfnrnQVA6Rt8ttufXjF4L7dx02ugeiOaXgO9jTKYJKDFu7CJmfFJfRc23ebPiuJvR/3p8+Qbn9R3YQnQ9C7s8Jcog2kCeurd/p913AOQYgCibQlA1CgAUaMARI0CEDUKQNQoAD20//27vPz+44Oeev78y4YXEycAPbCH4kkBiL4KQKsD0HLXx63Wzu3x8vt/vGy13l1WH15XiKYPJ57Sh/S0eS9Xe61nvz7/cvnq79Wnb77Gr63Wd394TbTRALTU9fFOheBdfUy/Ll9W/+VPKxEVlgRo7mF6nJ5Snt7iuNp7Xf2q/vzlTjn3NZKlHc+xNhaAlpqdqurT0HmS8K7+5A2g24fpWD8tgZtWPz6d/mvLX8NtqA0GoKVm1y/naVeZkzIDNPcwHU9bda9n/3q9GV2+muxQC19jTtk2BaCl1gW0dGkDoKfezSns2cfJ6efbgOqnzXVz1qoBzb5GfQp7tZW3Z
gBaanbhe3MBPJWSNpA5OOli+fr4WXURXe0yc4qu9nYmF9HpeVxEP8W+uo2f3q9/bu3MASrTff1ffpjcxs/vQrPb+Hojm7+N304/AMrRlt6A1QEoQwCie0ontNSzlVIARHRHAKJGAYgaBSBqFICoUQCiRgGIGvV/uYqPrtt9awEAAAAASUVORK5CYII=" style="display: block; margin: auto;" /></p>
</div>
<div id="step-4.1-prepare---variable" class="section level1">
<h1>Step 4.1: Prepare - Variable</h1>
@ -1010,7 +996,7 @@ p &lt;-<span class="st"> </span>p +<span class="st"> </span><span class="kw">geo
<span class="co"># Identify the input variables by name.</span>
inputs &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, target) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
inputs &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, target) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;gender&quot; &quot;caste&quot; &quot;mathematics_marks&quot;
## [4] &quot;english_marks&quot; &quot;science_marks&quot; &quot;science_teacher&quot;
## [7] &quot;languages_teacher&quot; &quot;guardian&quot; &quot;internet&quot;
@ -1018,22 +1004,22 @@ inputs &lt;-<span class="st"> </span><span class="kw">setdiff</span>(vars, targe
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify the input variables by index.</span>
inputi &lt;-<span class="st"> </span><span class="kw">sapply</span>(inputs,
function(x) <span class="kw">which</span>(x ==<span class="st"> </span><span class="kw">names</span>(ds)),
<span class="dt">USE.NAMES=</span><span class="ot">FALSE</span>) %T&gt;%<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<span class="cf">function</span>(x) <span class="kw">which</span>(x <span class="op">==</span><span class="st"> </span><span class="kw">names</span>(ds)),
<span class="dt">USE.NAMES=</span><span class="ot">FALSE</span>) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] 3 4 5 6 7 8 9 10 11 13 14 15</code></pre>
<p>For convenience we record the number of observations.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">nobs &lt;-<span class="st"> </span><span class="kw">nrow</span>(ds) %T&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">nobs &lt;-<span class="st"> </span><span class="kw">nrow</span>(ds) <span class="op">%T&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100</code></pre>
<p>Here we simply report on the dimensions of various data subsets.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Confirm various subset sizes.</span>
<span class="kw">dim</span>(ds) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<span class="kw">dim</span>(ds) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 15</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[vars]) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[vars]) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 13</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[inputs]) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[inputs]) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 12</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[inputi]) %&gt;%<span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">dim</span>(ds[inputi]) <span class="op">%&gt;%</span><span class="st"> </span><span class="kw">echo</span>()</code></pre></div>
<pre><code>## 19,100 12</code></pre>
</div>
<div id="step-4.2-prepare---numeric-and-categoric-variables" class="section level1">
@ -1041,37 +1027,37 @@ inputi &lt;-<span class="st"> </span><span class="kw">sapply</span>(inputs,
<p>Sometimes we need to identify the numeric and categoric variables. Many cluster analysis algorithms only deal with numeric variables, for example. Here we identify them both by name and by index. Note that when using the index we have to assume the variables always remain in the same order within the dataset and all variables are present. Otherwise the indices will get out of sync.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify the numeric variables by index.</span>
ds %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.numeric) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">intersect</span>(inputi) %T&gt;%
ds <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.numeric) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">intersect</span>(inputi) <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
numi</code></pre></div>
<pre><code>## [1] 5 6 7 8 9 13 14 15</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify the numeric variables by name.</span>
numc &lt;-<span class="st"> </span>
<span class="st"> </span>ds %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">names</span>() %&gt;%<span class="st"> </span>
<span class="st"> '['</span>(numi) %T&gt;%<span class="st"> </span>
<span class="st"> </span>ds <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> '['</span>(numi) <span class="op">%T&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;mathematics_marks&quot; &quot;english_marks&quot; &quot;science_marks&quot;
## [4] &quot;science_teacher&quot; &quot;languages_teacher&quot; &quot;total_students&quot;
## [7] &quot;total_toilets&quot; &quot;establishment_year&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify the categoric variables by index.</span>
ds %&gt;%
<span class="st"> </span><span class="kw">sapply</span>(is.factor) %&gt;%
<span class="st"> </span><span class="kw">which</span>() %&gt;%
<span class="st"> </span><span class="kw">intersect</span>(inputi) %T&gt;%
ds <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">sapply</span>(is.factor) <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">which</span>() <span class="op">%&gt;%</span>
<span class="st"> </span><span class="kw">intersect</span>(inputi) <span class="op">%T&gt;%</span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
cati</code></pre></div>
<pre><code>## [1] 3 4 10 11</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Identify the categoric variables by name.</span>
ds %&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">names</span>() %&gt;%<span class="st"> </span>
<span class="st"> '['</span>(cati) %T&gt;%<span class="st"> </span>
ds <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">names</span>() <span class="op">%&gt;%</span><span class="st"> </span>
<span class="st"> '['</span>(cati) <span class="op">%T&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">print</span>() -&gt;
catc</code></pre></div>
<pre><code>## [1] &quot;gender&quot; &quot;caste&quot; &quot;guardian&quot; &quot;internet&quot;</code></pre>
@ -1089,7 +1075,7 @@ dsdate &lt;-<span class="st"> &quot;_20161215&quot;</span>
<span class="co"># Filename for the saved dataset.</span>
dsrdata &lt;-<span class="st"> </span>
<span class="st"> </span><span class="kw">file.path</span>(fpath, dsname %s+%<span class="st"> </span>dsdate %s+%<span class="st"> &quot;.RData&quot;</span>) %T&gt;%<span class="st"> </span>
<span class="st"> </span><span class="kw">file.path</span>(fpath, dsname <span class="op">%s+%</span><span class="st"> </span>dsdate <span class="op">%s+%</span><span class="st"> &quot;.RData&quot;</span>) <span class="op">%T&gt;%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">print</span>()</code></pre></div>
<pre><code>## [1] &quot;data/studentDropIndia_20161215.RData&quot;</code></pre>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Save relevant R objects to the binary RData file.</span>

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -20,7 +20,7 @@ knitr::opts_chunk$set(fig.width = 6,
```
# Introducation
# Introduction
Welcome to the Data Science Design Pattern for Student Drop Out. This pattern provides a starting point for the data scientist exploring a new dataset. By no means is it the end point of the data science journey. The pattern is under regular revision and improvement and is provided as is.
@ -342,27 +342,27 @@ return(result)
Let's run the experiments using the algorithms rpart (Therneau and Atkinson, 2014), randomForest (Breiman et al., 2012), ada (Culp et al., 2012), ctree() from party (Hothorn et al., 2013). In this way, we can conveniently implement those models and compare their performance.
```{r, message=FALSE, warning=FALSE, error=FALSE}
# Source experi.R
source("http://onepager.togaware.com/experi.R")
# Set the times of loops
n <- 10
# Run experiments
ex.rp <- experi(form, ds[vars], dsname, target, "rpart", "1", n=n, keep=TRUE)
ex.rf <- experi(form, ds[vars], dsname, target, "randomForest", "500", n=n, keep=TRUE, control=list(na.action=na.omit))
ex.ad <- experi(form, ds[vars], dsname, target, "ada", "50", n=n, keep=TRUE)
ex.ct <- experi(form, ds[vars], dsname, target, "ctree", "1", n=n, keep=TRUE)
# Compare results
results <- rbind(ex.rp, ex.rf, ex.ad, ex.ct)
rownames(results) <- results$modeller
results$modeller <- NULL
results
# # Source experi.R
#
# source("http://onepager.togaware.com/experi.R")
#
# # Set the times of loops
#
# n <- 10
#
# # Run experiments
#
# ex.rp <- experi(form, ds[vars], dsname, target, "rpart", "1", n=n, keep=TRUE)
# ex.rf <- experi(form, ds[vars], dsname, target, "randomForest", "500", n=n, keep=TRUE, control=list(na.action=na.omit))
# ex.ad <- experi(form, ds[vars], dsname, target, "ada", "50", n=n, keep=TRUE)
# ex.ct <- experi(form, ds[vars], dsname, target, "ctree", "1", n=n, keep=TRUE)
#
# # Compare results
#
# results <- rbind(ex.rp, ex.rf, ex.ad, ex.ct)
# rownames(results) <- results$modeller
# results$modeller <- NULL
# results
```
# Step 7.1: Other Models - Support Vector Machine Model
@ -372,15 +372,15 @@ Except for the above commonly used binary classification models, we could also t
```{r, message=FALSE, warning=FALSE, error=FALSE}
# Tune hyper-parameters
system.time({
m.svm.cv <- tune.svm(form,
data=ds[train, vars],
gamma=2^(-3:1),
cost=2^(2:6),
type="C-classification",
probability=TRUE,
scale=FALSE)
})
system.time({
m.svm.cv <- tune.svm(form,
data=ds[train, vars],
gamma=2^(-1:1),
cost=2^(2:4),
type="C-classification",
probability=TRUE,
scale=FALSE)
})
print(m.svm.cv$best.performance)
```
@ -391,8 +391,8 @@ print(m.svm.cv$best.performance)
system.time({
m.svm <- svm(form,
data=ds[train, vars],
gamma=m.svm.cv$best.parameters[1],
cost=m.svm.cv$best.parameters[2],
gamma=as.numeric(m.svm.cv$best.parameters[1]),
cost=as.numeric(m.svm.cv$best.parameters[2]),
type="C-classification",
probability = TRUE,
scale = FALSE)
@ -454,8 +454,8 @@ print(m.nnet.cv$best.performance)
system.time({
m.nnet <- nnet(formula=form,
data=ds[train, vars],
size=m.nnet.cv$best.parameters[1],
decay=m.nnet.cv$best.parameters[2],
size=as.numeric(m.nnet.cv$best.parameters[1]),
decay=as.numeric(m.nnet.cv$best.parameters[2]),
rang=0.1,
maxit=200)
})
@ -523,8 +523,8 @@ cv.ctrl <- trainControl(method="cv",
allowParallel=TRUE)
grid.xgb <- expand.grid(nrounds=2,
max_depth=2^(1:5),
eta=1*10^(-4:0),
max_depth=2^(1:3),
eta=1*10^(-2:0),
min_child_weight=1,
colsample_bytree=1,
subsample=1,

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны