This commit is contained in:
Dave Zeber 2020-11-20 18:12:08 -06:00
Родитель 1d7f7aecb1
Коммит bca724cd5d
8 изменённых файлов: 361 добавлений и 0 удалений

20
docs/REPORT.md Normal file
Просмотреть файл

@ -0,0 +1,20 @@
# Report output
The report generated by PRESC is built using [Jupyter Book](https://jupyterbook.org/intro.html).
Report source is maintained under [`presc/report`](../presc/report/).
To build the report, run:
```bash
# Hide all the code cells.
# Currently this has to be done by manually adding tags to each notebook cell.
python presc/report/add_tags.py
# Build
jupyter-book build presc/report
```
The rendered report can be viewed at
[`presc/report/_build/html/index.html`](../presc/report/_build/html/index.html).

21
presc/report/_config.yml Normal file
Просмотреть файл

@ -0,0 +1,21 @@
title: PRESC Report
author: ""
# Force re-execution of notebooks on each build.
execute:
execute_notebooks: force
# Information about where the book exists on the web
repository:
url: https://github.com/mozilla/PRESC
path_to_book: presc/report
branch: master
# Add GitHub buttons
html:
use_issues_button: false
use_repository_button: true
# Additional launch buttons
launch_buttons:
binderhub_url: ""

5
presc/report/_toc.yml Normal file
Просмотреть файл

@ -0,0 +1,5 @@
- file: landing
- part: Misclassifications
chapters:
- file: misclass_rate

25
presc/report/add_tags.py Normal file
Просмотреть файл

@ -0,0 +1,25 @@
"""Add tags to all code cells in the notebooks.

Jupyter Book hides the input of any cell tagged "remove-input", so this
script stamps that tag onto every code cell of every notebook in the
report directory before the book is built (see docs/REPORT.md).
"""
import nbformat as nbf
from pathlib import Path

# Tags to apply to every code cell.
TAGS_ALL_CELLS = ["remove-input"]

REPORT_DIR = Path(__file__).parent

# Sort so notebooks are processed in a deterministic order on every run.
for ipath in sorted(REPORT_DIR.glob("*.ipynb")):
    print(f"Adding tags in {ipath}")
    ntbk = nbf.read(ipath, nbf.NO_CONVERT)
    for cell in ntbk.cells:
        # Only code cells get tagged; markdown cells are left untouched.
        if cell.cell_type != "code":
            continue
        cell_tags = cell.metadata.get("tags", [])
        for tag_to_add in TAGS_ALL_CELLS:
            # Avoid duplicating a tag that is already present.
            if tag_to_add not in cell_tags:
                cell_tags.append(tag_to_add)
        # Only write the metadata key when there is something to record.
        if cell_tags:
            cell.metadata["tags"] = cell_tags
    # Write the updated notebook back in place.
    nbf.write(ntbk, ipath)

171
presc/report/landing.ipynb Normal file
Просмотреть файл

@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PRESC Report"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report, plot_confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"# Load the inputs to the report.\n",
"#\n",
"# This creates:\n",
"# - the Dataset instance `dataset`\n",
"# - the ClassificationModel instance `cm`\n",
"# - a dict of config options `config`.\n",
"#\n",
"# Imports are also all managed from here.\n",
"\n",
"%run setup_report.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"dataset.raw_dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(f\"{len(dataset.features.columns)} features:\\n\")\n",
"print(dataset.features.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(f\"Training set size: {dataset.train_labels.size:,}\")\n",
"print(\n",
" f\"Test set size: {dataset.test_labels.size:,}\" +\n",
" f\" ({dataset.test_labels.size / dataset.labels.size:.1%})\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"cm.classifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top level metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(classification_report(cm.dataset.test_labels, cm.test_predictions))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"_ = plot_confusion_matrix(cm.classifier, cm.dataset.test_features, cm.dataset.test_labels)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Просмотреть файл

@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conditional misclassification rate\n",
"\n",
"This shows how the misclassification rate varies across the values of each feature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"from presc.evaluations._example_misclass_rate import MisclassRateEvaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
    "# TODO: how to share state between notebooks\n",
"\n",
"%run setup_report.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"mre = MisclassRateEvaluation(cm, config=config.get(\"misclass_rate\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"mre.display()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Просмотреть файл

Просмотреть файл

@ -0,0 +1,31 @@
"""Prepare the inputs to the PRESC report.

Executed from the report notebooks via `%run`; defines the module-level
names `dataset`, `model`, `cm` and `config` that the notebooks use.
"""
import pandas as pd
from IPython.display import set_matplotlib_formats
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from presc.dataset import Dataset
from presc.model import ClassificationModel

# Render plots as SVG for better quality.
set_matplotlib_formats("svg")

# Load the dataset, dropping the `quality` column, and split it.
wine_df = pd.read_csv("../../datasets/winequality.csv")
dataset = Dataset(wine_df.drop(columns=["quality"]), label="recommend")
dataset.split_test_train(0.3)

# Set up the model: standardize features, then fit a class-weighted SVC.
model = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", SVC(class_weight="balanced")),
    ]
)
cm = ClassificationModel(model, dataset, should_train=True)

# Config options (TODO: read from file)
config = {"misclass_rate": {"num_bins": 20}}