This commit is contained in:
Dave Zeber 2020-11-20 18:12:08 -06:00
Родитель 1d7f7aecb1
Коммит bca724cd5d
8 изменённых файлов: 361 добавлений и 0 удалений

20
docs/REPORT.md Normal file
Просмотреть файл

@ -0,0 +1,20 @@
# Report output
The report generated by PRESC is built using [Jupyter Book](https://jupyterbook.org/intro.html).
Report source is maintained under [`presc/report`](../presc/report/).
To build the report, run:
```bash
# Hide all the code cells.
# Currently this has to be done by manually adding tags to each notebook cell.
python presc/report/add_tags.py
# Build
jupyter-book build presc/report
```
The rendered report can be viewed at
[`presc/report/_build/html/index.html`](../presc/report/_build/html/index.html).

21
presc/report/_config.yml Normal file
Просмотреть файл

@ -0,0 +1,21 @@
title: PRESC Report
author: ""
# Force re-execution of notebooks on each build.
execute:
execute_notebooks: force
# Information about where the book exists on the web
repository:
url: https://github.com/mozilla/PRESC
path_to_book: presc/report
branch: master
# Add GitHub buttons
html:
use_issues_button: false
use_repository_button: true
# Additional launch buttons
launch_buttons:
binderhub_url: ""

5
presc/report/_toc.yml Normal file
Просмотреть файл

@ -0,0 +1,5 @@
- file: landing
- part: Misclassifications
chapters:
- file: misclass_rate

25
presc/report/add_tags.py Normal file
Просмотреть файл

@ -0,0 +1,25 @@
"""Add tags to all code cells in the notebooks.

Jupyter Book hides the input of any cell tagged "remove-input", so this
script stamps that tag onto every code cell of every notebook in the
report directory before the book is built (see docs/REPORT.md).
"""
import nbformat as nbf
from pathlib import Path

# Tags to apply to every code cell.
TAGS_ALL_CELLS = ["remove-input"]

REPORT_DIR = Path(__file__).parent

# Sort so notebooks are processed in a deterministic order on every run.
for ipath in sorted(REPORT_DIR.glob("*.ipynb")):
    print(f"Adding tags in {ipath}")
    ntbk = nbf.read(ipath, nbf.NO_CONVERT)
    for cell in ntbk.cells:
        # Only code cells get tagged; markdown cells are left untouched.
        if cell.cell_type != "code":
            continue
        cell_tags = cell.metadata.get("tags", [])
        for tag_to_add in TAGS_ALL_CELLS:
            # Avoid duplicating a tag that is already present.
            if tag_to_add not in cell_tags:
                cell_tags.append(tag_to_add)
        # Only write the metadata key when there is something to record.
        if cell_tags:
            cell.metadata["tags"] = cell_tags
    # Write the updated notebook back in place.
    nbf.write(ntbk, ipath)

171
presc/report/landing.ipynb Normal file
Просмотреть файл

@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PRESC Report"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report, plot_confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"# Load the inputs to the report.\n",
"#\n",
"# This creates:\n",
"# - the Dataset instance `dataset`\n",
"# - the ClassificationModel instance `cm`\n",
"# - a dict of config options `config`.\n",
"#\n",
"# Imports are also all managed from here.\n",
"\n",
"%run setup_report.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"dataset.raw_dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(f\"{len(dataset.features.columns)} features:\\n\")\n",
"print(dataset.features.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(f\"Training set size: {dataset.train_labels.size:,}\")\n",
"print(\n",
" f\"Test set size: {dataset.test_labels.size:,}\" +\n",
" f\" ({dataset.test_labels.size / dataset.labels.size:.1%})\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Classifier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"cm.classifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top level metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"print(classification_report(cm.dataset.test_labels, cm.test_predictions))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"_ = plot_confusion_matrix(cm.classifier, cm.dataset.test_features, cm.dataset.test_labels)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Просмотреть файл

@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conditional misclassification rate\n",
"\n",
"This shows how the misclassification rate varies across the values of each feature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"from presc.evaluations._example_misclass_rate import MisclassRateEvaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
    "# TODO: how to share state between notebooks\n",
"\n",
"%run setup_report.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"mre = MisclassRateEvaluation(cm, config=config.get(\"misclass_rate\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"remove-input"
]
},
"outputs": [],
"source": [
"mre.display()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Просмотреть файл

Просмотреть файл

@ -0,0 +1,31 @@
"""Prepare the inputs to the PRESC report.

Executed from the report notebooks via `%run`; defines the module-level
names `dataset`, `model`, `cm` and `config` that the notebooks use.
"""
import pandas as pd
from IPython.display import set_matplotlib_formats
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from presc.dataset import Dataset
from presc.model import ClassificationModel

# Render plots as SVG for better quality.
set_matplotlib_formats("svg")

# Load the dataset, dropping the `quality` column, and split it.
wine_df = pd.read_csv("../../datasets/winequality.csv")
dataset = Dataset(wine_df.drop(columns=["quality"]), label="recommend")
dataset.split_test_train(0.3)

# Set up the model: standardize features, then fit a class-weighted SVC.
model = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", SVC(class_weight="balanced")),
    ]
)
cm = ClassificationModel(model, dataset, should_train=True)

# Config options (TODO: read from file)
config = {"misclass_rate": {"num_bins": 20}}