This commit is contained in:
jiata 2019-02-28 14:56:30 +00:00
Parent 8583b3f0e8
Commit 6c9f3430ff
9 changed files: 439 additions and 630 deletions

.gitignore (vendored): 6 changes
View file

@@ -105,3 +105,9 @@ venv.bak/
# ycm config
.ycm_extra_conf.py
# papermill outputs
output.ipynb
# don't save any data
image_classification/data/*

View file

@@ -0,0 +1,70 @@
from pathlib import Path
from zipfile import ZipFile
from typing import Union
from urllib.parse import urlparse
import requests
import os
import shutil


class Urls:
    # for now, the base url is hardcoded into the Urls class
    base = "https://cvbp.blob.core.windows.net/public/datasets/image_classification"

    # datasets
    fridge_objects = f"{base}/fridgeObjects.zip"
    food_101_subset = f"{base}/food101Subset.zip"
    flickr_logos_32_subset = f"{base}/flickrLogos32Subset.zip"
    lettuce = f"{base}/lettuce.zip"
    recycle = f"{base}/recycle_v3.zip"


def data_path() -> Path:
    """Get the path to the data directory."""
    return os.path.realpath(
        os.path.join(os.path.dirname(__file__), os.pardir, "data")
    )


def _get_file_name(url: str) -> str:
    """Get the file name portion of a url."""
    return urlparse(url).path.split("/")[-1]


def unzip_url(
    url: str,
    fpath: Union[Path, str] = data_path(),
    dest: Union[Path, str] = data_path(),
    overwrite: bool = False,
) -> Path:
    """
    Download the file at {url} to {fpath} and unzip it to {dest}.
    {fpath} and {dest} must be existing directories.
    Returns the path of the unzipped folder inside {dest}.
    """
    assert os.path.exists(fpath)
    assert os.path.exists(dest)

    fname = _get_file_name(url)
    fname_without_extension = fname.split(".")[0]

    # Remove any previous download/extraction, or fail if overwriting is not allowed.
    if os.path.exists(os.path.join(fpath, fname)):
        if overwrite:
            os.remove(os.path.join(fpath, fname))
        else:
            raise Exception(f"{fname} already exists in {fpath}.")
    if os.path.exists(os.path.join(dest, fname_without_extension)):
        if overwrite:
            shutil.rmtree(os.path.join(dest, fname_without_extension))
        else:
            raise Exception(f"{fname_without_extension} already exists in {dest}.")

    # Download the zip file ...
    r = requests.get(url)
    with open(os.path.join(fpath, fname), "wb") as f:
        f.write(r.content)

    # ... and extract it into {dest}.
    os.makedirs(os.path.join(dest, fname_without_extension))
    with ZipFile(os.path.join(fpath, fname), "r") as z:
        z.extractall(dest)

    return os.path.realpath(os.path.join(dest, fname_without_extension))
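
A minimal usage sketch (illustrative only, not part of the committed file), assuming the ic_utils package is importable from the repository root:

from ic_utils.datasets import Urls, unzip_url

# Downloads fridgeObjects.zip into image_classification/data and unzips it there.
fridge_path = unzip_url(Urls.fridge_objects, overwrite=True)
print(fridge_path)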

File diff suppressed because one or more lines are too long

View file

@@ -0,0 +1,178 @@
#!/usr/bin/env python
# coding: utf-8
# # Image Classification
# In this notebook, we will classify different kinds of beverages you might find in the fridge.
# Check the fastai version.
# In[1]:
import fastai
fastai.__version__
# Ensure edits to libraries are loaded and plotting is shown in the notebook.
# In[2]:
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
# Import fastai. For now, we'll import all (`import *`) so that we can easily use the different utilities provided by the fastai library.
# In[3]:
import sys
sys.path.append("../")
from fastai.vision import *
from fastai.metrics import error_rate, accuracy
from pathlib import Path
from ic_utils.datasets import Urls, unzip_url, data_path
# Set some parameters. We'll use the `unzip_url` helper function to download and unzip our data.
# In[4]:
DATA_PATH = unzip_url(Urls.fridge_objects, overwrite=True)
EPOCHS = 5
LEARNING_RATE = 1e-4
IMAGE_SIZE = 299
BATCH_SIZE = 16
ARCHITECTURE = models.resnet50
# ---
# ## File Structure for Image Classification
# In this notebook, we'll use images from the `fridge_objects` dataset, which has been downloaded and unzipped to `image_classification/data`.
#
# Let's set that directory as our `path` variable, which we'll use throughout the notebook, and check out what's inside:
# In[5]:
path = Path(DATA_PATH)
path.ls()
# You'll notice that we have four different folders inside:
# - `/milk_bottle`
# - `/carton`
# - `/water_bottle`
# - `/can`
# The most common data format for multiclass image classification is to have one folder per label, with that label's images inside:
#
# ```
# /images
# +-- can (class 1)
# | +-- image1.jpg
# | +-- image2.jpg
# | +-- ...
# +-- carton (class 2)
# | +-- image31.jpg
# | +-- image32.jpg
# | +-- ...
# +-- ...
# ```
#
# Good thing our data is already structured in that format!
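# As an optional sanity check (a sketch, not in the original notebook), we can count
# the images in each class folder directly with pathlib, assuming the images are .jpg files:
# In[ ]:
im_counts = {d.name: len(list(d.glob("*.jpg"))) for d in path.iterdir() if d.is_dir()}
print(im_counts)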
# ## Loading images with fast.ai
# To use fastai, we want to create an ImageDataBunch so that the library can easily use multiple images (mini-batches) during training time. We create an ImageDataBunch using fastai's [data_block API](https://docs.fast.ai/data_block.html).
# In[6]:
np.random.seed(42)
data = (ImageItemList.from_folder(path)
        .random_split_by_pct(valid_pct=0.2, seed=10)
        .label_from_folder()
        .transform(size=IMAGE_SIZE)
        .databunch(bs=BATCH_SIZE)
        .normalize(imagenet_stats))
# Let's take a look at our data using the databunch we created.
# In[7]:
data.show_batch(rows=3, figsize=(15,11))
# Let's see all available classes:
# In[8]:
print(f'number of classes: {data.c}')
print(data.classes)
# ## Training
# For the model, we use a convolutional neural network.
#
# When training a model, there are many hyperparameters to select, such as the learning rate, the model architecture, the layers to tune, and many more.
#
# With fastai, we can use the `create_cnn` function that allows us to specify the model architecture and a performance indicator (metric). At this point, we already benefit from transfer learning, since we download weights that were pretrained on ImageNet.
# In[9]:
learn = create_cnn(data, ARCHITECTURE, metrics=accuracy)
# Unfreeze our CNN so that we're training all the layers.
# In[10]:
learn.unfreeze()
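# A common way to choose LEARNING_RATE, rather than fixing it up front, is fastai's
# learning rate finder (an optional sketch, not in the original notebook):
# In[ ]:
learn.lr_find()
learn.recorder.plot()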
# We can now call the `fit` function to train the model.
# In[11]:
learn.fit(EPOCHS, LEARNING_RATE)
# ## Evaluating
# To evaluate our model, let's take a look at the accuracy on the validation set.
# In[12]:
_, metric = learn.validate(learn.data.valid_dl, metrics=[accuracy])
print(f'Accuracy on validation set: {float(metric)}')
# When evaluating our results, we want to see where the model makes mistakes and whether we can do better. So we're interested in the images that the model predicted incorrectly but with high confidence (the images with the highest loss).
# In[13]:
interp = ClassificationInterpretation.from_learner(learn)
# In[14]:
interp.plot_confusion_matrix()
# In[15]:
interp.plot_top_losses(9, figsize=(15,11))

View file

View file

@@ -1,4 +1,3 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
@@ -13,12 +12,14 @@ import datetime
import os
import pytest
def path_notebooks():
    """Returns the path of the notebooks folder"""
    return os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.path.pardir, "notebooks")
    )


@pytest.fixture(scope="module")
def notebooks():
    folder_notebooks = path_notebooks()
@@ -26,6 +27,9 @@ def notebooks():
    # Path for the notebooks
    paths = {
        "mnist": os.path.join(folder_notebooks, "mnist.ipynb"),
-       "simple": os.path.join(folder_notebooks, "simple.ipynb")
+       "simple": os.path.join(folder_notebooks, "simple.ipynb"),
+       "01_image_classification": os.path.join(
+           folder_notebooks, "01_Image_Classification.ipynb"
+       ),
    }
    return paths

View file

@@ -5,12 +5,15 @@
import os
import pytest
import papermill as pm
from ic_utils.datasets import Urls, unzip_url
from tests.conftest import path_notebooks

# Unless manually modified, "cvbp" should be the name of the jupyter kernel
# registered for the activated conda environment
-KERNEL_NAME = "python3"
+KERNEL_NAME = "cvbp"
OUTPUT_NOTEBOOK = "output.ipynb"
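# Note: if the "cvbp" kernel is not registered yet, one common way to do so
# (an assumption about the intended setup, not part of this change) is to run
# `python -m ipykernel install --user --name cvbp` from the activated cvbp environment.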
def test_simple_notebook_run(notebooks):
    notebook_path = notebooks["simple"]
    pm.execute_notebook(
@@ -20,6 +23,7 @@ def test_simple_notebook_run(notebooks):
        kernel_name=KERNEL_NAME,
    )


def test_mnist_notebook_run(notebooks):
    notebook_path = notebooks["mnist"]
    pm.execute_notebook(
@@ -27,4 +31,15 @@ def test_mnist_notebook_run(notebooks):
        OUTPUT_NOTEBOOK,
        parameters=dict(PM_VERSION=pm.__version__),
        kernel_name=KERNEL_NAME,
    )


def test_01_notebook_run(notebooks):
    notebook_path = notebooks["01_image_classification"]
    data_path = unzip_url(Urls.recycle, overwrite=True)
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        parameters=dict(PM_VERSION=pm.__version__, DATA_PATH=data_path),
        kernel_name=KERNEL_NAME,
    )

View file

@@ -0,0 +1,47 @@
import pytest
from ic_utils.datasets import Urls, unzip_url
import os
from pathlib import Path
import unittest
import shutil


class TestUnzipUrl(unittest.TestCase):
    """
    This class tests the unzip_url function
    """

    TEMP_DIR = "../tmp_data"

    def setUp(self):
        os.makedirs(self.TEMP_DIR, exist_ok=True)

    def tearDown(self):
        if os.path.exists(self.TEMP_DIR):
            shutil.rmtree(self.TEMP_DIR)

    def test_unzip_url_rel_path(self):
        """ Test unzip with relative path. """
        rel_path = Path(self.TEMP_DIR)
        data_path = unzip_url(
            Urls.lettuce, fpath=rel_path, dest=rel_path, overwrite=True
        )
        self.assertTrue(os.path.exists(os.path.join(rel_path, "lettuce.zip")))
        self.assertTrue(os.path.exists(os.path.join(rel_path, "lettuce")))
        self.assertEqual(
            os.path.realpath(os.path.join(rel_path, "lettuce")),
            os.path.realpath(data_path),
        )

    def test_unzip_url_abs_path(self):
        """ Test unzip with absolute path. """
        abs_path = Path(os.path.abspath(self.TEMP_DIR))
        data_path = unzip_url(
            Urls.lettuce, fpath=abs_path, dest=abs_path, overwrite=True
        )
        self.assertTrue(os.path.exists(os.path.join(abs_path, "lettuce.zip")))
        self.assertTrue(os.path.exists(os.path.join(abs_path, "lettuce")))
        self.assertEqual(
            os.path.realpath(os.path.join(abs_path, "lettuce")),
            os.path.realpath(data_path),
        )
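
A further check one might add (a sketch based on unzip_url's documented behavior, not part of this commit) is that a second download with overwrite=False refuses to clobber an existing zip:

    def test_unzip_url_overwrite_false(self):
        """ Test that unzip refuses to overwrite an existing download. """
        tmp_path = Path(self.TEMP_DIR)
        unzip_url(Urls.lettuce, fpath=tmp_path, dest=tmp_path, overwrite=True)
        # lettuce.zip now exists, so a non-overwriting call should raise.
        with self.assertRaises(Exception):
            unzip_url(Urls.lettuce, fpath=tmp_path, dest=tmp_path, overwrite=False)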