This commit is contained in:
jiata 2019-02-28 14:56:30 +00:00
Parent 8583b3f0e8
Commit 6c9f3430ff
9 changed files: 439 additions and 630 deletions

.gitignore (vendored): 6 changes
View file

@@ -105,3 +105,9 @@ venv.bak/
# ycm config
.ycm_extra_conf.py
# papermill outputs
output.ipynb
# don't save any data
image_classification/data/*

View file

@@ -0,0 +1,70 @@
from pathlib import Path
from zipfile import ZipFile
from typing import Union
from urllib.parse import urlparse
import requests
import os
import shutil


class Urls:
    # for now, the base url is hardcoded into the Urls class
    base = "https://cvbp.blob.core.windows.net/public/datasets/image_classification"

    # datasets
    fridge_objects = f"{base}/fridgeObjects.zip"
    food_101_subset = f"{base}/food101Subset.zip"
    flickr_logos_32_subset = f"{base}/flickrLogos32Subset.zip"
    lettuce = f"{base}/lettuce.zip"
    recycle = f"{base}/recycle_v3.zip"


def data_path() -> Path:
    """Get the path to the data directory."""
    return os.path.realpath(
        os.path.join(os.path.dirname(__file__), os.pardir, "data")
    )


def _get_file_name(url: str) -> str:
    """Get the file name portion of a url."""
    return urlparse(url).path.split("/")[-1]


def unzip_url(
    url: str,
    fpath: Union[Path, str] = data_path(),
    dest: Union[Path, str] = data_path(),
    overwrite: bool = False,
) -> Path:
    """
    Download the file at {url} to {fpath} and unzip it to {dest}.
    {fpath} and {dest} must be existing directories.
    Returns the path of the unzipped folder inside {dest}.
    """
    assert os.path.exists(fpath)
    assert os.path.exists(dest)

    fname = _get_file_name(url)
    fname_without_extension = fname.split(".")[0]

    # Remove any previous download/extraction, or fail if overwriting is not allowed.
    if os.path.exists(os.path.join(fpath, fname)):
        if overwrite:
            os.remove(os.path.join(fpath, fname))
        else:
            raise Exception(f"{fname} already exists in {fpath}.")
    if os.path.exists(os.path.join(dest, fname_without_extension)):
        if overwrite:
            shutil.rmtree(os.path.join(dest, fname_without_extension))
        else:
            raise Exception(f"{fname_without_extension} already exists in {dest}.")

    # Download the zip file ...
    r = requests.get(url)
    with open(os.path.join(fpath, fname), "wb") as f:
        f.write(r.content)

    # ... and extract it into {dest}.
    os.makedirs(os.path.join(dest, fname_without_extension))
    with ZipFile(os.path.join(fpath, fname), "r") as z:
        z.extractall(dest)

    return os.path.realpath(os.path.join(dest, fname_without_extension))
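
A minimal usage sketch (illustrative only, not part of the committed file), assuming the ic_utils package is importable from the repository root:

from ic_utils.datasets import Urls, unzip_url

# Downloads fridgeObjects.zip into image_classification/data and unzips it there.
fridge_path = unzip_url(Urls.fridge_objects, overwrite=True)
print(fridge_path)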

File diff suppressed because one or more lines are too long

View file

@@ -0,0 +1,178 @@
#!/usr/bin/env python
# coding: utf-8
# # Image Classification
# In this notebook, we will classify different kinds of beverages you might find in the fridge.
# Check the fastai version.
# In[1]:
import fastai
fastai.__version__
# Ensure edits to libraries are loaded and plotting is shown in the notebook.
# In[2]:
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
# Import fastai. For now, we'll import all (`import *`) so that we can easily use the different utilities provided by the fastai library.
# In[3]:
import sys
sys.path.append("../")
from fastai.vision import *
from fastai.metrics import error_rate, accuracy
from pathlib import Path
from ic_utils.datasets import Urls, unzip_url, data_path
# Set some parameters. We'll use the `unzip_url` helper function to download and unzip our data.
# In[4]:
DATA_PATH = unzip_url(Urls.fridge_objects, overwrite=True)
EPOCHS = 5
LEARNING_RATE = 1e-4
IMAGE_SIZE = 299
BATCH_SIZE = 16
ARCHITECTURE = models.resnet50
# ---
# ## File Structure for Image Classification
# In this notebook, we'll use images from the `fridge_objects` dataset, which has been downloaded and unzipped to `image_classification/data`.
#
# Let's set that directory as our `path` variable, which we'll use throughout the notebook, and check out what's inside:
# In[5]:
path = Path(DATA_PATH)
path.ls()
# You'll notice that we have four different folders inside:
# - `/milk_bottle`
# - `/carton`
# - `/water_bottle`
# - `/can`
# The most common data format for multiclass image classification is to have one folder per label, with that label's images inside:
#
# ```
# /images
# +-- can (class 1)
# | +-- image1.jpg
# | +-- image2.jpg
# | +-- ...
# +-- carton (class 2)
# | +-- image31.jpg
# | +-- image32.jpg
# | +-- ...
# +-- ...
# ```
#
# Good thing our data is already structured in that format!
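# As an optional sanity check (a sketch, not in the original notebook), we can count
# the images in each class folder directly with pathlib, assuming the images are .jpg files:
# In[ ]:
im_counts = {d.name: len(list(d.glob("*.jpg"))) for d in path.iterdir() if d.is_dir()}
print(im_counts)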
# ## Loading images with fast.ai
# To use fastai, we want to create an ImageDataBunch so that the library can easily use multiple images (mini-batches) during training time. We create an ImageDataBunch using fastai's [data_block API](https://docs.fast.ai/data_block.html).
# In[6]:
np.random.seed(42)
data = (ImageItemList.from_folder(path)
        .random_split_by_pct(valid_pct=0.2, seed=10)
        .label_from_folder()
        .transform(size=IMAGE_SIZE)
        .databunch(bs=BATCH_SIZE)
        .normalize(imagenet_stats))
# Let's take a look at our data using the databunch we created.
# In[7]:
data.show_batch(rows=3, figsize=(15,11))
# Let's see all available classes:
# In[8]:
print(f'number of classes: {data.c}')
print(data.classes)
# ## Training
# For the model, we use a convolutional neural network.
#
# When training a model, there are many hyperparameters to select, such as the learning rate, the model architecture, the layers to tune, and many more.
#
# With fastai, we can use the `create_cnn` function that allows us to specify the model architecture and a performance indicator (metric). At this point, we already benefit from transfer learning, since we download weights that were pretrained on ImageNet.
# In[9]:
learn = create_cnn(data, ARCHITECTURE, metrics=accuracy)
# Unfreeze our CNN so that we're training all the layers.
# In[10]:
learn.unfreeze()
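# A common way to choose LEARNING_RATE, rather than fixing it up front, is fastai's
# learning rate finder (an optional sketch, not in the original notebook):
# In[ ]:
learn.lr_find()
learn.recorder.plot()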
# We can now call the `fit` function to train the model.
# In[11]:
learn.fit(EPOCHS, LEARNING_RATE)
# ## Evaluating
# To evaluate our model, let's take a look at the accuracy on the validation set.
# In[12]:
_, metric = learn.validate(learn.data.valid_dl, metrics=[accuracy])
print(f'Accuracy on validation set: {float(metric)}')
# When evaluating our results, we want to see where the model makes mistakes and whether we can do better. So we're interested in the images that the model predicted incorrectly but with high confidence (the images with the highest loss).
# In[13]:
interp = ClassificationInterpretation.from_learner(learn)
# In[14]:
interp.plot_confusion_matrix()
# In[15]:
interp.plot_top_losses(9, figsize=(15,11))

View file

View file

@@ -1,4 +1,3 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
@@ -13,12 +12,14 @@ import datetime
import os
import pytest
def path_notebooks():
    """Returns the path of the notebooks folder"""
    return os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.path.pardir, "notebooks")
    )


@pytest.fixture(scope="module")
def notebooks():
    folder_notebooks = path_notebooks()
@@ -26,6 +27,9 @@ def notebooks():
    # Path for the notebooks
    paths = {
        "mnist": os.path.join(folder_notebooks, "mnist.ipynb"),
-       "simple": os.path.join(folder_notebooks, "simple.ipynb")
+       "simple": os.path.join(folder_notebooks, "simple.ipynb"),
+       "01_image_classification": os.path.join(
+           folder_notebooks, "01_Image_Classification.ipynb"
+       ),
    }
    return paths

View file

@@ -5,12 +5,15 @@
import os
import pytest
import papermill as pm
from ic_utils.datasets import Urls, unzip_url
from tests.conftest import path_notebooks

# Unless manually modified, "cvbp" should be the name of the jupyter kernel
# registered for the activated conda environment
-KERNEL_NAME = "python3"
+KERNEL_NAME = "cvbp"
OUTPUT_NOTEBOOK = "output.ipynb"
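# Note: if the "cvbp" kernel is not registered yet, one common way to do so
# (an assumption about the intended setup, not part of this change) is to run
# `python -m ipykernel install --user --name cvbp` from the activated cvbp environment.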
def test_simple_notebook_run(notebooks):
    notebook_path = notebooks["simple"]
    pm.execute_notebook(
@@ -20,6 +23,7 @@ def test_simple_notebook_run(notebooks):
        kernel_name=KERNEL_NAME,
    )


def test_mnist_notebook_run(notebooks):
    notebook_path = notebooks["mnist"]
    pm.execute_notebook(
@@ -27,4 +31,15 @@ def test_mnist_notebook_run(notebooks):
        OUTPUT_NOTEBOOK,
        parameters=dict(PM_VERSION=pm.__version__),
        kernel_name=KERNEL_NAME,
    )


def test_01_notebook_run(notebooks):
    notebook_path = notebooks["01_image_classification"]
    data_path = unzip_url(Urls.recycle, overwrite=True)
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        parameters=dict(PM_VERSION=pm.__version__, DATA_PATH=data_path),
        kernel_name=KERNEL_NAME,
    )

View file

@@ -0,0 +1,47 @@
import pytest
from ic_utils.datasets import Urls, unzip_url
import os
from pathlib import Path
import unittest
import shutil


class TestUnzipUrl(unittest.TestCase):
    """
    This class tests the unzip_url function
    """

    TEMP_DIR = "../tmp_data"

    def setUp(self):
        os.makedirs(self.TEMP_DIR, exist_ok=True)

    def tearDown(self):
        if os.path.exists(self.TEMP_DIR):
            shutil.rmtree(self.TEMP_DIR)

    def test_unzip_url_rel_path(self):
        """ Test unzip with relative path. """
        rel_path = Path(self.TEMP_DIR)
        data_path = unzip_url(
            Urls.lettuce, fpath=rel_path, dest=rel_path, overwrite=True
        )
        self.assertTrue(os.path.exists(os.path.join(rel_path, "lettuce.zip")))
        self.assertTrue(os.path.exists(os.path.join(rel_path, "lettuce")))
        self.assertEqual(
            os.path.realpath(os.path.join(rel_path, "lettuce")),
            os.path.realpath(data_path),
        )

    def test_unzip_url_abs_path(self):
        """ Test unzip with absolute path. """
        abs_path = Path(os.path.abspath(self.TEMP_DIR))
        data_path = unzip_url(
            Urls.lettuce, fpath=abs_path, dest=abs_path, overwrite=True
        )
        self.assertTrue(os.path.exists(os.path.join(abs_path, "lettuce.zip")))
        self.assertTrue(os.path.exists(os.path.join(abs_path, "lettuce")))
        self.assertEqual(
            os.path.realpath(os.path.join(abs_path, "lettuce")),
            os.path.realpath(data_path),
        )
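
A further check one might add (a sketch based on unzip_url's documented behavior, not part of this commit) is that a second download with overwrite=False refuses to clobber an existing zip:

    def test_unzip_url_overwrite_false(self):
        """ Test that unzip refuses to overwrite an existing download. """
        tmp_path = Path(self.TEMP_DIR)
        unzip_url(Urls.lettuce, fpath=tmp_path, dest=tmp_path, overwrite=True)
        # lettuce.zip now exists, so a non-overwriting call should raise.
        with self.assertRaises(Exception):
            unzip_url(Urls.lettuce, fpath=tmp_path, dest=tmp_path, overwrite=False)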