Add Mask R-CNN (#389)

2019-11-14 01:46:37 +08:00 · 2019-11-14 01:46:37 +08:00 · e7389bdb1c
--- a/scenarios/detection/00_webcam.ipynb
+++ b/scenarios/detection/00_webcam.ipynb
@ -78,7 +78,7 @@
    "from utils_cv.common.data import data_path\n",
    "from utils_cv.common.gpu import which_processor, is_windows\n",
    "from utils_cv.detection.data import coco_labels\n",
-    "from utils_cv.detection.model import _get_det_bboxes, DetectionLearner\n",
+    "from utils_cv.detection.model import DetectionLearner\n",
    "from utils_cv.detection.plot import PlotSettings, plot_boxes\n",
    "\n",
    "# Change matplotlib backend so that plots are shown for windows\n",
@ -145,8 +145,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "detector = DetectionLearner(model=model)\n",
-    "detector.add_labels(coco_labels()[1:]) # we use [1:] because the first element of the array is '__background__'"
+    "detector = DetectionLearner(\n",
+    "    model=model, \n",
+    "    labels=coco_labels()[1:],  # we use [1:] because the first element of the array is '__background__'\n",
+    ")"
   ]
  },
  {
@ -220,7 +222,7 @@
    }
   ],
   "source": [
-    "plot_boxes(im, detections, plot_settings=PlotSettings(rect_color=(0, 255, 0)))"
+    "plot_boxes(im, detections[\"det_bboxes\"], plot_settings=PlotSettings(rect_color=(0, 255, 0)))"
   ]
  },
  {
@ -285,7 +287,7 @@
    "        \n",
    "        # Process the captured image\n",
    "        detections = detector.predict(im)\n",
-    "        plot_boxes(im, detections, plot_settings=PlotSettings(rect_color=(0, 255, 0)))\n",
+    "        plot_boxes(im, detections[\"det_bboxes\"], plot_settings=PlotSettings(rect_color=(0, 255, 0)))\n",
    "        \n",
    "        # Convert the processed image back into the image widget for display\n",
    "        f = io.BytesIO()\n",
@ -400,7 +402,10 @@
   ],
   "source": [
    "# Preserve some of the notebook outputs\n",
-    "detections = [(x.label_idx, x.label_name, [(x.left, x.top), (x.right, x.bottom)]) for x in detections]\n",
+    "detections = [\n",
+    "    (x.label_idx, x.label_name, [(x.left, x.top), (x.right, x.bottom)]) \n",
+    "    for x in detections[\"det_bboxes\"]\n",
+    "]\n",
    "sb.glue(\"detection_bounding_box\", detections)"
   ]
  }
--- a/scenarios/detection/01_training_introduction.ipynb
+++ b/scenarios/detection/01_training_introduction.ipynb
@ -68,7 +68,7 @@
    "from utils_cv.detection.data import Urls\n",
    "from utils_cv.detection.dataset import DetectionDataset, get_transform\n",
    "from utils_cv.detection.plot import (\n",
-    "    display_bboxes,\n",
+    "    display_bboxes_mask,\n",
    "    plot_grid,\n",
    "    plot_boxes,\n",
    "    plot_pr_curves,\n",
@ -78,7 +78,6 @@
    ")\n",
    "from utils_cv.detection.model import (\n",
    "    DetectionLearner,\n",
-    "    _get_det_bboxes,\n",
    "    get_pretrained_fasterrcnn,\n",
    ")\n",
    "from utils_cv.common.gpu import which_processor, is_windows\n",
@ -1071,7 +1070,7 @@
   "source": [
    "plot_settings = PlotSettings(rect_color=(0, 255, 0))\n",
    "\n",
-    "display_bboxes(detections, new_im_path, plot_settings=plot_settings)"
+    "display_bboxes_mask(detections[\"det_bboxes\"], new_im_path, plot_settings=plot_settings)"
   ]
  },
  {
@ -1204,6 +1203,19 @@
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
  }
 },
 "nbformat": 4,
--- a/scenarios/detection/02_mask_rcnn.ipynb
+++ b/scenarios/detection/02_mask_rcnn.ipynb
--- a/scenarios/detection/11_exploring_hyperparameters_on_azureml.ipynb
+++ b/scenarios/detection/11_exploring_hyperparameters_on_azureml.ipynb
@ -357,7 +357,7 @@
    "print(f\"Average precision after each epoch: {detector.ap}\")\n",
    "\n",
    "# Get accuracy on test set at IOU=0.5:0.95\n",
-    "acc = float(detector.ap[-1])\n",
+    "acc = float(detector.ap[-1][\"bbox\"])\n",
    "\n",
    "# Add log entries\n",
    "run = Run.get_context()\n",
--- a/scenarios/detection/12_hard_negative_sampling.ipynb
+++ b/scenarios/detection/12_hard_negative_sampling.ipynb
@ -548,7 +548,7 @@
   ],
   "source": [
    "# Get validation accuracy on test set at IOU=0.5:0.95\n",
-    "acc = float(detector.ap[-1])\n",
+    "acc = float(detector.ap[-1][\"bbox\"])\n",
    "valid_accs.append(acc)\n",
    "\n",
    "# Plot validation accuracy versus number of hard-negative mining iterations\n",
--- a/scenarios/detection/media/mask-r-cnn-framework.png
+++ b/scenarios/detection/media/mask-r-cnn-framework.png
--- a/scenarios/detection/media/segmentaion_comparison.png
+++ b/scenarios/detection/media/segmentaion_comparison.png
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -7,16 +7,18 @@
 # you can move it to a conftest.py file. You don't need to import the fixture you want to use in a test, it
 # automatically gets discovered by pytest."

+import numpy as np
 import os
 import pytest
 import torch
 import urllib.request
 import random
+from PIL import Image
 from torch import tensor
 from pathlib import Path
 from fastai.vision import cnn_learner, models
 from fastai.vision.data import ImageList, imagenet_stats
-from typing import List
+from typing import List, Tuple
 from tempfile import TemporaryDirectory
 from utils_cv.common.data import unzip_url
 from utils_cv.classification.data import Urls as ic_urls
@ -25,6 +27,7 @@ from utils_cv.detection.bbox import DetectionBbox, AnnotationBbox
 from utils_cv.detection.dataset import DetectionDataset
 from utils_cv.detection.model import (
    get_pretrained_fasterrcnn,
+    get_pretrained_maskrcnn,
    DetectionLearner,
 )

@ -54,7 +57,7 @@ def path_similarity_notebooks():


 def path_detection_notebooks():
-    """ Returns the path of the similarity notebooks folder. """
+    """ Returns the path of the detection notebooks folder. """
    return os.path.abspath(
        os.path.join(
            os.path.dirname(__file__), os.path.pardir, "scenarios", "detection"
@ -136,6 +139,7 @@ def detection_notebooks():
    paths = {
        "00": os.path.join(folder_notebooks, "00_webcam.ipynb"),
        "01": os.path.join(folder_notebooks, "01_training_introduction.ipynb"),
+        "02": os.path.join(folder_notebooks, "02_mask_rcnn.ipynb"),
        "11": os.path.join(
            folder_notebooks, "11_exploring_hyperparameters_on_azureml.ipynb"
        ),
@ -350,6 +354,15 @@ def od_cup_path(tmp_session) -> str:
    return im_path


+@pytest.fixture(scope="session")
+def od_cup_mask_path(tmp_session) -> str:
+    """ Downloads the cup mask image (cvbp_cup_mask.png) to
+    `example_mask.png` inside `tmp_session` and returns that local path.
+    """
+    im_url = "https://cvbp.blob.core.windows.net/public/images/cvbp_cup_mask.png"
+    im_path = os.path.join(tmp_session, "example_mask.png")
+    urllib.request.urlretrieve(im_url, im_path)
+    return im_path
+
+
@pytest.fixture(scope="session")
 def od_cup_anno_bboxes(tmp_session, od_cup_path) -> List[AnnotationBbox]:
    return [
@ -359,7 +372,7 @@ def od_cup_anno_bboxes(tmp_session, od_cup_path) -> List[AnnotationBbox]:
            right=273,
            bottom=244,
            label_name="cup",
-            label_idx="0",
+            label_idx=0,
            im_path=od_cup_path,
        )
    ]
@ -374,13 +387,34 @@ def od_cup_det_bboxes(tmp_session, od_cup_path) -> List[DetectionBbox]:
            right=273,
            bottom=244,
            label_name="cup",
-            label_idx="0",
+            label_idx=0,
            im_path=od_cup_path,
            score=0.99,
        )
    ]


+@pytest.fixture(scope="session")
+def od_mask_rects() -> Tuple:
+    """ Returns synthetic masks, rectangles and an image for object detection.
+
+    The returned tuple is `(binary_masks, mask, rects, im)`:
+    - binary_masks: boolean array of shape (2, 100, 100), one layer per object
+    - mask: uint8 array of shape (100, 100) where the pixels of the two
+      objects are set to 1 and 2 respectively
+    - rects: one [left, top, right, bottom] rectangle per object
+    - im: a completely black 100x100 RGB PIL image
+    """
+    height = width = 100
+
+    mask = np.zeros((height, width), dtype=np.uint8)
+    mask[:10, :20] = 1
+    mask[20:40, 30:60] = 2
+    # corresponding binary masks of the mask above
+    # (builtin `bool` is used because the `np.bool` alias was removed in
+    # NumPy 1.24; the resulting dtype is the same)
+    binary_masks = np.zeros((2, height, width), dtype=bool)
+    binary_masks[0, :10, :20] = True
+    binary_masks[1, 20:40, 30:60] = True
+    # corresponding rectangles of the mask above
+    rects = [[0, 0, 19, 9], [30, 20, 59, 39]]
+    # a completely black image
+    im = Image.fromarray(np.zeros((height, width, 3), dtype=np.uint8))
+    return binary_masks, mask, rects, im
+
+
@pytest.fixture(scope="session")
 def tiny_od_data_path(tmp_session) -> str:
    """ Returns the path to the fridge object detection dataset. """
@ -393,13 +427,24 @@ def tiny_od_data_path(tmp_session) -> str:


@pytest.fixture(scope="session")
-def od_sample_im_anno(tiny_od_data_path) -> str:
+def tiny_od_mask_data_path(tmp_session) -> str:
+    """ Returns the path to the fridge object detection mask dataset.
+
+    The archive at `od_urls.fridge_objects_mask_tiny_path` is fetched and
+    unzipped through `unzip_url`, using `tmp_session` both as `fpath` and
+    as `dest`.
+    """
+    return unzip_url(
+        od_urls.fridge_objects_mask_tiny_path,
+        fpath=tmp_session,
+        dest=tmp_session,
+        exist_ok=True,
+    )
+
+
+@pytest.fixture(scope="session")
+def od_sample_im_anno(tiny_od_data_path) -> Tuple[Path, ...]:
    """ Returns an annotation and image path from the tiny_od_data_path fixture.
    Specifically, using the paths for 1.xml and 1.jpg
    """
    anno_path = Path(tiny_od_data_path) / "annotations" / "1.xml"
    im_path = Path(tiny_od_data_path) / "images" / "1.jpg"
-    return (anno_path, im_path)
+    return anno_path, im_path


@pytest.fixture(scope="session")
@ -435,39 +480,33 @@ def od_sample_raw_preds():


@pytest.fixture(scope="session")
-def od_sample_detection_bboxes():
-    return [
-        DetectionBbox.from_array(
-            [109.0, 190.0, 205.0, 408.0],
-            label_idx=3,
-            label_name="carton",
-            score=0.9985,
-        ),
-        DetectionBbox.from_array(
-            [340.0, 326.0, 465.0, 549.0],
-            label_idx=2,
-            label_name="milk_bottle",
-            score=0.9979,
-        ),
-        DetectionBbox.from_array(
-            [214.0, 181.0, 315.0, 460.0],
-            label_idx=1,
-            label_name="can",
-            score=0.9945,
-        ),
-        DetectionBbox.from_array(
-            [215.0, 193.0, 316.0, 471.0],
-            label_idx=2,
-            label_name="milk_bottle",
-            score=0.1470,
-        ),
-        DetectionBbox.from_array(
-            [109.0, 209.0, 209.0, 420.0],
-            label_idx=1,
-            label_name="can",
-            score=0.0903,
-        ),
+def od_sample_output():
+    """ Returns a synthetic raw prediction with 5 detections as a dict of
+    "boxes", "labels", "scores" and "masks" tensors.
+
+    The masks have shape (5, 1, 600, 500). Inside each box the top row is
+    set to 0.05 and all remaining rows to 0.7; every other pixel is 0.
+    """
+    width = 500
+    height = 600
+    boxes = [
+        [109.0, 190.0, 205.0, 408.0],
+        [340.0, 326.0, 465.0, 549.0],
+        [214.0, 181.0, 315.0, 460.0],
+        [215.0, 193.0, 316.0, 471.0],
+        [109.0, 209.0, 209.0, 420.0],
+    ]
+    labels = [3, 2, 1, 2, 1]
+    scores = [0.9985, 0.9979, 0.9945, 0.1470, 0.0903]
+    # construct masks
+    # (builtin `float` is used because the `np.float` alias was removed in
+    # NumPy 1.24; the resulting dtype is still float64)
+    masks = np.zeros((len(boxes), 1, height, width), dtype=float)
+    for rect, mask in zip(boxes, masks):
+        # `mask` is a view into `masks`, so the writes below fill it in place
+        left, top, right, bottom = [int(x) for x in rect]
+        # first line of the bounding box
+        mask[:, top, left:(right+1)] = 0.05
+        # other lines of the bounding box
+        mask[:, (top+1):(bottom+1), left:(right+1)] = 0.7
+
+    return {
+        "boxes": tensor(boxes, dtype=torch.float),
+        "labels": tensor(labels, dtype=torch.int64),
+        "scores": tensor(scores, dtype=torch.float),
+        "masks": tensor(masks),
+    }


@pytest.fixture(scope="session")
@ -476,6 +515,12 @@ def od_detection_dataset(tiny_od_data_path):
    return DetectionDataset(tiny_od_data_path)


+@pytest.fixture(scope="session")
+def od_detection_mask_dataset(tiny_od_mask_data_path):
+    """ Returns a basic `DetectionDataset` built from the tiny mask data,
+    constructed with `mask_dir="segmentation-masks"`.
+    """
+    return DetectionDataset(tiny_od_mask_data_path, mask_dir="segmentation-masks")
+
+
@pytest.mark.gpu
@pytest.fixture(scope="session")
 def od_detection_learner(od_detection_dataset):
@ -494,6 +539,24 @@ def od_detection_learner(od_detection_dataset):
    return learner


+@pytest.mark.gpu
+@pytest.fixture(scope="session")
+def od_detection_mask_learner(od_detection_mask_dataset):
+    """ Returns a basic detection mask learner (Mask R-CNN model) that has
+    been trained for one epoch on the mask dataset.
+    """
+    model = get_pretrained_maskrcnn(
+        # one more class than the dataset has labels
+        # (presumably for the background class - TODO confirm)
+        num_classes=len(od_detection_mask_dataset.labels) + 1,
+        min_size=100,
+        max_size=200,
+        rpn_pre_nms_top_n_train=500,
+        rpn_pre_nms_top_n_test=250,
+        rpn_post_nms_top_n_train=500,
+        rpn_post_nms_top_n_test=250,
+    )
+    learner = DetectionLearner(od_detection_mask_dataset, model=model)
+    learner.fit(1)
+    return learner
+
+
@pytest.mark.gpu
@pytest.fixture(scope="session")
 def od_detection_eval(od_detection_learner):
@ -501,6 +564,13 @@ def od_detection_eval(od_detection_learner):
    return od_detection_learner.evaluate()


+@pytest.mark.gpu
+@pytest.fixture(scope="session")
+def od_detection_mask_eval(od_detection_mask_learner):
+    """ Returns the result of `evaluate()` for the detection mask learner
+    provided by the `od_detection_mask_learner` fixture.
+    """
+    return od_detection_mask_learner.evaluate()
+
+
@pytest.mark.gpu
@pytest.fixture(scope="session")
 def od_detections(od_detection_dataset):
--- a/tests/integration/detection/test_integration_detection.py
+++ b/tests/integration/detection/test_integration_detection.py
@ -23,9 +23,39 @@ def test_01_notebook_run(detection_notebooks):
    )

    nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
-    assert len(nb_output.scraps["training_losses"].data) == epochs
-    assert nb_output.scraps["training_losses"].data[-1] < 0.5
-    assert nb_output.scraps["training_average_precision"].data[-1] > 0.5
+    training_losses = nb_output.scraps["training_losses"].data
+    assert len(training_losses) == epochs
+    assert training_losses[-1] < 0.5
+    training_aps = nb_output.scraps["training_average_precision"].data
+    assert len(training_aps) == epochs
+    for d in training_aps[-1].values():
+        assert d > 0.5
+
+
+@pytest.mark.notebooks
+@pytest.mark.linuxgpu
+def test_02_notebook_run(detection_notebooks):
+    """ Runs the "02" detection notebook for 5 epochs and checks the recorded
+    training losses and average precisions.
+    """
+    epochs = 5
+    pm.execute_notebook(
+        detection_notebooks["02"],
+        OUTPUT_NOTEBOOK,
+        parameters=dict(PM_VERSION=pm.__version__, EPOCHS=epochs),
+        kernel_name=KERNEL_NAME,
+    )
+
+    scraps = sb.read_notebook(OUTPUT_NOTEBOOK).scraps
+    losses = scraps["training_losses"].data
+    assert len(losses) == epochs
+    assert losses[-1] < 0.85
+    aps = scraps["training_average_precision"].data
+    assert len(aps) == epochs
+    # every average precision recorded for the last epoch must clear the bar
+    for ap in aps[-1].values():
+        assert ap > 0.15
+

@pytest.mark.notebooks
@pytest.mark.linuxgpu
--- a/tests/unit/common/test_common_plot.py
+++ b/tests/unit/common/test_common_plot.py
@ -29,7 +29,7 @@ def test_line_graph():

 def test_show_ims(tiny_ic_data_path):
    # Naive test to run the function without errors
-    ims = [str(i) for i in Path(tiny_ic_data_path).glob("*.*")]
+    ims = [str(i) for i in Path(tiny_ic_data_path).glob("**/*.*")]
    show_ims(ims)
    plt.close()

--- a/tests/unit/detection/test_detection_bbox.py
+++ b/tests/unit/detection/test_detection_bbox.py
@ -2,11 +2,12 @@
 # Licensed under the MIT License.

 import pytest
+from typing import List, Optional

 from utils_cv.detection.bbox import DetectionBbox, AnnotationBbox, _Bbox, bboxes_iou


-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def basic_bbox() -> "_Bbox":
    return _Bbox(left=0, top=10, right=100, bottom=1000)

@ -23,11 +24,27 @@ def det_bbox() -> "DetectionBbox":
    )


-def validate_bbox(bbox: _Bbox) -> bool:
-    assert bbox.left == 0
-    assert bbox.top == 10
-    assert bbox.right == 100
-    assert bbox.bottom == 1000
+def validate_bbox(
+    bbox: _Bbox,
+    rect: Optional[List[int]] = None
+) -> None:
+    """ Asserts that the coordinates of `bbox` equal `rect`, given as
+    [left, top, right, bottom]; `rect` defaults to [0, 10, 100, 1000].
+    """
+    if rect is None:
+        rect = [0, 10, 100, 1000]
+    assert [bbox.left, bbox.top, bbox.right, bbox.bottom] == rect
+
+
+def validate_anno_bbox(
+    bbox: AnnotationBbox,
+    label_idx: int,
+    rect: Optional[List[int]] = None,
+    im_path: Optional[str] = None,
+    label_name: Optional[str] = None
+) -> None:
+    """ Asserts that `bbox` is exactly an `AnnotationBbox` with the given
+    coordinates (see `validate_bbox`), label index, image path and label name.
+    """
+    validate_bbox(bbox, rect)
+    assert type(bbox) == AnnotationBbox
+    assert bbox.label_idx == label_idx
+    assert bbox.im_path == im_path
+    assert bbox.label_name == label_name


 def text__bbox_init(basic_bbox):
@ -35,12 +52,12 @@ def text__bbox_init(basic_bbox):
    validate_bbox(basic_bbox)


-def test__bbox_from_array(basic_bbox):
+def test__bbox_from_array():
    # test `from_array()` bbox initialization method
    bbox_from_array = _Bbox.from_array([0, 10, 100, 1000])
    validate_bbox(bbox_from_array)
-    # test `from_array_xymh()` bbox initialization method
-    bbox_from_array_xywh = _Bbox.from_array_xywh([0, 10, 100, 990])
+    # test `from_array_xywh()` bbox initialization method
+    bbox_from_array_xywh = _Bbox.from_array_xywh([0, 10, 101, 991])
    validate_bbox(bbox_from_array_xywh)


@ -91,16 +108,14 @@ def test__bbox_is_valid(basic_bbox):


 def test_annotation_bbox_init(anno_bbox):
-    validate_bbox(anno_bbox)
-    assert type(anno_bbox) == AnnotationBbox
+    validate_anno_bbox(anno_bbox, label_idx=0)


-def test_annotation_bbox_from_array(anno_bbox):
+def test_annotation_bbox_from_array():
    bbox_from_array = AnnotationBbox.from_array(
        [0, 10, 100, 1000], label_idx=0
    )
-    validate_bbox(bbox_from_array)
-    assert type(bbox_from_array) == AnnotationBbox
+    validate_anno_bbox(bbox_from_array, label_idx=0)


 def test_detection_bbox_init(det_bbox):
--- a/tests/unit/detection/test_detection_dataset.py
+++ b/tests/unit/detection/test_detection_dataset.py
@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

+import numpy as np
 import pytest
 import torch
 from pathlib import Path
@ -23,6 +24,9 @@ def basic_im(od_cup_path) -> Tuple[Image.Image, dict]:

    boxes = torch.as_tensor([[61, 59, 273, 244]], dtype=torch.float32)
    labels = torch.as_tensor([[0]], dtype=torch.int64)
+    # synthetic 500x500 mask with one 100x100 foreground square; builtin
+    # `bool` is used because the `np.bool` alias was removed in NumPy 1.24
+    masks = np.zeros((500, 500), dtype=bool)
+    masks[100:200, 100:200] = True
+    masks = torch.as_tensor(masks, dtype=torch.uint8)

    target = {
        "boxes": boxes,
@ -30,13 +34,14 @@ def basic_im(od_cup_path) -> Tuple[Image.Image, dict]:
        "image_id": None,
        "area": None,
        "iscrowd": False,
+        "masks": masks,
    }

-    return (im, target)
+    return im, target


@pytest.fixture(scope="session")
-def od_sample_bboxes() -> str:
+def od_sample_bboxes() -> List[_Bbox]:
    """ Returns the true bboxes from the `od_sample_im_anno` fixture. """
    return [_Bbox(left=100, top=173, right=233, bottom=521)]

@ -75,23 +80,45 @@ def test_parse_pascal_voc(od_sample_im_anno, od_sample_bboxes):


 def validate_detection_dataset(data: DetectionDataset, labels: List[str]):
-    assert len(data) == 39
+    """ Checks the size, type and labels of a tiny fridge-objects dataset.
+
+    39 items are expected when `data.mask_paths` is None and 31 otherwise;
+    when masks are present there must be one mask path per image path.
+    """
+    # The parentheses are required: without them the statement parses as
+    # `assert (len(data) == 39) if ... else 31`, which asserts the truthy
+    # constant 31 for mask datasets and never checks their length.
+    assert len(data) == (39 if data.mask_paths is None else 31)
    assert type(data) == DetectionDataset
    assert len(data.labels) == 4
    for label in data.labels:
        assert label in labels

+    if data.mask_paths:
+        assert len(data.mask_paths) == len(data.im_paths)

-def test_detection_dataset_init_basic(tiny_od_data_path, od_data_path_labels):
+
+def test_detection_dataset_init_basic(
+    tiny_od_data_path,
+    od_data_path_labels,
+    tiny_od_mask_data_path
+):
    """ Tests that initialization of the Detection Dataset works. """
    data = DetectionDataset(tiny_od_data_path)
    validate_detection_dataset(data, od_data_path_labels)
    assert len(data.test_ds) == 19
    assert len(data.train_ds) == 20

+    # test random seed
+    data = DetectionDataset(tiny_od_data_path, seed=9)
+    data2 = DetectionDataset(tiny_od_data_path, seed=9)
+    assert data.train_dl.dataset.indices == data2.train_dl.dataset.indices
+    assert data.test_dl.dataset.indices == data2.test_dl.dataset.indices
+
+    # test mask data
+    data = DetectionDataset(
+        tiny_od_mask_data_path,
+        mask_dir="segmentation-masks"
+    )
+    validate_detection_dataset(data, od_data_path_labels)
+    assert len(data.test_ds) == 15
+    assert len(data.train_ds) == 16
+

 def test_detection_dataset_init_train_pct(
-    tiny_od_data_path, od_data_path_labels
+    tiny_od_data_path, od_data_path_labels, tiny_od_mask_data_path
 ):
    """ Tests that initialization with train_pct."""
    data = DetectionDataset(tiny_od_data_path, train_pct=0.75)
@ -99,15 +126,33 @@ def test_detection_dataset_init_train_pct(
    assert len(data.test_ds) == 9
    assert len(data.train_ds) == 30

+    # test mask data
+    data = DetectionDataset(
+        tiny_od_mask_data_path,
+        train_pct=0.75,
+        mask_dir="segmentation-masks"
+    )
+    validate_detection_dataset(data, od_data_path_labels)
+    assert len(data.test_ds) == 7
+    assert len(data.train_ds) == 24

-def test_detection_dataset_show_ims(basic_detection_dataset):
+
+def test_detection_dataset_show_ims(
+    basic_detection_dataset,
+    od_detection_mask_dataset
+):
    # simply test that this is error free for now
    basic_detection_dataset.show_ims()
+    od_detection_mask_dataset.show_ims()


-def test_detection_dataset_show_im_transformations(basic_detection_dataset):
+def test_detection_dataset_show_im_transformations(
+    basic_detection_dataset,
+    od_detection_mask_dataset
+):
    # simply test that this is error free for now
    basic_detection_dataset.show_im_transformations()
+    od_detection_mask_dataset.show_im_transformations()


 def test_detection_dataset_init_anno_im_dirs(
--- a/tests/unit/detection/test_detection_mask.py
+++ b/tests/unit/detection/test_detection_mask.py
@ -0,0 +1,47 @@
+import numpy as np
+from utils_cv.detection.mask import (
+    binarise_mask,
+    colorise_binary_mask,
+    transparentise_mask,
+    merge_binary_masks,
+)
+
+
+def test_binarise_mask(od_mask_rects):
+    """ Test that `binarise_mask` splits the merged mask into the expected
+    binary masks.
+    """
+    expected_binary_masks, merged_mask = od_mask_rects[:2]
+    assert np.all(binarise_mask(merged_mask) == expected_binary_masks)
+
+
+def test_colorise_binary_mask(od_mask_rects):
+    """ Test that `colorise_binary_mask` paints the foreground color where
+    the binary mask is set and leaves every other pixel at 0.
+    """
+    # only the first of the two binary masks is used
+    (binary_mask, _), _, _, _ = od_mask_rects
+    foreground = 9
+    background = 0
+    colored_mask = colorise_binary_mask(
+        binary_mask,
+        color=(foreground, foreground, foreground)
+    )
+    # move the channel axis first so that each channel is checked separately
+    for ch in colored_mask.transpose((2, 0, 1)):
+        assert np.all(ch[binary_mask] == foreground)
+        # `~binary_mask` selects the background of the boolean mask
+        assert np.all(ch[~binary_mask] == background)
+
+
+def test_transparentise_mask(od_mask_rects):
+    """ Test that `transparentise_mask` keeps foreground pixels different
+    from the background value and leaves background pixels at 0.
+    """
+    # only the first of the two binary masks is used
+    (binary_mask, _), _, _, _ = od_mask_rects
+    foreground = 9
+    background = 0
+    colored_mask = colorise_binary_mask(
+        binary_mask,
+        color=(foreground, foreground, foreground)
+    )
+    transparent_mask = transparentise_mask(colored_mask, alpha=0.7)
+    assert np.all(transparent_mask[binary_mask] != background)
+    # `~binary_mask` selects the background of the boolean mask
+    assert np.all(transparent_mask[~binary_mask] == background)
+
+
+def test_merge_binary_masks(od_mask_rects):
+    """ Test that `merge_binary_masks` combines the binary masks into the
+    expected single mask.
+    """
+    layered_masks, expected_mask = od_mask_rects[:2]
+    assert np.all(merge_binary_masks(layered_masks) == expected_mask)
--- a/tests/unit/detection/test_detection_model.py
+++ b/tests/unit/detection/test_detection_model.py
@ -2,41 +2,39 @@
 # Licensed under the MIT License.

 from torchvision.models.detection.faster_rcnn import FasterRCNN
+from torchvision.models.detection.mask_rcnn import MaskRCNN
 from collections.abc import Iterable
 import numpy as np
 import pytest
 import shutil
 from pathlib import Path
-from typing import Union
+from typing import Tuple

-from utils_cv.detection.bbox import DetectionBbox
 from utils_cv.detection.model import (
    get_pretrained_fasterrcnn,
+    get_pretrained_maskrcnn,
    DetectionLearner,
-    _get_det_bboxes,
    _apply_threshold,
    _calculate_ap,
    ims_eval_detections,
 )


-def test__get_det_bboxes(od_sample_raw_preds, od_data_path_labels):
-    """ test that `_get_det_bboxes` can convert raw preds to DetectionBboxes. """
-    det_bboxes = _get_det_bboxes(
-        od_sample_raw_preds, labels=od_data_path_labels, im_path=None
-    )
-    assert type(det_bboxes[0]) == DetectionBbox
-    assert len(det_bboxes) == 5
-
-
-def test__apply_threshold(od_sample_detection_bboxes):
+def test__apply_threshold(od_sample_output):
    """ Test `_apply_threshold` and verify it works at different thresholds. """
-    det_bboxes = _apply_threshold(od_sample_detection_bboxes, threshold=0.5)
-    assert len(det_bboxes) == 3
-    det_bboxes = _apply_threshold(od_sample_detection_bboxes, threshold=0.01)
-    assert len(det_bboxes) == 5
-    det_bboxes = _apply_threshold(od_sample_detection_bboxes, threshold=0.995)
-    assert len(det_bboxes) == 2
+    # test cases: [(threshold, num, mask_pixels)]
+    test_cases = [
+        (0.5, 3, (21146, 28098, 28458)),
+        (0.01, 5, (21146, 28098, 28458, 28356, 21311)),
+        (0.995, 2, (21146, 28098)),
+    ]
+    res = {k: v.detach().cpu().numpy() for k, v in od_sample_output.items()}
+    for threshold, num, mask_pixels in test_cases:
+        pred = _apply_threshold(res, threshold=threshold)
+        for v in pred.values():
+            assert len(v) == num
+        for mask, num_pixels in zip(pred["masks"], mask_pixels):
+            assert np.sum(mask) == num_pixels


 def test_get_pretrained_fasterrcnn():
@ -44,11 +42,18 @@ def test_get_pretrained_fasterrcnn():
    assert type(get_pretrained_fasterrcnn(4)) == FasterRCNN


+def test_get_pretrained_maskrcnn():
+    """ Simply test that `get_pretrained_maskrcnn(4)` returns an object whose
+    type is exactly `MaskRCNN` for now.
+    """
+    assert type(get_pretrained_maskrcnn(4)) == MaskRCNN
+
+
@pytest.mark.gpu
 def test__calculate_ap(od_detection_eval):
    """ Test `_calculate_ap`. """
    ret = _calculate_ap(od_detection_eval)
-    assert type(ret) == np.float64
+    assert type(ret) == dict
+    for v in ret.values():
+        assert type(v) == np.float64


@pytest.mark.gpu
@ -97,43 +102,114 @@ def test_detection_learner_init_model(od_detection_dataset):


@pytest.mark.gpu
-def test_detection_learner_train_one_epoch(od_detection_learner):
+def test_detection_learner_train_one_epoch(
+    od_detection_learner,
+):
    """ Simply test that a small training loop works. """
    od_detection_learner.fit(epochs=1)


@pytest.mark.gpu
-def test_detection_learner_plot_precision_loss_curves(od_detection_learner):
+def test_detection_mask_learner_train_one_epoch(
+    od_detection_mask_learner,
+):
+    """ Simply test that a small training loop works for mask learner. """
+    od_detection_mask_learner.fit(epochs=1)
+
+
+@pytest.mark.gpu
+def test_detection_learner_plot_precision_loss_curves(
+    od_detection_learner,
+):
    """ Simply test that `plot_precision_loss_curves` works. """
    od_detection_learner.plot_precision_loss_curves()


@pytest.mark.gpu
-def test_detection_learner_evalute(od_detection_learner):
+def test_detection_mask_learner_plot_precision_loss_curves(
+    od_detection_mask_learner,
+):
+    """ Simply test that `plot_precision_loss_curves` works for mask learner. """
+    # test mask learner
+    od_detection_mask_learner.plot_precision_loss_curves()
+
+
+@pytest.mark.gpu
+def test_detection_learner_evaluate(
+    od_detection_learner,
+):
    """ Simply test that `evaluate` works. """
    od_detection_learner.evaluate()


@pytest.mark.gpu
-def test_detection_learner_predict(od_detection_learner, od_cup_path):
+def test_detection_mask_learner_evaluate(
+    od_detection_mask_learner,
+):
+    """ Simply test that `evaluate` works for mask learner. """
+    od_detection_mask_learner.evaluate()
+
+
+@pytest.mark.gpu
+def test_detection_learner_predict(
+    od_detection_learner,
+    od_cup_path,
+):
    """ Simply test that `predict` works. """
-    bboxes = od_detection_learner.predict(od_cup_path)
+    bboxes = od_detection_learner.predict(od_cup_path)["det_bboxes"]
    assert type(bboxes) == list


@pytest.mark.gpu
-def test_detection_learner_predict_threshold(
-    od_detection_learner, od_cup_path
+def test_detection_mask_learner_predict(
+    od_detection_mask_learner,
+    od_cup_path,
 ):
-    """ Simply test that `predict` works with a threshold by setting a really high threshold. """
-    bboxes = od_detection_learner.predict(od_cup_path, threshold=0.9999)
+    """ Simply test that `predict` works for mask learner. """
+    pred = od_detection_mask_learner.predict(
+        od_cup_path, threshold=0.1
+    )
+    bboxes = pred["det_bboxes"]
+    masks = pred["masks"]
+    assert type(bboxes) == list
+    assert type(masks) == np.ndarray
+    assert len(bboxes) == len(masks)
+
+
+@pytest.mark.gpu
+def test_detection_learner_predict_threshold(
+    od_detection_learner,
+    od_cup_path,
+):
+    """ Simply test that `predict` works with a threshold by setting a really
+    high threshold.
+    """
+    bboxes = od_detection_learner.predict(od_cup_path, threshold=0.9999)["det_bboxes"]
    assert type(bboxes) == list
    assert len(bboxes) == 0


+@pytest.mark.gpu
+def test_detection_mask_learner_predict_threshold(
+    od_detection_mask_learner,
+    od_cup_path,
+):
+    """ Checks that `predict` of the mask learner returns neither boxes nor
+    masks when the threshold is set really high.
+    """
+    result = od_detection_mask_learner.predict(od_cup_path, threshold=0.9999)
+    det_bboxes, det_masks = result["det_bboxes"], result["masks"]
+    assert type(det_bboxes) == list
+    assert type(det_masks) == np.ndarray
+    assert len(det_bboxes) == len(det_masks)
+    assert len(det_bboxes) == 0
+
+
@pytest.mark.gpu
 def test_detection_learner_predict_batch(
-    od_detection_learner, od_detection_dataset
+    od_detection_learner,
+    od_detection_dataset,
 ):
    """ Simply test that `predict_batch` works. """
    generator = od_detection_learner.predict_batch(
@ -143,25 +219,65 @@ def test_detection_learner_predict_batch(


@pytest.mark.gpu
-def test_detection_learner_predict_batch_threshold(
-    od_detection_learner, od_detection_dataset
+def test_detection_mask_learner_predict_batch(
+    od_detection_mask_learner,
+    od_detection_mask_dataset,
 ):
-    """ Simply test that `predict_batch` works with a threshold by setting it really high.. """
+    """ Simply test that `predict_batch` works for mask learner. """
+    generator = od_detection_mask_learner.predict_batch(
+        od_detection_mask_dataset.test_dl
+    )
+    assert isinstance(generator, Iterable)
+
+
+@pytest.mark.gpu
+def test_detection_learner_predict_batch_threshold(
+    od_detection_learner,
+    od_detection_dataset,
+):
+    """ Simply test that `predict_batch` works with a threshold by setting it
+    really high.
+    """
    generator = od_detection_learner.predict_batch(
        od_detection_dataset.test_dl, threshold=0.9999
    )
    assert isinstance(generator, Iterable)


+@pytest.mark.gpu
+def test_detection_mask_learner_predict_batch_threshold(
+    od_detection_mask_learner,
+    od_detection_mask_dataset,
+):
+    """ Checks that `predict_batch` of the mask learner returns an iterable
+    when it is called with a really high threshold.
+    """
+    batch_preds = od_detection_mask_learner.predict_batch(
+        od_detection_mask_dataset.test_dl, threshold=0.9999
+    )
+    assert isinstance(batch_preds, Iterable)
+
+
@pytest.mark.gpu
 def test_detection_dataset_predict_dl(
-    od_detection_learner, od_detection_dataset
+    od_detection_learner,
+    od_detection_dataset,
 ):
    """ Simply test that `predict_dl` works. """
    od_detection_learner.predict_dl(od_detection_dataset.test_dl)


-def validate_saved_model(name: str, path: str) -> bool:
+@pytest.mark.gpu
+def test_detection_mask_dataset_predict_dl(
+    od_detection_mask_learner,
+    od_detection_mask_dataset,
+):
+    """ Simply test that `predict_dl` runs for the mask learner on the test
+    dataloader of the mask dataset; the result is not inspected.
+    """
+    od_detection_mask_learner.predict_dl(od_detection_mask_dataset.test_dl)
+
+
+def validate_saved_model(name: str, path: str) -> None:
    """ Tests that saved model is there """
    assert (Path(path)).exists()
    assert (Path(path) / name).exists()
@ -191,7 +307,7 @@ def test_detection_save_model(od_detection_learner, tiny_od_data_path):

@pytest.mark.gpu
@pytest.fixture(scope="session")
-def saved_model(od_detection_learner, tiny_od_data_path) -> Union[str, Path]:
+def saved_model(od_detection_learner, tiny_od_data_path) -> Tuple[str, Path]:
    """ A saved model so that loading functions can reuse. """
    model_name = "test_fixture_model"
    od_detection_learner.save(model_name)
--- a/tests/unit/detection/test_detection_notebooks.py
+++ b/tests/unit/detection/test_detection_notebooks.py
@ -47,9 +47,36 @@ def test_01_notebook_run(detection_notebooks, tiny_od_data_path):

    nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
    assert len(nb_output.scraps["training_losses"].data) > 0
-    assert len(nb_output.scraps["training_average_precision"].data) > 0
+    training_aps = nb_output.scraps["training_average_precision"].data
+    assert len(training_aps) > 0
+    for d in training_aps:
+        assert isinstance(d, dict)
+    assert len(set([len(d) for d in training_aps])) == 1
+
+
+@pytest.mark.gpu
+@pytest.mark.notebooks
+def test_02_notebook_run(detection_notebooks, tiny_od_mask_data_path):
+    notebook_path = detection_notebooks["02"]
+    pm.execute_notebook(
+        notebook_path,
+        OUTPUT_NOTEBOOK,
+        parameters=dict(
+            PM_VERSION=pm.__version__,
+            DATA_PATH=tiny_od_mask_data_path,
+            EPOCHS=1,
+        ),
+        kernel_name=KERNEL_NAME,
+    )
+    nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
+    assert len(nb_output.scraps["training_losses"].data) > 0
+    training_aps = nb_output.scraps["training_average_precision"].data
+    assert len(training_aps) > 0
+    for d in training_aps:
+        assert isinstance(d, dict)
+    assert len(set([len(d) for d in training_aps])) == 1
+

-    
@pytest.mark.gpu
@pytest.mark.notebooks
 def test_12_notebook_run(detection_notebooks, tiny_od_data_path):
@ -69,4 +96,3 @@ def test_12_notebook_run(detection_notebooks, tiny_od_data_path):
    nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
    assert len(nb_output.scraps["valid_accs"].data) == 1
    assert len(nb_output.scraps["hard_im_scores"].data) == 10
-    
--- a/tests/unit/detection/test_detection_plot.py
+++ b/tests/unit/detection/test_detection_plot.py
@ -4,11 +4,11 @@
 import pytest
 from PIL import Image
 import matplotlib.pyplot as plt
+import numpy as np

 from utils_cv.detection.plot import (
    PlotSettings,
    plot_boxes,
-    display_bboxes,
    plot_grid,
    plot_detection_vs_ground_truth,
    _setup_pr_axes,
@ -17,6 +17,8 @@ from utils_cv.detection.plot import (
    _plot_pr_curve_iou_mean,
    plot_pr_curves,
    plot_counts_curves,
+    plot_mask,
+    display_bboxes_mask,
 )


@ -36,6 +38,8 @@ def test_plot_setting_init(basic_plot_settings):
    assert basic_plot_settings.rect_color is not None
    assert basic_plot_settings.text_size is not None
    assert basic_plot_settings.text_color is not None
+    assert basic_plot_settings.mask_color is not None
+    assert basic_plot_settings.mask_alpha is not None


 def test_plot_boxes(od_cup_path, od_cup_anno_bboxes, basic_plot_settings):
@ -51,29 +55,58 @@ def test_plot_boxes(od_cup_path, od_cup_anno_bboxes, basic_plot_settings):
    )


-def test_display_bboxes(od_cup_anno_bboxes, od_cup_path):
-    """ Test that `display_bboxes` works. """
-    display_bboxes(bboxes=od_cup_anno_bboxes, im_path=od_cup_path)
+def test_plot_mask(od_mask_rects):
+    """ Test that `plot_mask` works. """
+    plot_setting = PlotSettings()
+    _, mask, rects, im = od_mask_rects
+    # plot mask
+    im = plot_mask(im, mask, plot_settings=plot_setting).convert('RGB')
+    im = np.transpose(np.array(im), (2, 0, 1))
+    # validate each channel matches the mask
+    for ch in im:
+        ch_uniques = np.unique(ch)
+        foreground_uniques = np.unique(ch[np.where(mask != 0)])
+        assert len(foreground_uniques) == 1
+        assert foreground_uniques[0] == ch_uniques[1]
+        background_uniques = np.unique(ch[np.where(mask == 0)])
+        assert len(background_uniques) == 1
+        assert background_uniques[0] == ch_uniques[0]


-def test_plot_grid(od_cup_anno_bboxes, od_cup_path):
+def test_display_bboxes_mask(
+    od_cup_anno_bboxes,
+    od_cup_path,
+    od_cup_mask_path,
+    basic_ax
+):
+    """ Test that `display_bboxes_mask` works. """
+    display_bboxes_mask(
+        bboxes=od_cup_anno_bboxes,
+        im_path=od_cup_path,
+        mask_path=od_cup_mask_path,
+        ax=basic_ax
+    )
+
+
+def test_plot_grid(od_cup_anno_bboxes, od_cup_path, od_cup_mask_path):
    """ Test that `plot_grid` works. """

    # test callable args
    def callable_args():
-        return od_cup_anno_bboxes, od_cup_path
+        return od_cup_anno_bboxes, od_cup_path, od_cup_mask_path

-    plot_grid(display_bboxes, callable_args, rows=1)
+    plot_grid(display_bboxes_mask, callable_args, rows=1)

    # test iterable args
    od_cup_paths = [od_cup_path, od_cup_path, od_cup_path]
    od_cup_annos = [od_cup_anno_bboxes, od_cup_anno_bboxes, od_cup_anno_bboxes]
+    od_cup_mask_paths = [od_cup_mask_path, None, od_cup_mask_path]

    def iterator_args():
-        for path, bboxes in zip(od_cup_paths, od_cup_annos):
-            yield bboxes, path
+        for path, bboxes, mask_path in zip(od_cup_paths, od_cup_annos, od_cup_mask_paths):
+            yield bboxes, path, mask_path

-    plot_grid(display_bboxes, iterator_args(), rows=1)
+    plot_grid(display_bboxes_mask, iterator_args(), rows=1)


 def test_plot_detection_vs_ground_truth(
@ -111,12 +144,13 @@ def test__plot_pr_curve_iou_mean(od_detection_eval, basic_ax):


@pytest.mark.gpu
-def test_plot_pr_curves(od_detection_eval):
+def test_plot_pr_curves(od_detection_eval, od_detection_mask_eval):
    """ Test that `plot_pr_curves` works. """
    plot_pr_curves(od_detection_eval)
+    plot_pr_curves(od_detection_mask_eval)


@pytest.mark.gpu
 def test_plot_counts_curves(od_detection_dataset, od_detections):
    """ Test that `plot_counts_curves` works. """
-    plot_counts_curves(od_detections, od_detection_dataset.test_ds, od_detections)
+    plot_counts_curves(od_detections, od_detection_dataset.test_ds, od_detections)
--- a/utils_cv/common/data.py
+++ b/utils_cv/common/data.py
@ -40,7 +40,7 @@ def get_files_in_directory(
        filenames = [
            s for s in filenames if s.lower().endswith(tuple(suffixes))
        ]
-    return filenames
+    return sorted(filenames)


 def _get_file_name(url: str) -> str:
--- a/utils_cv/detection/bbox.py
+++ b/utils_cv/detection/bbox.py
@ -25,14 +25,14 @@ class _Bbox:
        self.standardize()

    @classmethod
-    def from_array(cls, arr: List[int]) -> "Bbox":
+    def from_array(cls, arr: List[int]) -> "_Bbox":
        """ Create a Bbox object from an array [left, top, right, bottom] """
        return _Bbox(arr[0], arr[1], arr[2], arr[3])

    @classmethod
-    def from_array_xywh(cls, arr: List[int]) -> "Bbox":
-        """ create a Bbox object from an array [left, top, width, height] """
-        return _Bbox(arr[0], arr[1], arr[0] + arr[2], arr[1] + arr[3])
+    def from_array_xywh(cls, arr: List[int]) -> "_Bbox":
+        """ Create a Bbox object from an array [left, top, width, height] """
+        return _Bbox(arr[0], arr[1], arr[0] + arr[2] - 1, arr[1] + arr[3] - 1)

    def __str__(self):
        return f"""\
@ -65,7 +65,7 @@ bottom={self.bottom}]\
    def surface_area(self) -> float:
        return self.width() * self.height()

-    def get_overlap_bbox(self, bbox: "Bbox") -> Union[None, "Bbox"]:
+    def get_overlap_bbox(self, bbox: "_Bbox") -> Union[None, "_Bbox"]:
        left1, top1, right1, bottom1 = self.rect()
        left2, top2, right2, bottom2 = bbox.rect()
        overlap_left = max(left1, left2)
@ -92,7 +92,7 @@ bottom={self.bottom}]\
        self.right = right_new
        self.bottom = bottom_new

-    def crop(self, max_width: int, max_height: int) -> "Bbox":
+    def crop(self, max_width: int, max_height: int) -> "_Bbox":
        if max_height > self.height():
            raise Exception("crop height cannot be bigger than bbox height.")
        if max_width > self.width():
@ -181,12 +181,12 @@ class DetectionBbox(AnnotationBbox):
        self.score = score

    @classmethod
-    def from_array(
-        cls, arr: List[int], score: float, **kwargs
-    ) -> "DetectionBbox":
+    def from_array(cls, arr: List[int], **kwargs) -> "DetectionBbox":
        """ Create a Bbox object from an array [left, top, right, bottom]
-        This funciton must take in a score.
+        This function must take in a score.
        """
+        score = kwargs['score']
+        del kwargs['score']
        bbox = super().from_array(arr, **kwargs)
        bbox.__class__ = DetectionBbox
        bbox.score = score
@ -217,4 +217,4 @@ def bboxes_iou(bbox1: DetectionBbox, bbox2: DetectionBbox):
    else:
        iou = 0
    assert iou >= 0
-    return iou
+    return iou
--- a/utils_cv/detection/data.py
+++ b/utils_cv/detection/data.py
@ -21,13 +21,17 @@ class Urls:
        base, "odFridgeObjectsWatermarkTiny.zip"
    )

+    # mask datasets
+    fridge_objects_mask_path = urljoin(base, "odFridgeObjectsMask.zip")
+    fridge_objects_mask_tiny_path = urljoin(base, "odFridgeObjectsMaskTiny.zip")
+
    @classmethod
    def all(cls) -> List[str]:
        return [v for k, v in cls.__dict__.items() if k.endswith("_path")]


 def coco_labels() -> List[str]:
-    """ List of Coco labels with the original idexing.
+    """ List of Coco labels with the original indexing.

    Reference: https://github.com/pytorch/vision/blob/master/docs/source/models.rst

--- a/utils_cv/detection/dataset.py
+++ b/utils_cv/detection/dataset.py
@ -4,9 +4,10 @@
 import os
 import copy
 import math
+import numpy as np
 from pathlib import Path
-from random import randrange
-from typing import List, Tuple, Union
+import random
+from typing import Callable, List, Tuple, Union

 import torch
 from torch.utils.data import Dataset, Subset, DataLoader
@ -14,11 +15,14 @@ from torchvision.transforms import ColorJitter
 import xml.etree.ElementTree as ET
 from PIL import Image

-from .plot import display_bboxes, plot_grid
+from .plot import display_bboxes_mask, plot_grid
 from .bbox import AnnotationBbox
+from .mask import binarise_mask
 from .references.utils import collate_fn
 from .references.transforms import RandomHorizontalFlip, Compose, ToTensor
-from utils_cv.common.gpu import db_num_workers
+from ..common.gpu import db_num_workers
+
+Trans = Callable[[object, dict], Tuple[object, dict]]


 class ColorJitterTransform(object):
@ -41,7 +45,7 @@ class ColorJitterTransform(object):
        return im, target


-def get_transform(train: bool) -> List[object]:
+def get_transform(train: bool) -> Trans:
    """ Gets basic the transformations to apply to images.

    Source:
@ -127,7 +131,7 @@ def parse_pascal_voc_anno(
        assert anno_bbox.is_valid()
        anno_bboxes.append(anno_bbox)

-    return (anno_bboxes, im_path)
+    return anno_bboxes, im_path


 class DetectionDataset:
@ -141,12 +145,14 @@ class DetectionDataset:
        self,
        root: Union[str, Path],
        batch_size: int = 2,
-        train_transforms: object = get_transform(train=True),
-        test_transforms: object = get_transform(train=False),
+        train_transforms: Trans = get_transform(train=True),
+        test_transforms: Trans = get_transform(train=False),
        train_pct: float = 0.5,
        anno_dir: str = "annotations",
        im_dir: str = "images",
-        allow_negatives: bool = False,
+        mask_dir: str = None,
+        seed: int = None,
+        allow_negatives: bool = False
    ):
        """ initialize dataset

@ -162,9 +168,11 @@ class DetectionDataset:
            train_transforms: the transformations to apply to the train set
            test_transforms: the transformations to apply to the test set
            train_pct: the ratio of training to testing data
-            annotation_dir: the name of the annotation subfolder under the root directory
+            anno_dir: the name of the annotation subfolder under the root directory
            im_dir: the name of the image subfolder under the root directory. If set to 'None' then infers image location from annotation .xml files
            allow_negatives: is false (default) then will throw an error if no anntation .xml file can be found for a given image. Otherwise use image as negative, ie assume that the image does not contain any of the objects of interest.
+            mask_dir: the name of the mask subfolder under the root directory if the dataset is used for instance segmentation
+            seed: random seed for splitting dataset to training and testing data
        """

        self.root = Path(root)
@ -172,9 +180,11 @@ class DetectionDataset:
        self.test_transforms = test_transforms
        self.im_dir = im_dir
        self.anno_dir = anno_dir
+        self.mask_dir = mask_dir
        self.batch_size = batch_size
        self.train_pct = train_pct
        self.allow_negatives = allow_negatives
+        self.seed = seed

        # read annotations
        self._read_annos()
@ -187,7 +197,7 @@ class DetectionDataset:
        # create training and validation data loaders
        self.init_data_loaders()

-    def _read_annos(self) -> List[str]:
+    def _read_annos(self) -> None:
        """ Parses all Pascal VOC formatted annotation files to extract all
        possible labels. """

@ -211,6 +221,7 @@ class DetectionDataset:
        self.im_paths = []
        self.anno_paths = []
        self.anno_bboxes = []
+        self.mask_paths = []
        for anno_idx, anno_filename in enumerate(anno_filenames):
            anno_path = self.root / self.anno_dir / str(anno_filename)

@ -239,6 +250,23 @@ class DetectionDataset:
                self.im_paths.append(im_path)
            else:
                self.im_paths.append(im_paths[anno_idx])
+
+            if self.mask_dir:
+                # Assume mask image name matches image name but has .png
+                # extension
+                mask_name = os.path.basename(self.im_paths[-1])
+                mask_name = mask_name[:mask_name.rindex('.')] + ".png"
+                mask_path = self.root / self.mask_dir / mask_name
+                # For mask prediction, a missing mask file raises an error
+                # unless negatives are allowed, then None is stored instead
+                if not mask_path.exists():
+                    if not self.allow_negatives:
+                        raise FileNotFoundError(mask_path)
+                    else:
+                        self.mask_paths.append(None)
+                else:
+                    self.mask_paths.append(mask_path)
+
            self.anno_paths.append(anno_path)
            self.anno_bboxes.append(anno_bboxes)
        assert len(self.im_paths) == len(self.anno_paths)
@ -276,6 +304,8 @@ class DetectionDataset:
            A training and testing dataset in that order
        """
        test_num = math.floor(len(self) * (1 - train_pct))
+        if self.seed:
+            torch.manual_seed(self.seed)
        indices = torch.randperm(len(self)).tolist()

        train = copy.deepcopy(Subset(self, indices[test_num:]))
@ -309,6 +339,7 @@ class DetectionDataset:
        im_paths: List[str],
        anno_bboxes: List[AnnotationBbox],
        target: str = "train",
+        mask_paths: List[str] = None,
    ):
        """ Add new images to either the training or test set.

@ -316,21 +347,28 @@ class DetectionDataset:
            im_paths: path to the images.
            anno_bboxes: ground truth boxes for each image.
            target: specify if images are to be added to the training or test set. Valid options: "train" or "test".
+            mask_paths: paths to the masks, one per image in im_paths.

        Raises:
            Exception if `target` variable is neither 'train' nor 'test'
        """
        assert len(im_paths) == len(anno_bboxes)
-        for im_path, anno_bbox in zip(im_paths, anno_bboxes):
+        for i, (im_path, anno_bbox) in enumerate(zip(im_paths, anno_bboxes)):
            self.im_paths.append(im_path)
            self.anno_bboxes.append(anno_bbox)
+            if mask_paths is not None:
+                self.mask_paths.append(mask_paths[i])
            if target.lower() == "train":
                self.train_ds.dataset.im_paths.append(im_path)
                self.train_ds.dataset.anno_bboxes.append(anno_bbox)
+                if mask_paths is not None:
+                    self.train_ds.dataset.mask_paths.append(mask_paths[i])
                self.train_ds.indices.append(len(self.im_paths) - 1)
            elif target.lower() == "test":
                self.test_ds.dataset.im_paths.append(im_path)
                self.test_ds.dataset.anno_bboxes.append(anno_bbox)
+                if mask_paths is not None:
+                    self.test_ds.dataset.mask_paths.append(mask_paths[i])
                self.test_ds.indices.append(len(self.im_paths) - 1)
            else:
                raise Exception(f"Target {target} unknown.")
@ -338,26 +376,30 @@ class DetectionDataset:
        # Re-initialize the data loaders
        self.init_data_loaders()

-    def show_ims(self, rows: int = 1, cols: int = 3) -> None:
+    def show_ims(self, rows: int = 1, cols: int = 3, seed: int = None) -> None:
        """ Show a set of images.

        Args:
            rows: the number of rows images to display
            cols: cols to display, NOTE: use 3 for best looking grid
+            seed: random seed for selecting images

        Returns None but displays a grid of annotated images.
        """
-        plot_grid(display_bboxes, self._get_random_anno, rows=rows, cols=cols)
+        if seed or self.seed:
+            random.seed(seed or self.seed)
+
+        plot_grid(display_bboxes_mask, self._get_random_anno, rows=rows, cols=cols)

    def show_im_transformations(
        self, idx: int = None, rows: int = 1, cols: int = 3
    ) -> None:
-        """ Show a set of images after transfomrations have been applied.
+        """ Show a set of images after transformations have been applied.

        Args:
            idx: the index to of the image to show the transformations for.
            rows: number of rows to display
-            cols: number of cols to dipslay, NOTE: use 3 for best looing grid
+            cols: number of cols to display, NOTE: use 3 for best looking grid

        Returns None but displays a grid of randomly applied transformations.
        """
@ -371,7 +413,7 @@ class DetectionDataset:
            )
        else:
            if idx is None:
-                idx = randrange(len(self.anno_paths))
+                idx = random.randrange(len(self.anno_paths))

            def plotter(im, ax):
                ax.set_xticks([])
@ -386,15 +428,30 @@ class DetectionDataset:
            print(f"Transformations applied on {self.im_paths[idx]}:")
            [print(transform) for transform in self.transforms.transforms]

-    def _get_random_anno(
-        self
-    ) -> Tuple[List[AnnotationBbox], Union[str, Path]]:
+    def _get_binary_mask(self, idx: int) -> Union[np.ndarray, None]:
+        """ Return binary masks for objects in the mask image. """
+        binary_masks = None
+        if self.mask_paths:
+            if self.mask_paths[idx] is not None:
+                binary_masks = binarise_mask(Image.open(self.mask_paths[idx]))
+            else:
+                # for the tiny bounding box in _read_annos(), make the mask to
+                # be the whole box
+                mask = np.zeros(
+                    Image.open(self.im_paths[idx]).size[::-1],
+                    dtype=np.uint8
+                )
+                binary_masks = binarise_mask(mask)
+
+        return binary_masks
+
+    def _get_random_anno(self) -> Tuple:
        """ Get random annotation and corresponding image

        Returns a list of annotations and the image path
        """
-        idx = randrange(len(self.anno_paths))
-        return self.anno_bboxes[idx], self.im_paths[idx]
+        idx = random.randrange(len(self.im_paths))
+        return self.anno_bboxes[idx], self.im_paths[idx], self._get_binary_mask(idx)

    def __getitem__(self, idx):
        """ Make iterable. """
@ -430,6 +487,11 @@ class DetectionDataset:
            "iscrowd": iscrowd,
        }

+        # get masks
+        binary_masks = self._get_binary_mask(idx)
+        if binary_masks is not None:
+            target["masks"] = torch.as_tensor(binary_masks, dtype=torch.uint8)
+
        # get image
        im = Image.open(im_path).convert("RGB")

@ -437,7 +499,7 @@ class DetectionDataset:
        if self.transforms is not None:
            im, target = self.transforms(im, target)

-        return (im, target)
+        return im, target

    def __len__(self):
-        return len(self.anno_paths)
+        return len(self.im_paths)
--- a/utils_cv/detection/mask.py
+++ b/utils_cv/detection/mask.py
@ -0,0 +1,81 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import numpy as np
+
+from PIL import Image
+from pathlib import Path
+from typing import Tuple, Union
+
+
+def binarise_mask(mask: Union[np.ndarray, str, Path]) -> np.ndarray:
+    """ Split the mask into a set of binary masks.
+
+    Assume the mask is already binary masks of [N, Height, Width], or
+    grayscale mask of [Height, Width] with different values
+    representing different objects, 0 as background.
+    """
+    # get numpy array from image file
+    if isinstance(mask, (str, Path)):
+        mask = np.array(Image.open(mask))
+
+    # convert to numpy array
+    mask = np.asarray(mask)
+
+    # a 3-D mask is taken as already binarised; check it only holds 0/1
+    if mask.ndim == 3:
+        assert all(i in [False, True] for i in np.unique(mask).tolist()), \
+            "'mask' should be grayscale."
+        return mask
+
+    assert mask.ndim == 2, "'mask' should have at least 2 channels."
+    # remove background
+    obj_values = np.unique(mask)[1:]
+    # get the binary masks for each color (instance)
+    binary_masks = mask == obj_values[:, None, None]
+    return binary_masks
+
+
+def colorise_binary_mask(
+    binary_mask: np.ndarray,
+    color: Tuple[int, int, int] = (2, 166, 101),
+) -> np.ndarray:
+    """ Set the color for the instance in the mask. """
+    # create empty RGB channels
+    h = binary_mask.shape[0]
+    w = binary_mask.shape[1]
+    r, g, b = np.zeros([3, h, w]).astype(np.uint8)
+    # set corresponding color for each channel
+    r[binary_mask], g[binary_mask], b[binary_mask] = color
+    # merge RGB channels
+    colored_mask = np.dstack([r, g, b])
+    return colored_mask
+
+
+def transparentise_mask(
+    colored_mask: np.ndarray,
+    alpha: float = 0.5,
+) -> np.ndarray:
+    """ Return a mask with fully transparent background and alpha-transparent
+    instances.
+
+    Assume channel is the third dimension of mask, and no alpha channel.
+    """
+    assert colored_mask.shape[2] == 3, \
+        "'colored_mask' should be of 3-channels RGB."
+    # convert (0, 0, 0) to (0, 0, 0, 0) and
+    # all other (x, y, z) to (x, y, z, alpha*255)
+    binary_mask = (colored_mask != 0).any(axis=2)
+    alpha_mask = (alpha * 255 * binary_mask).astype(np.uint8)
+    return np.dstack([colored_mask, alpha_mask])
+
+
+def merge_binary_masks(binary_masks: np.ndarray) -> np.ndarray:
+    """ Merge binary masks into one grayscale mask.
+
+    Assume binary_masks is of [N, Height, Width].
+    """
+    obj_values = np.arange(len(binary_masks)) + 1
+    # label mask from 1 to number of instances
+    labeled_masks = binary_masks * obj_values[:, None, None]
+    return np.max(labeled_masks, axis=0).astype(np.uint8)
--- a/utils_cv/detection/model.py
+++ b/utils_cv/detection/model.py
@ -4,17 +4,30 @@
 import os
 import itertools
 import json
+from typing import (
+    Callable,
+    List,
+    Tuple,
+    Union,
+    Generator,
+    Optional,
+    Dict,
+)
+
 from pathlib import Path
 import shutil
-from typing import List, Tuple, Union, Generator, Optional

-import numpy as np
 from PIL import Image
+import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
-from torchvision.models.detection import fasterrcnn_resnet50_fpn
+from torchvision.models.detection import (
+    fasterrcnn_resnet50_fpn,
+    maskrcnn_resnet50_fpn,
+)
 from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
+from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
 from torch.utils.data import Dataset, DataLoader, Subset
 import matplotlib.pyplot as plt

@ -24,25 +37,25 @@ from .bbox import bboxes_iou, DetectionBbox
 from ..common.gpu import torch_device


-def _get_det_bboxes(
-    pred: List[dict], labels: List[str], im_path: str = None
-) -> List[DetectionBbox]:
-    """ Gets the bounding boxes and labels from the prediction object
+def _get_det_bboxes_and_mask(
+    pred: Dict[str, np.ndarray],
+    labels: List[str],
+    im_path: Union[str, Path] = None,
+) -> Dict:
+    """ Gets the bounding boxes and masks from the prediction object.

    Args:
        pred: the output of passing in an image to torchvision's FasterRCNN
-        model
-        labels: list of labels
+            or MaskRCNN model, detached in the form of numpy array
+        labels: list of labels without "__background__".
        im_path: the image path of the preds

    Return:
-        a list of DetectionBboxes
+        a dict of DetectionBboxes and masks
    """
-    pred_labels = pred[0]["labels"].detach().cpu().numpy().tolist()
-    pred_boxes = (
-        pred[0]["boxes"].detach().cpu().numpy().astype(np.int32).tolist()
-    )
-    pred_scores = pred[0]["scores"].detach().cpu().numpy().tolist()
+    pred_labels = pred['labels'].tolist()
+    pred_boxes = pred['boxes'].tolist()
+    pred_scores = pred['scores'].tolist()

    det_bboxes = []
    for label, box, score in zip(pred_labels, pred_boxes, pred_scores):
@ -56,22 +69,37 @@ def _get_det_bboxes(
        )
        det_bboxes.append(det_bbox)

-    return det_bboxes
+    res = {"det_bboxes": det_bboxes}
+
+    if "masks" in pred:
+        res["masks"] = pred["masks"].squeeze(1)
+    return res


 def _apply_threshold(
-    det_bboxes: List[DetectionBbox], threshold: Optional[float] = 0.5
-) -> List[DetectionBbox]:
-    """ Filters the list of DetectionBboxes by score threshold. """
-    return (
-        [det_bbox for det_bbox in det_bboxes if det_bbox.score > threshold]
-        if threshold is not None
-        else det_bboxes
-    )
+    pred: Dict[str, np.ndarray],
+    threshold: Optional[float] = 0.5,
+) -> Dict:
+    """ Return prediction results that are above the threshold if any.
+
+    Args:
+        pred: the output of passing in an image to torchvision's FasterRCNN
+            or MaskRCNN model, detached in the form of numpy array
+        threshold: score threshold for a positive detection. Note: set
+            threshold to None to omit a threshold
+    """
+    # apply score threshold
+    if threshold:
+        selected = pred['scores'] > threshold
+        pred = {k: v[selected] for k, v in pred.items()}
+    # apply mask threshold
+    if "masks" in pred:
+        pred["masks"] = pred["masks"] > 0.5
+    return pred


-def get_pretrained_fasterrcnn(
-    num_classes: int,
+def _get_pretrained_rcnn(
+    model_func: Callable[..., nn.Module],
    # transform parameters
    min_size: int = 800,
    max_size: int = 1333,
@ -89,7 +117,8 @@ def get_pretrained_fasterrcnn(
    """ Gets a pretrained FasterRCNN model

    Args:
-        num_classes: number of output classes of the model (including the background).
+        model_func: function that builds a pretrained R-CNN model, such as
+            fasterrcnn_resnet50_fpn or maskrcnn_resnet50_fpn
        min_size: minimum size of the image to be rescaled before feeding it to the backbone
        max_size: maximum size of the image to be rescaled before feeding it to the backbone
        rpn_pre_nms_top_n_train: number of proposals to keep before applying NMS during training
@ -97,21 +126,11 @@ def get_pretrained_fasterrcnn(
        rpn_post_nms_top_n_train: number of proposals to keep after applying NMS during training
        rpn_post_nms_top_n_test: number of proposals to keep after applying NMS during testing
        rpn_nms_thresh: NMS threshold used for postprocessing the RPN proposals
-        box_score_thresh: during inference, only return proposals with a classification score greater than box_score_thresh
-        box_nms_thresh: NMS threshold for the prediction head. Used during inference
-        box_detections_per_img: maximum number of detections per image, for all classes

    Returns
-        The model to fine-tine/inference with
-
-    For a list of all parameters see:
-        https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py
-
+        The pre-trained model
    """
-    # TODO - reconsider that num_classes includes background. This doesn't feel intuitive.
-
-    # load a model pre-trained pre-trained on COCO
-    model = fasterrcnn_resnet50_fpn(
+    model = model_func(
        pretrained=True,
        min_size=min_size,
        max_size=max_size,
@ -124,7 +143,11 @@ def get_pretrained_fasterrcnn(
        box_nms_thresh=box_nms_thresh,
        box_detections_per_img=box_detections_per_img,
    )
+    return model

+
+def _tune_box_predictor(model: nn.Module, num_classes: int) -> nn.Module:
+    """ Tune box predictor in the model. """
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

@ -134,9 +157,84 @@ def get_pretrained_fasterrcnn(
    return model


+def get_pretrained_fasterrcnn(
+    num_classes: int = None,
+    **kwargs,
+) -> nn.Module:
+    """ Gets a pretrained FasterRCNN model
+
+    Args:
+        num_classes: number of output classes of the model (including the
+            background).  If None, keeps the pre-trained 91-class COCO layers.
+
+    Returns
+        The model to fine-tune or run inference with
+
+    For a list of all parameters see:
+        https://github.com/pytorch/vision/blob/master/torchvision/models/detection/faster_rcnn.py
+    """
+    # TODO - reconsider that num_classes includes background. This doesn't feel
+    #     intuitive.
+
+    # load a model pre-trained on COCO
+    model = _get_pretrained_rcnn(
+        fasterrcnn_resnet50_fpn,
+        **kwargs,
+    )
+
+    # if num_classes is specified, then create new final bounding box
+    # prediction layers, otherwise use pre-trained layers
+    if num_classes:
+        model = _tune_box_predictor(model, num_classes)
+
+    return model
+
+
+def get_pretrained_maskrcnn(
+    num_classes: int = None,
+    **kwargs,
+) -> nn.Module:
+    """ Gets a pretrained Mask R-CNN model
+
+    Args:
+        num_classes: number of output classes of the model (including the
+            background).  If None, keeps the pre-trained 91-class COCO layers.
+
+    Returns
+        The model to fine-tune or run inference with
+
+    For a list of all parameters see:
+        https://github.com/pytorch/vision/blob/master/torchvision/models/detection/mask_rcnn.py
+
+    """
+    # load a model pre-trained on COCO
+    model = _get_pretrained_rcnn(
+        maskrcnn_resnet50_fpn,
+        **kwargs,
+    )
+
+    # if num_classes is specified, then create new final bounding box
+    # and mask prediction layers, otherwise use pre-trained layers
+    if num_classes:
+        model = _tune_box_predictor(model, num_classes)
+
+        # tune mask predictor in the model.
+        # get the number of input features of mask predictor from the pretrained
+        # model
+        in_features = model.roi_heads.mask_predictor.conv5_mask.in_channels
+        # replace the mask predictor with a new one
+        model.roi_heads.mask_predictor = MaskRCNNPredictor(
+            in_features,
+            256,
+            num_classes
+        )
+
+    return model
+
+
 def _calculate_ap(
    e: CocoEvaluator, iou_threshold_idx: Union[int, slice] = slice(0, None)
-) -> float:
+) -> Dict[str, float]:
    """ Calculate the Average Precision (AP) by averaging all iou
    thresholds across all labels.

@ -161,8 +259,11 @@ def _calculate_ap(
        0,
        2,
    )
-    coco_eval = e.coco_eval["bbox"].eval["precision"]
-    return np.mean(np.mean(coco_eval[precision_settings]))
+    ap = {
+        k: np.mean(np.mean(v.eval["precision"][precision_settings]))
+        for k, v in e.coco_eval.items()
+    }
+    return ap


 def _im_eval_detections(
@ -308,6 +409,8 @@ class DetectionLearner:
        dataset: Dataset = None,
        model: nn.Module = None,
        im_size: int = None,
+        device: torch.device = None,
+        labels: List[str] = None,
    ):
        """ Initialize leaner object.

@ -330,15 +433,26 @@ class DetectionLearner:
        if im_size is None:
            im_size = 500

-        self.device = torch_device()
+        self.device = device
+        if self.device is None:
+            self.device = torch_device()
+
        self.model = model
        self.dataset = dataset
        self.im_size = im_size

+        # labels must exclude '__background__'; dataset.labels takes precedence
+        if dataset and "labels" in dataset.__dict__:
+            self.labels = dataset.labels
+        elif labels is not None:
+            self.labels = labels
+        else:
+            raise ValueError("No labels provided in dataset.labels or labels")
+
        # setup model, default to fasterrcnn
        if self.model is None:
            self.model = get_pretrained_fasterrcnn(
-                len(self.dataset.labels) + 1,
+                len(self.labels) + 1,
                min_size=self.im_size,
                max_size=self.im_size,
            )
@ -354,12 +468,6 @@ class DetectionLearner:
            )
        )

-    def add_labels(self, labels: List[str]):
-        """ Add labels to this detector. This class does not expect a label
-        '__background__' in first element of the label list. Make sure it is
-        omitted before adding it. """
-        self.labels = labels
-
    def fit(
        self,
        epochs: int,
@ -372,6 +480,9 @@ class DetectionLearner:
    ) -> None:
        """ The main training loop. """

+        if not self.dataset:
+            raise Exception("No dataset provided")
+
        # reduce learning rate every step_size epochs by a factor of gamma (by default) 0.1.
        if step_size is None:
            step_size = int(np.round(epochs / 1.5))
@ -421,22 +532,34 @@ class DetectionLearner:
        """ Plot training loss from calling `fit` and average precision on the
        test set. """
        fig = plt.figure(figsize=figsize)
-        ax1 = fig.add_subplot(111)
+        ap = {k: [dic[k] for dic in self.ap] for k in self.ap[0]}

-        ax1.set_xlim([0, self.epochs - 1])
-        ax1.set_xticks(range(0, self.epochs))
-        ax1.set_title("Loss and Average Precision over epochs")
-        ax1.set_xlabel("epochs")
-        ax1.set_ylabel("loss", color="g")
-        ax1.plot(self.losses, "g-")
+        for i, (k, v) in enumerate(ap.items()):

-        ax2 = ax1.twinx()
-        ax2.set_ylabel("average precision", color="b")
-        ax2.plot(self.ap, "b-")
+            ax1 = fig.add_subplot(1, len(ap), i+1)
+
+            ax1.set_xlim([0, self.epochs - 1])
+            ax1.set_xticks(range(0, self.epochs))
+            ax1.set_xlabel("epochs")
+            ax1.set_ylabel("loss", color="g")
+            ax1.plot(self.losses, "g-")
+
+            ax2 = ax1.twinx()
+            ax2.set_ylabel(f"AP for {k}", color="b")
+            ax2.plot(v, "b-")
+
+        fig.suptitle("Loss and Average Precision (AP) over Epochs")

    def evaluate(self, dl: DataLoader = None) -> CocoEvaluator:
-        """ eval code on validation/test set and saves the evaluation results in self.results. """
+        """ Evaluate the model on the validation/test set and save the
+        evaluation results in self.results.
+
+        Raises:
+            Exception: if both `dl` and `self.dataset` are None.
+        """
        if dl is None:
+            if not self.dataset:
+                raise Exception("No dataset provided for evaluation")
            dl = self.dataset.test_dl
        self.results = evaluate(self.model, dl, device=self.device)
        return self.results
@ -445,94 +568,97 @@ class DetectionLearner:
        self,
        im_or_path: Union[np.ndarray, Union[str, Path]],
        threshold: Optional[int] = 0.5,
-    ) -> List[DetectionBbox]:
+    ) -> Dict:
        """ Performs inferencing on an image path or image.

        Args:
-            im_or_path: the image array which you can get from `Image.open(path)` OR a
-            image path
+            im_or_path: the image array which you can get from
+                `Image.open(path)` or an image path
            threshold: the threshold to use to calculate whether the object was
-            detected. Note: can be set to None to return all detection bounding
-            boxes.
-
-        Raises:
-            TypeError is the im object is a path or str to the image instead of
-            an nd.array
+                detected. Note: can be set to None to return all detection
+                bounding boxes.

        Return a list of DetectionBbox
        """
-        im = (
-            Image.open(im_or_path)
-            if isinstance(im_or_path, (str, Path))
-            else im_or_path
-        )
+        if isinstance(im_or_path, (str, Path)):
+            im = Image.open(im_or_path)
+            im_path = im_or_path
+        else:
+            im = im_or_path
+            im_path = None

+        # convert the image to the format required by the model
        transform = transforms.Compose([transforms.ToTensor()])
-        im = transform(im).cuda()
+        im = transform(im)
+        if self.device:
+            im = im.to(self.device)
+
        model = self.model.eval()  # eval mode
        with torch.no_grad():
-            pred = model([im])
+            pred = model([im])[0]

-        labels = self.dataset.labels if self.dataset else self.labels
-        det_bboxes = _get_det_bboxes(pred, labels=labels)
-
-        # limit to threshold if threshold is set
-        return _apply_threshold(det_bboxes, threshold)
+        # detach prediction results to cpu
+        pred = {k: v.detach().cpu().numpy() for k, v in pred.items()}
+        return _get_det_bboxes_and_mask(
+            _apply_threshold(pred, threshold=threshold),
+            self.labels,
+            im_path
+        )

    def predict_dl(
-        self, dl: DataLoader, threshold: Optional[float] = 0.5
+        self,
+        dl: DataLoader,
+        threshold: Optional[float] = 0.5,
    ) -> List[DetectionBbox]:
        """ Predict all images in a dataloader object.

        Args:
            dl: the dataloader to predict on
            threshold: iou threshold for a positive detection. Note: set
-            threshold to None to omit a threshold
+                threshold to None to omit a threshold

-        Returns a list of DetectionBbox
+        Returns a flat list with one result dict per image in the dataloader
        """
        pred_generator = self.predict_batch(dl, threshold=threshold)
-        det_bboxes = [pred for preds in pred_generator for pred in preds]
-        return det_bboxes
+        return [pred for preds in pred_generator for pred in preds]

    def predict_batch(
-        self, dl: DataLoader, threshold: Optional[float] = 0.5
+        self,
+        dl: DataLoader,
+        threshold: Optional[float] = 0.5,
    ) -> Generator[List[DetectionBbox], None, None]:
        """ Batch predict

        Args
            dl: A DataLoader to load batches of images from
            threshold: iou threshold for a positive detection. Note: set
-            threshold to None to omit a threshold
+                threshold to None to omit a threshold

        Returns an iterator that yields a batch of detection bboxes for each
        image that is scored.
        """

-        labels = self.dataset.labels
        model = self.model.eval()

        for i, batch in enumerate(dl):
            ims, infos = batch
-            ims = [im.cuda() for im in ims]
+            ims = [im.to(self.device) for im in ims]
            with torch.no_grad():
-                raw_dets = model(list(ims))
+                raw_dets = model(ims)

-            det_bbox_batch = []
-            for raw_det, info in zip(raw_dets, infos):
-
-                im_idx = int(info["image_id"].numpy())
-                im_path = dl.dataset.dataset.im_paths[im_idx]
-
-                det_bboxes = _get_det_bboxes(
-                    [raw_det], labels=labels, im_path=im_path
+            results = []
+            for det, info in zip(raw_dets, infos):
+                im_id = int(info["image_id"].item())
+                # detach prediction results to cpu
+                pred = {k: v.detach().cpu().numpy() for k, v in det.items()}
+                bboxes_masks = _get_det_bboxes_and_mask(
+                    _apply_threshold(pred, threshold=threshold),
+                    self.labels,
+                    dl.dataset.dataset.im_paths[im_id]
                )
+                results.append({"idx": im_id, **bboxes_masks})

-                det_bboxes = _apply_threshold(det_bboxes, threshold)
-                det_bbox_batch.append(
-                    {"idx": im_idx, "det_bboxes": det_bboxes}
-                )
-            yield det_bbox_batch
+            yield results

    def save(
        self, name: str, path: str = None, overwrite: bool = True
@ -551,8 +677,8 @@ class DetectionLearner:
        Args:
            name: the name you wish to save your model under
            path: optional path to save your model to, will use `data_path`
-            otherwise
-            overwrite: overwite existing models
+                otherwise
+            overwrite: overwrite existing models

        Raise:
            Exception if model file already exists but overwrite is set to
@ -690,6 +816,6 @@ class DetectionLearner:
        model = get_pretrained_fasterrcnn(
            len(labels) + 1, min_size=im_size, max_size=im_size
        )
-        detection_learner = DetectionLearner(model=model)
+        detection_learner = DetectionLearner(model=model, labels=labels)
        detection_learner.load(name=name, path=path)
        return detection_learner
--- a/utils_cv/detection/plot.py
+++ b/utils_cv/detection/plot.py
@ -6,7 +6,8 @@ Helper module for visualizations
 """
 import os
 from pathlib import Path
-from typing import List, Union, Tuple, Callable, Any, Iterator
+from typing import List, Union, Tuple, Callable, Any, Iterator, Optional
+from pathlib import Path

 import numpy as np
 import PIL
@ -18,6 +19,7 @@ from .bbox import _Bbox, AnnotationBbox, DetectionBbox
 from .model import ims_eval_detections
 from .references.coco_eval import CocoEvaluator
 from ..common.misc import get_font
+from .mask import binarise_mask, colorise_binary_mask, transparentise_mask


 class PlotSettings:
@ -29,13 +31,15 @@ class PlotSettings:
        rect_color: Tuple[int, int, int] = (255, 0, 0),
        text_size: int = 25,
        text_color: Tuple[int, int, int] = (255, 255, 255),
+        mask_color: Tuple[int, int, int] = (2, 166, 101),
+        mask_alpha: float = 0.5,
    ):
-        self.rect_th, self.rect_color, self.text_size, self.text_color = (
-            rect_th,
-            rect_color,
-            text_size,
-            text_color,
-        )
+        self.rect_th = rect_th
+        self.rect_color = rect_color
+        self.text_size = text_size
+        self.text_color = text_color
+        self.mask_color = mask_color
+        self.mask_alpha = mask_alpha


 def plot_boxes(
@ -87,32 +91,74 @@ def plot_boxes(
    return im


-def display_bboxes(
+def plot_mask(
+    im: Union[str, Path, PIL.Image.Image],
+    mask: Union[str, Path, np.ndarray],
+    plot_settings: PlotSettings = PlotSettings(),
+) -> PIL.Image.Image:
+    """ Put mask onto image.
+
+    Assumes the mask is either a stack of binary masks of shape
+    [N, Height, Width], or a grayscale mask of shape [Height, Width] in which
+    each distinct non-zero value marks a different object and 0 is background.
+    """
+    if isinstance(im, (str, Path)):
+        im = Image.open(im)
+
+    # convert to RGBA for transparentising
+    im = im.convert('RGBA')
+    # colorise masks
+    binary_masks = binarise_mask(mask)
+    colored_masks = [
+        colorise_binary_mask(bmask, plot_settings.mask_color) for bmask in
+        binary_masks
+    ]
+    # merge masks into img one by one
+    for cmask in colored_masks:
+        tmask = Image.fromarray(
+            transparentise_mask(cmask, plot_settings.mask_alpha)
+        )
+        im = Image.alpha_composite(im, tmask)
+
+    return im
+
+
+def display_bboxes_mask(
    bboxes: List[_Bbox],
    im_path: Union[Path, str],
-    ax: Union[None, plt.axes] = None,
+    mask_path: Union[Path, str] = None,
+    ax: Optional[plt.axes] = None,
    plot_settings: PlotSettings = PlotSettings(),
    figsize: Tuple[int, int] = (12, 12),
 ) -> None:
-    """ Draw image with bounding boxes.
+    """ Draw image with bounding boxes and mask.

    Args:
        bboxes: A list of _Bbox, could be DetectionBbox or AnnotationBbox
        im_path: the location of image path to draw
+        mask_path: the location of mask path to draw
        ax: an optional ax to specify where you wish the figure to be drawn on
+        plot_settings: plotting parameters
+        figsize: figure size

-    Returns nothing, but plots the image with bounding boxes and labels.
+    Returns nothing, but plots the image with bounding boxes, labels and masks
+    if any.
    """
    # Read image
-    im = Image.open(str(im_path))
+    im = Image.open(im_path)

    # set an image title
    title = os.path.basename(im_path)

-    # plot boxes on im
-    im = plot_boxes(im, bboxes, title=title, plot_settings=plot_settings)
+    if mask_path is not None:
+        # plot masks on im
+        im = plot_mask(im_path, mask_path)

-    # display the output image
+    if bboxes is not None:
+        # plot boxes on im
+        im = plot_boxes(im, bboxes, title=title, plot_settings=plot_settings)
+
+    # display the image
    if ax is not None:
        ax.set_xticks([])
        ax.set_yticks([])
@ -253,7 +299,7 @@ def _get_precision_recall_settings(

    Args:
        iou_thrs: the IoU thresholds to return
-        rec_thrs: the recall thrsholds to return
+        rec_thrs: the recall thresholds to return
        cat_ids: label ids to use for evaluation
        area_rng: object area ranges for evaluation
        max_dets: thresholds on max detections per image
@ -261,12 +307,16 @@ def _get_precision_recall_settings(
    Return the settings as a tuple to be passed into:
    `coco_eval.eval['precision']`
    """
-    return (iou_thrs, rec_thrs, cat_ids, area_rng, max_dets)
+    return iou_thrs, rec_thrs, cat_ids, area_rng, max_dets


-def _plot_pr_curve_iou_range(ax: plt.axes, coco_eval: CocoEvaluator) -> None:
+def _plot_pr_curve_iou_range(
+    ax: plt.axes,
+    coco_eval: CocoEvaluator,
+    iou_type: Optional[str] = None,
+) -> None:
    """ Plots the PR curve over varying iou thresholds averaging over [K]
-    categoyies. """
+    categories. """
    x = np.arange(0.0, 1.01, 0.01)
    iou_thrs_idx = range(0, 10)
    iou_thrs = np.linspace(
@ -278,7 +328,7 @@ def _plot_pr_curve_iou_range(ax: plt.axes, coco_eval: CocoEvaluator) -> None:
    cmap = plt.cm.get_cmap("hsv", len(iou_thrs))

    ax = _setup_pr_axes(
-        ax, "Precision-Recall Curve @ different IoU Thresholds"
+        ax, f"Precision-Recall Curve ({iou_type}) @ different IoU Thresholds"
    )
    for i, c in zip(iou_thrs_idx, iou_thrs):
        arr = coco_eval.eval["precision"][_get_precision_recall_settings(i)]
@ -288,11 +338,15 @@ def _plot_pr_curve_iou_range(ax: plt.axes, coco_eval: CocoEvaluator) -> None:
    ax.legend(loc="lower left")


-def _plot_pr_curve_iou_mean(ax: plt.axes, coco_eval: CocoEvaluator) -> None:
+def _plot_pr_curve_iou_mean(
+    ax: plt.axes,
+    coco_eval: CocoEvaluator,
+    iou_type: Optional[str] = None,
+) -> None:
    """ Plots the PR curve, averaging over iou thresholds and [K] labels. """
    x = np.arange(0.0, 1.01, 0.01)
    ax = _setup_pr_axes(
-        ax, "Precision-Recall Curve - Mean over IoU Thresholds"
+        ax, f"Precision-Recall Curve ({iou_type}) - Mean over IoU Thresholds"
    )
    avg_arr = np.mean(  # mean over K labels
        np.mean(  # mean over iou thresholds
@ -333,12 +387,22 @@ def plot_pr_curves(
        raise Exception(
            "`accumulate()` has not been called on the passed in coco_eval object."
        )
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
-    _plot_pr_curve_iou_range(ax1, coco_eval)
-    _plot_pr_curve_iou_mean(ax2, coco_eval)
+
+    nrows = len(evaluator.coco_eval)
+    fig, axes = plt.subplots(nrows, 2, figsize=figsize)
+    for i, (k, coco_eval) in enumerate(evaluator.coco_eval.items()):
+        _plot_pr_curve_iou_range(
+            axes[i, 0] if nrows > 1 else axes[0], coco_eval, k
+        )
+        _plot_pr_curve_iou_mean(
+            axes[i, 1] if nrows > 1 else axes[1], coco_eval, k
+        )
+
    plt.show()


+
+
 # ===== Correct/missing detection counts curve =====


--- a/utils_cv/detection/references/coco_eval.py
+++ b/utils_cv/detection/references/coco_eval.py
@ -114,7 +114,9 @@ class CocoEvaluator(object):

            rles = [
                mask_util.encode(
-                    np.array(mask[0, :, :, np.newaxis], order="F")
+                    # Cast the mask to uint8 before RLE encoding; see:
+                    #     https://github.com/pytorch/vision/issues/1355#issuecomment-544951911
+                    np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F")
                )[0]
                for mask in masks
            ]