Add evaluations for IC and OD (#1)
Parent: 11e804704e
Commit: 2a41819dbb
@@ -0,0 +1,131 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

.idea/*
34 LICENSE
@@ -1,21 +1,21 @@
MIT License

-Copyright (c) Microsoft Corporation.
+Copyright (c) 2020 Microsoft

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE
+SOFTWARE.
43 README.md
@@ -1,33 +1,14 @@
-# Project
+# Introduction
+This repo contains evaluation metric code used in Microsoft Cognitive Services Computer Vision for tasks such as classification and object detection.

-> This repo has been populated by an initial template to help get you started. Please
-> make sure to update the content to build a great experience for community-building.
+# Functionalities
+This repo currently offers evaluation metrics for two vision tasks:

-As the maintainer of this project, please make a few updates:
-
-- Improving this README.MD file to provide a great experience
-- Updating SUPPORT.MD with content about this project's support experience
-- Understanding the security reporting process in SECURITY.MD
-- Remove this section from the README
-
-## Contributing
-
-This project welcomes contributions and suggestions. Most contributions require you to agree to a
-Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
-the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need to provide
-a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
-provided by the bot. You will only need to do this once across all repos using our CLA.
-
-This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
-contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
-
-## Trademarks
-
-This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
-trademarks or logos is subject to and must follow
-[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
-Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
-Any use of third-party trademarks or logos are subject to those third-party's policies.
+- Image classification:
+  - `evaluators.TopKAccuracyEvaluator`: computes the top-k accuracy, i.e., the fraction of samples whose true label is among the k predictions with the highest confidence.
+  - `evaluators.AveragePrecisionEvaluator`: computes the average precision, i.e., precision averaged across different confidence thresholds.
+  - `evaluators.ThresholdAccuracyEvaluator`: computes the threshold-based accuracy, i.e., the accuracy of the predictions with confidence over a certain threshold.
+  - `evaluators.EceLossEvaluator`: computes the [ECE loss](https://arxiv.org/pdf/1706.04599.pdf), i.e., the expected calibration error, given the model confidence and true labels for a set of data points.
+- Object detection:
+  - `evaluators.MeanAveragePrecisionEvaluatorForSingleIOU`, `evaluators.MeanAveragePrecisionEvaluatorForMultipleIOUs`: computes the mean average precision (mAP), i.e., average precision (AP) averaged across classes, under a single IoU threshold or multiple [IoU](https://en.wikipedia.org/wiki/Jaccard_index) thresholds.
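The functionality list above names the evaluator classes but gives no usage example. The following minimal sketch is an editorial illustration, not part of the commit: the prediction scores and labels are made up, while the `add_predictions`/`get_report` calls follow the evaluator code and tests later in this diff.

```python
import numpy as np
from vision_evaluation.evaluators import TopKAccuracyEvaluator, EceLossEvaluator

# Illustrative scores for 4 samples over 2 classes, plus their true labels.
predictions = np.array([[0.9, 0.1],
                        [0.2, 0.8],
                        [0.6, 0.4],
                        [0.3, 0.7]])
targets = np.array([0, 1, 1, 0])

# Top-1 accuracy: samples 0 and 1 are predicted correctly, samples 2 and 3 are not.
top1 = TopKAccuracyEvaluator(1)
top1.add_predictions(predictions, targets)
print(top1.get_report())  # {'top1_accuracy': 0.5}

# Expected calibration error over the same batch.
ece = EceLossEvaluator()
ece.add_predictions(predictions, targets)
print(ece.get_report())  # {'calibration_ece': ...}
```

The object detection evaluators follow the same two-call pattern; a corresponding sketch appears after the evaluator source at the end of this page.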
41 SECURITY.md
@@ -1,41 +0,0 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->
25 SUPPORT.md
@@ -1,25 +0,0 @@
# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.

*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*

# Support

## How to file issues and get help

This project uses GitHub Issues to track bugs and feature requests. Please search the existing
issues before filing new issues to avoid duplicates. For new issues, file your bug or
feature request as a new Issue.

For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.

## Microsoft Support Policy

Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
@@ -0,0 +1,34 @@
import setuptools
from os import path

VERSION = '0.1.0'

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), 'r') as f:
    long_description = f.read()

setuptools.setup(name='vision-evaluation',
                 author='Ping Jin, Shohei Ono, I-Ting Fang',
                 description="Evaluation code for vision tasks.",
                 long_description=long_description,
                 long_description_content_type="text/markdown",
                 url='https://github.com/pj-ms/vision-evaluation',
                 version=VERSION,
                 license='MIT',
                 python_requires='>=3.6',
                 packages=setuptools.find_packages(),
                 keywords='vision metric evaluation classification detection',
                 classifiers=[
                     'Development Status :: 4 - Beta',
                     'Intended Audience :: Developers',
                     'License :: OSI Approved :: MIT License',
                     'Programming Language :: Python :: 3.6',
                     'Programming Language :: Python :: 3.7',
                     'Programming Language :: Python :: 3.8',
                 ],
                 install_requires=[
                     'numpy~=1.18.3',
                     'sklearn',
                 ])
@@ -0,0 +1,186 @@
import unittest
import numpy as np
from vision_evaluation.evaluators import AveragePrecisionEvaluator, TopKAccuracyEvaluator, ThresholdAccuracyEvaluator, MeanAveragePrecisionEvaluatorForSingleIOU, EceLossEvaluator


class TestClassificationEvaluator(unittest.TestCase):
    TARGETS = np.array([1, 0, 0, 0, 1, 1, 0, 0, 0, 1])
    PREDICTIONS = np.array([[1, 0],
                            [0, 1],
                            [0.5, 0.5],
                            [0.1, 0.9],
                            [0.44, 0.56],
                            [0.09, 0.91],
                            [0.91, 0.09],
                            [0.37, 0.63],
                            [0.34, 0.66],
                            [0.89, 0.11]])

    def test_top_k_accuracy_evaluator(self):
        top1_acc_evaluator = TopKAccuracyEvaluator(1)
        top1_acc_evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)

        top5_acc_evaluator = TopKAccuracyEvaluator(5)
        top5_acc_evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)

        self.assertEqual(top1_acc_evaluator.get_report(average='micro')["top1_accuracy"], 0.4)
        self.assertEqual(top5_acc_evaluator.get_report(average='micro')["top5_accuracy"], 1.0)

        self.assertEqual(top1_acc_evaluator.get_report(average='macro')["top1_accuracy"], 0.4)
        self.assertEqual(top5_acc_evaluator.get_report(average='macro')["top5_accuracy"], 1.0)

    def test_average_precision_evaluator(self):
        evaluator = AveragePrecisionEvaluator()
        evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)
        self.assertEqual(evaluator.get_report(average='micro')["average_precision"], 0.4476823176823177)
        self.assertEqual(evaluator.get_report(average='macro')["average_precision"], 0.47574404761904765)

    def test_ece_loss_evaluator(self):
        evaluator = EceLossEvaluator()
        evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)
        self.assertEqual(0.584, evaluator.get_report()["calibration_ece"])

    def test_threshold_accuracy_evaluator(self):
        thresh03_evaluator = ThresholdAccuracyEvaluator(0.3)
        thresh03_evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)
        self.assertEqual(0.4, thresh03_evaluator.get_report()["accuracy_0.3"])

        thresh05_evaluator = ThresholdAccuracyEvaluator(0.5)
        thresh05_evaluator.add_predictions(self.PREDICTIONS, self.TARGETS)
        self.assertEqual(0.3, thresh05_evaluator.get_report()["accuracy_0.5"])


class TestMeanAveragePrecisionEvaluatorForSingleIOU(unittest.TestCase):
    def test_perfect_one_image(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0, 0, 1, 1],
                        [1, 1.0, 0.5, 0.5, 1, 1],
                        [2, 1.0, 0.1, 0.1, 0.5, 0.5]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1],
                    [2, 0.1, 0.1, 0.5, 0.5]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 1.0)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_wrong_one_image(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0, 0, 1, 1],
                        [0, 1.0, 0.5, 0.5, 1, 1],
                        [1, 1.0, 0.5, 0.5, 1, 1]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 0.75)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_perfect_two_images(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0, 0, 1, 1],
                        [1, 1.0, 0.5, 0.5, 1, 1]],
                       [[2, 1.0, 0.1, 0.1, 0.5, 0.5]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]],
                   [[2, 0.1, 0.1, 0.5, 0.5]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 1.0)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_two_batches(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0, 0, 1, 1],
                        [1, 1.0, 0.5, 0.5, 1, 1]],
                       [[2, 1.0, 0.1, 0.1, 0.5, 0.5]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]],
                   [[2, 0.1, 0.1, 0.5, 0.5]]]

        evaluator.add_predictions(predictions, targets)

        predictions = [[[0, 1.0, 0.9, 0.9, 1, 1],  # Wrong
                        [1, 1.0, 0.5, 0.5, 1, 1]],
                       [[2, 1.0, 0.1, 0.1, 0.5, 0.5]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]],
                   [[2, 0.1, 0.1, 0.5, 0.5]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 0.75)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_iou_threshold(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0.5, 0.5, 1, 1],  # IOU 0.25
                        [1, 1.0, 0.5, 0.5, 1, 1]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 0.5)
        self.assertTrue(isinstance(report["mAP_50"], float))

        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.2)

        predictions = [[[0, 1.0, 0.5, 0.5, 1, 1],  # IOU 0.25
                        [1, 1.0, 0.5, 0.5, 1, 1]]]

        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_20"], 1.0)
        self.assertTrue(isinstance(report["mAP_20"], float))

    def test_no_predictions(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[]]
        targets = [[[0, 0, 0, 1, 1],
                    [1, 0.5, 0.5, 1, 1],
                    [2, 0.1, 0.1, 0.5, 0.5]]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 0.0)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_no_targets(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)

        predictions = [[[0, 1.0, 0, 0, 1, 1],
                        [1, 1.0, 0.5, 0.5, 1, 1],
                        [2, 1.0, 0.1, 0.1, 0.5, 0.5]]]

        targets = [[]]

        evaluator.add_predictions(predictions, targets)
        report = evaluator.get_report()
        self.assertEqual(report["mAP_50"], 0.0)
        self.assertTrue(isinstance(report["mAP_50"], float))

    def test_empty_result(self):
        evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)
        report = evaluator.get_report()
        self.assertIn('mAP_50', report)
        self.assertEqual(report["mAP_50"], 0.0)
        self.assertTrue(isinstance(report["mAP_50"], float))
@@ -0,0 +1,3 @@
[flake8]
exclude = .git,build,dist
max-line-length = 200
@@ -0,0 +1,3 @@
from .evaluators import MeanAveragePrecisionEvaluatorForMultipleIOUs, TopKAccuracyEvaluator, ThresholdAccuracyEvaluator, AveragePrecisionEvaluator, EceLossEvaluator

__all__ = ['MeanAveragePrecisionEvaluatorForMultipleIOUs', 'TopKAccuracyEvaluator', 'ThresholdAccuracyEvaluator', 'AveragePrecisionEvaluator', 'EceLossEvaluator']
@@ -0,0 +1,339 @@
import collections
import statistics
import sklearn.metrics
import numpy as np
from abc import ABC


def _top_k_prediction_indices(prediction, k):
    top_k_preds = np.argsort(-prediction, axis=1)[:, :k]
    return top_k_preds


def _targets_to_mat(targets, n_class):
    if len(targets.shape) == 1:
        target_mat = np.zeros((len(targets), n_class), dtype=int)
        for i, t in enumerate(targets):
            target_mat[i, t] = 1
    else:
        target_mat = targets

    return target_mat


class Evaluator(ABC):
    """Class to evaluate model outputs and report the result.
    """

    def __init__(self):
        self.reset()

    def add_predictions(self, predictions, targets):
        raise NotImplementedError

    def get_report(self, **kwargs):
        raise NotImplementedError

    def add_custom_field(self, name, value):
        self.custom_fields[name] = str(value)

    def reset(self):
        self.custom_fields = {}


class TopKAccuracyEvaluator(Evaluator):
    def __init__(self, k):
        self.k = k
        super(TopKAccuracyEvaluator, self).__init__()

    def reset(self):
        super(TopKAccuracyEvaluator, self).reset()
        self.total_num = 0
        self.topk_correct_num = 0

    def add_predictions(self, predictions, targets):
        """ Evaluate a batch of predictions.
        Args:
            predictions: the model output numpy array. Shape (N, num_class)
            targets: the ground truths. Shape (N,)
        """
        assert len(predictions) == len(targets)
        assert len(targets.shape) == 1

        n_sample = len(predictions)
        n_class = predictions.shape[1]

        k = min(self.k, n_class)
        top_k_predictions = _top_k_prediction_indices(predictions, k)
        self.topk_correct_num += len([1 for sample_idx in range(n_sample) if targets[sample_idx] in top_k_predictions[sample_idx]])

        self.total_num += len(predictions)

    def get_report(self, **kwargs):
        return {f'top{self.k}_accuracy': float(self.topk_correct_num) / self.total_num if self.total_num else 0.0}


class AveragePrecisionEvaluator(Evaluator, ABC):
    def reset(self):
        super(AveragePrecisionEvaluator, self).reset()
        self.all_targets = np.array([])
        self.all_predictions = np.array([])

    def add_predictions(self, predictions, targets):
        target_mat = _targets_to_mat(targets, predictions.shape[1])

        if self.all_predictions.size != 0:
            self.all_predictions = np.append(self.all_predictions, predictions, axis=0)
        else:
            self.all_predictions = np.copy(predictions)

        if self.all_targets.size != 0:
            self.all_targets = np.append(self.all_targets, target_mat, axis=0)
        else:
            self.all_targets = np.copy(target_mat)

    def calculate_average_precision_score(self, average='macro'):
        """
        average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
            If ``None``, the scores for each class are returned. Otherwise,
            this determines the type of averaging performed on the data:

            ``'micro'``:
                Calculate metrics globally by considering each element of the label
                indicator matrix as a label.
            ``'macro'``:
                Calculate metrics for each label, and find their unweighted
                mean. This does not take label imbalance into account.
            ``'weighted'``:
                Calculate metrics for each label, and find their average, weighted
                by support (the number of true instances for each label).
            ``'samples'``:
                Calculate metrics for each instance, and find their average.
        """
        assert self.all_targets.size == self.all_predictions.size
        ap = 0.0
        if self.all_targets.size > 0:
            non_empty_idx = np.where(np.invert(np.all(self.all_targets == 0, axis=0)))[0]
            if non_empty_idx.size != 0:
                ap = sklearn.metrics.average_precision_score(self.all_targets[:, non_empty_idx], self.all_predictions[:, non_empty_idx], average=average)

        return ap

    def get_report(self, **kwargs):
        return {'average_precision': self.calculate_average_precision_score(kwargs['average'])}


class EceLossEvaluator(Evaluator):
    """
    Computes the expected calibration error (ECE) given the model confidence and true labels for a set of data points.

    https://arxiv.org/pdf/1706.04599.pdf
    """

    def __init__(self, n_bins=15):
        # Calibration ECE. Divide the probability range [0, 1] into n_bins bins.
        self.n_bins = n_bins
        bins = np.linspace(0, 1, self.n_bins + 1)
        self.bin_lower_bounds = bins[:-1]
        self.bin_upper_bounds = bins[1:]
        super(EceLossEvaluator, self).__init__()

    def add_predictions(self, predictions, targets):
        """ Evaluate a batch of predictions.
        Args:
            predictions: the model output numpy array. Shape (N, num_class)
            targets: the ground truths. Shape (N,)
        """

        # calibration_ece

        self.total_num += len(predictions)

        indices = _top_k_prediction_indices(predictions, 1).flatten()
        confidence = predictions[np.arange(len(predictions)), indices]
        correct = (indices == targets)
        for bin_i in range(self.n_bins):
            bin_lower_bound, bin_upper_bound = self.bin_lower_bounds[bin_i], self.bin_upper_bounds[bin_i]
            in_bin = np.logical_and(confidence > bin_lower_bound, confidence <= bin_upper_bound)
            self.total_correct_in_bin[bin_i] += correct[in_bin].astype(int).sum()
            self.sum_confidence_in_bin[bin_i] += confidence[in_bin].astype(float).sum()

    def get_report(self, **kwargs):
        return {'calibration_ece': float(np.sum(np.abs(self.total_correct_in_bin - self.sum_confidence_in_bin)) / self.total_num) if self.total_num else 0.0}

    def reset(self):
        super(EceLossEvaluator, self).reset()
        self.total_num = 0
        self.total_correct_in_bin = np.zeros(self.n_bins)
        self.sum_confidence_in_bin = np.zeros(self.n_bins)


class ThresholdAccuracyEvaluator(Evaluator):
    def __init__(self, threshold):
        super(ThresholdAccuracyEvaluator, self).__init__()
        self._threshold = threshold

    def add_predictions(self, predictions, targets):
        """ Evaluate a batch of predictions.
        Args:
            predictions: the model output array. Shape (N, num_class)
            targets: the ground truths. Shape (N, num_class)
        """
        assert len(predictions) == len(targets)

        target_mat = _targets_to_mat(targets, predictions.shape[1])

        prediction_over_thres = predictions > self._threshold
        num = np.multiply(prediction_over_thres, target_mat).sum(1)  # shape (N,)
        den = (np.add(prediction_over_thres, target_mat) >= 1).sum(1)  # shape (N,)
        den[den == 0] = 1  # To avoid zero-division. If den==0, num should be zero as well.
        self.correct_num += (num / den).sum()
        self.total_num += len(predictions)

    def get_report(self, average='macro'):
        return {f'accuracy_{self._threshold}': float(self.correct_num) / self.total_num if self.total_num else 0.0}

    def reset(self):
        super(ThresholdAccuracyEvaluator, self).reset()
        self.correct_num = 0
        self.total_num = 0


class MeanAveragePrecisionEvaluatorForSingleIOU(Evaluator):
    def __init__(self, iou=0.5):
        super(MeanAveragePrecisionEvaluatorForSingleIOU, self).__init__()
        self.iou = iou

    def add_predictions(self, predictions, targets):
        """ Evaluate a list of images with object detection results using a single IOU threshold.
        Args:
            predictions: list of predictions [[[label_idx, probability, L, T, R, B], ...], [...], ...]
            targets: list of image targets [[[label_idx, L, T, R, B], ...], ...]
        """

        assert len(predictions) == len(targets)

        eval_predictions = collections.defaultdict(list)
        eval_ground_truths = collections.defaultdict(dict)
        for img_idx, prediction in enumerate(predictions):
            for bbox in prediction:
                label = int(bbox[0])
                eval_predictions[label].append([img_idx, float(bbox[1]), float(bbox[2]), float(bbox[3]), float(bbox[4]), float(bbox[5])])

        for img_idx, target in enumerate(targets):
            for bbox in target:
                label = int(bbox[0])
                if img_idx not in eval_ground_truths[label]:
                    eval_ground_truths[label][img_idx] = []
                eval_ground_truths[label][img_idx].append([float(bbox[1]), float(bbox[2]), float(bbox[3]), float(bbox[4])])

        class_indices = set(list(eval_predictions.keys()) + list(eval_ground_truths.keys()))
        for class_index in class_indices:
            is_correct, probabilities = self._evaluate_predictions(eval_ground_truths[class_index], eval_predictions[class_index], self.iou)
            true_num = sum([len(t) for t in eval_ground_truths[class_index].values()])

            self.is_correct[class_index].extend(is_correct)
            self.probabilities[class_index].extend(probabilities)
            self.true_num[class_index] += true_num

    @staticmethod
    def _calculate_area(rect):
        w = rect[2] - rect[0] + 1e-5
        h = rect[3] - rect[1] + 1e-5
        return float(w * h) if w > 0 and h > 0 else 0.0

    @staticmethod
    def _calculate_iou(rect0, rect1):
        rect_intersect = [max(rect0[0], rect1[0]),
                          max(rect0[1], rect1[1]),
                          min(rect0[2], rect1[2]),
                          min(rect0[3], rect1[3])]
        calc_area = MeanAveragePrecisionEvaluatorForSingleIOU._calculate_area
        area_intersect = calc_area(rect_intersect)
        return area_intersect / (calc_area(rect0) + calc_area(rect1) - area_intersect)

    def _is_true_positive(self, prediction, ground_truth, already_detected, iou_threshold):
        image_id = prediction[0]
        prediction_rect = prediction[2:6]
        if image_id not in ground_truth:
            return False, already_detected

        ious = np.array([self._calculate_iou(prediction_rect, g) for g in ground_truth[image_id]])
        best_bb = np.argmax(ious)
        best_iou = ious[best_bb]

        if best_iou < iou_threshold or (image_id, best_bb) in already_detected:
            return False, already_detected

        already_detected.add((image_id, best_bb))
        return True, already_detected

    def _evaluate_predictions(self, ground_truths, predictions, iou_threshold):
        """ Evaluate the correctness of the given predictions.
        Args:
            ground_truths: Ground truths for the class. {image_id: [[left, top, right, bottom], [...]], ...}
            predictions: List of predictions for the class. [[image_id, probability, left, top, right, bottom], [...], ...]
            iou_threshold: Minimum IOU threshold to be considered the same bounding box.
        """

        # Sort the predictions by the probability
        sorted_predictions = sorted(predictions, key=lambda x: -x[1])
        already_detected = set()
        is_correct = []
        for prediction in sorted_predictions:
            correct, already_detected = self._is_true_positive(prediction, ground_truths, already_detected,
                                                               iou_threshold)
            is_correct.append(correct)

        is_correct = np.array(is_correct)
        probabilities = np.array([p[1] for p in sorted_predictions])

        return is_correct, probabilities

    @staticmethod
    def _calculate_average_precision(is_correct, probabilities, true_num, average='macro'):
        if true_num == 0:
            return 0
        if not is_correct or not any(is_correct):
            return 0
        recall = float(np.sum(is_correct)) / true_num
        return sklearn.metrics.average_precision_score(is_correct, probabilities, average=average) * recall

    def get_report(self, average='macro'):
        all_aps = []
        for class_index in self.is_correct:
            ap = MeanAveragePrecisionEvaluatorForSingleIOU._calculate_average_precision(self.is_correct[class_index], self.probabilities[class_index], self.true_num[class_index], average)
            all_aps.append(ap)

        mean_ap = float(statistics.mean(all_aps)) if all_aps else 0.0
        return {"mAP_{}".format(int(self.iou * 100)): mean_ap}

    def reset(self):
        self.is_correct = collections.defaultdict(list)
        self.probabilities = collections.defaultdict(list)
        self.true_num = collections.defaultdict(int)
        super(MeanAveragePrecisionEvaluatorForSingleIOU, self).reset()


class MeanAveragePrecisionEvaluatorForMultipleIOUs(Evaluator):
    DEFAULT_IOU_VALUES = [0.3, 0.5, 0.75, 0.9]

    def __init__(self, ious=DEFAULT_IOU_VALUES):
        self.evaluators = [MeanAveragePrecisionEvaluatorForSingleIOU(iou)
                           for iou in ious]
        super(MeanAveragePrecisionEvaluatorForMultipleIOUs, self).__init__()

    def add_predictions(self, predictions, targets):
        for evaluator in self.evaluators:
            evaluator.add_predictions(predictions, targets)

    def get_report(self, **kwargs):
        report = {}
        for evaluator in self.evaluators:
            report.update(evaluator.get_report(kwargs['average']))
        return report

    def reset(self):
        for evaluator in self.evaluators:
            evaluator.reset()
        super(MeanAveragePrecisionEvaluatorForMultipleIOUs, self).reset()
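As promised after the README section, here is the corresponding detection sketch. It is an editorial illustration, not part of the commit: the boxes and scores are made up, while the box layout follows the `add_predictions` docstring above ([label_idx, probability, left, top, right, bottom] for predictions, [label_idx, left, top, right, bottom] for targets).

```python
from vision_evaluation.evaluators import MeanAveragePrecisionEvaluatorForSingleIOU

# One image with two predicted boxes, each matching its ground-truth box exactly.
predictions = [[[0, 0.9, 0.0, 0.0, 1.0, 1.0],
                [1, 0.8, 0.5, 0.5, 1.0, 1.0]]]
targets = [[[0, 0.0, 0.0, 1.0, 1.0],
            [1, 0.5, 0.5, 1.0, 1.0]]]

evaluator = MeanAveragePrecisionEvaluatorForSingleIOU(iou=0.5)
evaluator.add_predictions(predictions, targets)
print(evaluator.get_report())  # {'mAP_50': 1.0}
```

Note that, as written, `MeanAveragePrecisionEvaluatorForMultipleIOUs.get_report` reads `kwargs['average']`, so it must be called with an explicit `average` keyword argument (e.g., `get_report(average='macro')`).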