Performance function name in scatterplots (#36)

* Create performanceFunctionLabel prop

* Add performance_metric parameter to the compatibility_analysis API and display the metric name in the scatter plot.

* Rename performanceFunctionLabel prop -> performanceMetric for consistency

* Don't rename model_accuracy to Accuracy; center the metric name.

* Documentation for performance metric function
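
Any callable with the documented signature metric(model, dataset, device) can be plugged in. As an illustrative sketch only (not part of this commit; it assumes the model returns a (_, _, log_softmax) tuple, as the models in this library do):

import torch
from sklearn.metrics import f1_score

def model_f1(model, dataset, device="cpu"):
    # Hypothetical custom metric: macro-averaged F1 over the whole dataset.
    # dataset is a list of batched (input, target) pairs.
    predictions, targets = [], []
    with torch.no_grad():
        for data, target in dataset:
            if device != "cpu":
                data = data.to(device)
            _, _, output_logsoftmax = model(data)
            batch_predictions = torch.argmax(output_logsoftmax, 1)
            predictions.extend(batch_predictions.cpu().numpy())
            targets.extend(target.numpy())
    return f1_score(targets, predictions, average="macro")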

Co-authored-by: Nicholas King <v-nicki@microsoft.com>
Nicholas King 2020-10-23 13:53:14 -07:00 committed by Xavier Fernandes
Parent 47f5b3f893
Commit 071c32fd40
No key found matching this signature
GPG Key ID: 1B011D38C073A7F2
6 changed files with 67 additions and 60 deletions

View file

@@ -5,8 +5,8 @@ import copy
import json
import torch
import numpy as np
-from sklearn.metrics import accuracy_score
import backwardcompatibilityml.scores as scores
+from backwardcompatibilityml.metrics import model_accuracy

def train_epoch(epoch, network, optimizer, loss_function, training_set, batch_size_train,
@@ -426,36 +426,13 @@ def compatibility_scores(h1, h2, dataset, device="cpu"):
    return btc_dataset, bec_dataset

-def model_accuracy(model, dataset, device="cpu"):
-    number_of_batches = len(dataset)
-    model_performance = 0
-    with torch.no_grad():
-        for data, target in dataset:
-            if device != "cpu":
-                data = data.to(device)
-                target = target.to(device)
-            _, _, output_logsoftmax = model(data)
-            output_labels = torch.argmax(output_logsoftmax, 1)
-            if device != "cpu":
-                output_labels = output_labels.cpu()
-                target = target.cpu()
-            performance = accuracy_score(output_labels.numpy(), target.numpy())
-            model_performance += performance
-            # _clean_from_gpu([data, target])
-    model_performance /= number_of_batches
-    return model_performance

-def evaluate_model_performance_and_compatibility_on_dataset(h1, h2, dataset, performance_metric=None,
+def evaluate_model_performance_and_compatibility_on_dataset(h1, h2, dataset, performance_metric,
                                                             device="cpu"):
    """
    Args:
        h1: The reference model being used.
        h2: The model being trained / updated.
-        performance_metric: Optional performance metric to be used when evaluating the model.
-            If not specified then accuracy is used.
+        performance_metric: Performance metric to be used when evaluating the model.
        device: A string with values either "cpu" or "cuda" to indicate the
            device that PyTorch is performing training on. By default this
            value is "cpu". But in case your models reside on the GPU, make sure
@@ -498,25 +475,7 @@ def evaluate_model_performance_and_compatibility_on_dataset(h1, h2, dataset, per
            "incompatibleFraction": error_fraction
        })

-    if performance_metric is not None:
-        h2_performance = performance_metric(h2, dataset)
-    else:
-        h2_performance = 0
-        with torch.no_grad():
-            for data, target in dataset:
-                if device != "cpu":
-                    data = data.to(device)
-                    target = target.to(device)
-                _, _, output_logsoftmax = h2(data)
-                output_labels = torch.argmax(output_logsoftmax, 1)
-                if device != "cpu":
-                    output_labels = output_labels.cpu()
-                    target = target.cpu()
-                performance = accuracy_score(output_labels.numpy(), target.numpy())
-                h2_performance += performance
-                # _clean_from_gpu([data, target])
-        h2_performance /= number_of_batches
+    h2_performance = performance_metric(h2, dataset, device)

    btc, bec = compatibility_scores(h1, h2, dataset, device=device)
@@ -535,7 +494,7 @@ def evaluate_model_performance_and_compatibility_on_dataset(h1, h2, dataset, per
    }

-def evaluate_model_performance_and_compatibility(h1, h2, training_set, test_set, performance_metric=None,
+def evaluate_model_performance_and_compatibility(h1, h2, training_set, test_set, performance_metric,
                                                 device="cpu"):
    """
    Calculate the error overlap of h1 and h2 on a batched dataset.
@@ -544,8 +503,7 @@ def evaluate_model_performance_and_compatibility(h1, h2, training_set, test_set,
    Args:
        h1: The reference model being used.
        h2: The model being trained / updated.
-        performance_metric: Optional performance metric to be used when evaluating the model.
-            If not specified then accuracy is used.
+        performance_metric: Performance metric to be used when evaluating the model.
        training_set: The list of batched training samples as (input, target) pairs.
        test_set: The list of batched testing samples as (input, target) pairs.
        device: A string with values either "cpu" or "cuda" to indicate the
@@ -559,11 +517,11 @@ def evaluate_model_performance_and_compatibility(h1, h2, training_set, test_set,
    """
    training_set_performance_and_compatibility =\
        evaluate_model_performance_and_compatibility_on_dataset(
-            h1, h2, training_set, performance_metric=performance_metric,
+            h1, h2, training_set, performance_metric,
            device=device)
    testing_set_performance_and_compatibility =\
        evaluate_model_performance_and_compatibility_on_dataset(
-            h1, h2, test_set, performance_metric=performance_metric,
+            h1, h2, test_set, performance_metric,
            device=device)

    return {
@@ -646,7 +604,7 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
                        training_set, test_set, batch_size_train, batch_size_test,
                        OptimizerClass, optimizer_kwargs,
                        NewErrorLossClass, StrictImitationLossClass,
-                        performance_metric=None,
+                        performance_metric=model_accuracy,
                        lambda_c_stepsize=0.25, percent_complete_queue=None,
                        new_error_loss_kwargs=None,
                        strict_imitation_loss_kwargs=None,
@@ -676,8 +634,13 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
        StrictImitationLossClass: The class of the Strict Imitation style loss
            function to be instantiated and used to perform compatibility
            constrained training of our model h2.
-        performance_metric: Optional performance metric to be used when evaluating the model.
-            If not specified then accuracy is used.
+        performance_metric: A function to evaluate model performance. The function is
+            expected to have the following signature:
+                metric(model, dataset, device)
+            model: The model being evaluated
+            dataset: The dataset as a list of (input, target) pairs
+            device: The device PyTorch is using for training - "cpu" or "cuda"
+            If unspecified, then accuracy is used.
        lambda_c_stepsize: The increments of lambda_c to use as we sweep the parameter
            space between 0.0 and 1.0.
        percent_complete_queue: Optional thread safe queue to use for logging the
@@ -714,7 +677,7 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
        training_set_performance_and_compatibility =\
            evaluate_model_performance_and_compatibility_on_dataset(
-                h1, h2_new_error, training_set, performance_metric=performance_metric,
+                h1, h2_new_error, training_set, performance_metric,
                device=device)
        training_set_performance_and_compatibility["lambda_c"] = lambda_c
        training_set_performance_and_compatibility["training"] = True
@@ -739,7 +702,7 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
        testing_set_performance_and_compatibility =\
            evaluate_model_performance_and_compatibility_on_dataset(
-                h1, h2_new_error, test_set, performance_metric=performance_metric,
+                h1, h2_new_error, test_set, performance_metric,
                device=device)
        testing_set_performance_and_compatibility["lambda_c"] = lambda_c
        testing_set_performance_and_compatibility["training"] = False
@@ -774,7 +737,7 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
        training_set_performance_and_compatibility =\
            evaluate_model_performance_and_compatibility_on_dataset(
-                h1, h2_strict_imitation, training_set, performance_metric=performance_metric,
+                h1, h2_strict_imitation, training_set, performance_metric,
                device=device)
        training_set_performance_and_compatibility["lambda_c"] = lambda_c
        training_set_performance_and_compatibility["training"] = True
@@ -799,7 +762,7 @@ def compatibility_sweep(sweeps_folder_path, number_of_epochs, h1, h2,
        testing_set_performance_and_compatibility =\
            evaluate_model_performance_and_compatibility_on_dataset(
-                h1, h2_new_error, test_set, performance_metric=performance_metric,
+                h1, h2_new_error, test_set, performance_metric,
                device=device)
        testing_set_performance_and_compatibility["lambda_c"] = lambda_c
        testing_set_performance_and_compatibility["training"] = False
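
The helper entry points above now take the metric positionally. A minimal sketch of a call, assuming h1, h2 and the batched train/test sets already exist:

from backwardcompatibilityml.helpers.training import evaluate_model_performance_and_compatibility
from backwardcompatibilityml.metrics import model_accuracy

# Passing model_accuracy reproduces the old default behavior; any
# metric(model, dataset, device) callable may be substituted.
results = evaluate_model_performance_and_compatibility(
    h1, h2, training_set, test_set, model_accuracy, device="cpu")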

View file

@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import torch
+from sklearn.metrics import accuracy_score
+
+
+def model_accuracy(model, dataset, device="cpu"):
+    model_performance = 0
+    number_of_batches = len(dataset)
+    with torch.no_grad():
+        for data, target in dataset:
+            if device != "cpu":
+                data = data.to(device)
+                target = target.to(device)
+            _, _, output_logsoftmax = model(data)
+            output_labels = torch.argmax(output_logsoftmax, 1)
+            if device != "cpu":
+                output_labels = output_labels.cpu()
+                target = target.cpu()
+            performance = accuracy_score(output_labels.numpy(), target.numpy())
+            model_performance += performance
+            # _clean_from_gpu([data, target])
+    model_performance /= number_of_batches
+    return model_performance
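
Note that model_accuracy averages sklearn's accuracy_score per batch, so every batch contributes equally regardless of its size; with a ragged final batch the result is an approximation of dataset-level accuracy. A usage sketch, with trained_model and test_batches assumed:

from backwardcompatibilityml.metrics import model_accuracy

# test_batches: a list of (input, target) pairs; trained_model returns
# a (_, _, log_softmax) tuple as the models in this library do.
accuracy = model_accuracy(trained_model, test_batches, device="cpu")
print("mean per-batch accuracy:", accuracy)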

View file

@@ -6,6 +6,7 @@ import json
import threading
from queue import Queue
from backwardcompatibilityml.helpers import training
+from backwardcompatibilityml.metrics import model_accuracy


class SweepManager(object):
@@ -58,6 +59,7 @@
                 NewErrorLossClass, StrictImitationLossClass, lambda_c_stepsize=0.25,
                 new_error_loss_kwargs=None,
                 strict_imitation_loss_kwargs=None,
+                 performance_metric=model_accuracy,
                 device="cpu"):
        self.folder_name = folder_name
        self.number_of_epochs = number_of_epochs
@@ -71,6 +73,7 @@
        self.optimizer_kwargs = optimizer_kwargs
        self.NewErrorLossClass = NewErrorLossClass
        self.StrictImitationLossClass = StrictImitationLossClass
+        self.performance_metric = performance_metric
        self.lambda_c_stepsize = lambda_c_stepsize
        self.new_error_loss_kwargs = new_error_loss_kwargs
        self.strict_imitation_loss_kwargs = strict_imitation_loss_kwargs
@@ -83,7 +86,8 @@
                self.training_set, self.test_set,
                self.batch_size_train, self.batch_size_test,
                self.OptimizerClass, self.optimizer_kwargs,
-                self.NewErrorLossClass, self.StrictImitationLossClass,),
+                self.NewErrorLossClass, self.StrictImitationLossClass,
+                self.performance_metric,),
            kwargs={
                "lambda_c_stepsize": self.lambda_c_stepsize,
                "percent_complete_queue": self.percent_complete_queue,
@@ -116,6 +120,7 @@
    def get_sweep_summary(self):
        sweep_summary = {
            "h1_performance": None,
+            "performance_metric": self.performance_metric.__name__,
            "data": []
        }
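
Because the metric arrives as a function object, the sweep summary serializes its __name__ for the front end; this is also why model_accuracy is no longer renamed to "Accuracy" in the UI. For the default metric, the summary starts out roughly as:

# Approximate initial shape of get_sweep_summary()'s return value;
# "data" is populated during the sweep (elided from this diff).
sweep_summary = {
    "h1_performance": None,
    "performance_metric": "model_accuracy",  # self.performance_metric.__name__
    "data": []
}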

View file

@@ -12,6 +12,7 @@ import torch.optim as optim
from flask import Response
from backwardcompatibilityml import loss
from backwardcompatibilityml.sweep_management import SweepManager
+from backwardcompatibilityml.metrics import model_accuracy
from rai_core_flask.flask_helper import FlaskHelper
from rai_core_flask.environments import (
    AzureNBEnvironment,
@@ -169,6 +170,13 @@ class CompatibilityAnalysis(object):
        StrictImitationLossClass: The class of the Strict Imitation style loss
            function to be instantiated and used to perform compatibility
            constrained training of our model h2.
+        performance_metric: A function to evaluate model performance. The function is
+            expected to have the following signature:
+                metric(model, dataset, device)
+            model: The model being evaluated
+            dataset: The dataset as a list of (input, target) pairs
+            device: The device PyTorch is using for training - "cpu" or "cuda"
+            If unspecified, then accuracy is used.
        port: An integer value to indicate the port to which the Flask service
            should bind.
        device: A string with values either "cpu" or "cuda" to indicate the
@@ -182,6 +190,7 @@ class CompatibilityAnalysis(object):
                 batch_size_train, batch_size_test, lambda_c_stepsize=0.25,
                 OptimizerClass=None, optimizer_kwargs=None,
                 NewErrorLossClass=None, StrictImitationLossClass=None,
+                 performance_metric=model_accuracy,
                 port=None, new_error_loss_kwargs=None,
                 strict_imitation_loss_kwargs=None, device="cpu"):
        if OptimizerClass is None:
if OptimizerClass is None:
@@ -211,6 +220,7 @@
            lambda_c_stepsize=lambda_c_stepsize,
            new_error_loss_kwargs=new_error_loss_kwargs,
            strict_imitation_loss_kwargs=strict_imitation_loss_kwargs,
+            performance_metric=performance_metric,
            device=device)

        self.flask_service = FlaskHelper(ip="0.0.0.0", port=port)
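
At the widget level the new keyword is optional and defaults to model_accuracy. A hypothetical instantiation with the custom metric sketched earlier; the import path and the leading positional arguments (mirroring the SweepManager fields) are assumptions, not shown in this diff:

from backwardcompatibilityml.widget.compatibility_analysis import CompatibilityAnalysis  # path assumed

analysis = CompatibilityAnalysis(
    "sweeps", 10, h1, h2, training_set, test_set,
    batch_size_train=64, batch_size_test=128,
    performance_metric=model_f1,  # omit to fall back to model_accuracy
    device="cpu")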

View file

@@ -103,6 +103,7 @@ function Container({
          <PerformanceCompatibility
            data={data.data}
            h1Performance={data.h1_performance}
+            performanceMetric={data.performance_metric}
            training={training}
            testing={testing}
            newError={newError}
@@ -115,6 +116,7 @@
          <PerformanceCompatibility
            data={data.data}
            h1Performance={data.h1_performance}
+            performanceMetric={data.performance_metric}
            training={training}
            testing={testing}
            newError={newError}

View file

@@ -25,6 +25,7 @@ type PerformanceCompatibilityProps = {
  strictImitation: boolean,
  selectedDataPoint: any,
  compatibilityScoreType: string,
+  performanceMetric: string,
  selectDataPoint: (d: any) => void,
  getModelEvaluationData: (evaluationId: number) => void
}
@@ -170,11 +171,11 @@ class PerformanceCompatibility extends Component<PerformanceCompatibilityProps,
      .append('text')
      .attr('id', 'yAxisLabel')
      .attr('transform','rotate(-90)')
-      .attr('x',-h/2)
+      .attr('x',-h/2+2.5*this.props.performanceMetric.length)
      .attr('y',-50)
      .attr('dy','.71em')
      .style('text-anchor','end')
-      .text('Performance')
+      .text(this.props.performanceMetric)
      .attr("font-family", "sans-serif")
      .attr("font-size", "20px")
      .attr("fill", "black");