remove old notebook (#231)

* Remove old notebook. * Store ONNX latency; allow AML partial training with no Snapdragon mode. * Fix Dockerfile now that the AML branch is merged. * Fix bug in reset; add notebook. * Add link to notebook.
2023-04-22 03:23:38 -07:00 · 2023-04-22 03:23:38 -07:00 · fc031eb447
--- a/archai/common/store.py
+++ b/archai/common/store.py
@ -390,7 +390,8 @@ class ArchaiStore:
            self._reset(e, except_list)

    def _reset(self, e, except_list=[]):
-        if self.is_locked_by_other(e):
+        name = e['name']
+        if self.is_locked_by_other(name):
            node = self.get_lock(e)
            print(f"Skipping {e['RowKey']} as it is locked by {node}")
        elif self._reset_metrics(e, except_list):
--- a/archai/discrete_search/evaluators/remote_azure_benchmark.py
+++ b/archai/discrete_search/evaluators/remote_azure_benchmark.py
@ -72,6 +72,15 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
        _ = self.store.get_existing_status(unknown_id)
        _ = self.store.list_blobs(unknown_id)

+    def _reset(self, entity):
+        changed = False
+        for k in ['mean', 'macs', 'params', 'stdev', 'total_inference_avg', 'error']:
+            if k in entity:
+                del entity[k]
+                changed = True
+        if changed:
+            self.store.update_status_entity(entity)
+
    @overrides
    def send(self, arch: ArchaiModel, budget: Optional[float] = None) -> None:
        # bug in azure ml sdk requires blob store folder names not begin with digits, so we prefix with 'id_'
@ -88,8 +97,8 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
                        print(f"Entry for {archid} already exists with {self.metric_key} = {value}")
                    return
                else:
-                    # complete but missing the mean, so reset everything so we can try again below.
-                    self.store.reset(archid, ['benchmark_only', 'model_date'])
+                    # complete but missing the mean, so reset the benchmark metrics so we can try again.
+                    self._reset(entity)
            else:
                # job is still running, let it continue
                if self.verbose:
--- a/tasks/face_segmentation/aml.py
+++ b/tasks/face_segmentation/aml.py
@ -129,7 +129,7 @@ def main(output_dir: Path, experiment_name: str, seed: int):
        ml_client,
        image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
        conda_file="conda.yaml",
-        version='1.0.20')
+        version='1.0.21')
    environment_name = f"{archai_job_env.name}:{archai_job_env.version}"

    # Register the datastore with AML
--- a/tasks/face_segmentation/aml/docker/quantizer/Dockerfile
+++ b/tasks/face_segmentation/aml/docker/quantizer/Dockerfile
@ -63,7 +63,7 @@ RUN wget -O azcopy_v10.tar.gz https://aka.ms/downloadazcopy-v10-linux && tar -xf
 # this echo is a trick to bypass docker build cache.
 # simply change the echo string every time you want docker build to pull down new bits.
 RUN echo '04/18/2023 12:22 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
-RUN cd archai && git checkout clovett/aml && pip install -e .[dev]
+RUN cd archai && pip install -e .[dev]

 RUN echo "using this pip version: " && which pip
 RUN echo "using this python version: " && which python
--- a/tasks/face_segmentation/aml/notebooks/gallery/gallery_performance.ipynb
+++ b/tasks/face_segmentation/aml/notebooks/gallery/gallery_performance.ipynb
--- a/tasks/face_segmentation/aml/notebooks/gallery/pareto.py
+++ b/tasks/face_segmentation/aml/notebooks/gallery/pareto.py
@ -1,103 +0,0 @@
-import numpy as np
-from scipy import interpolate
-from scipy import spatial
-from functools import reduce
-
-
-def spline_fit(xs, ys):
-    indexes = np.argsort(np.array(xs), axis=0)
-    ixs = []
-    iys = []
-    for i in indexes:
-        ixs += [xs[i]]
-        iys += [ys[i]]
-
-    f = interpolate.UnivariateSpline(ixs, iys)
-    f.set_smoothing_factor(0.1)
-    xn = np.linspace(ixs[0], ixs[-1], 100)
-    return [xn, f(xn)]
-
-
-def filter_(pts, pt):
-    """
-    Get all points in pts that are not Pareto dominated by the point pt
-    """
-    weakly_worse = (pts <= pt).all(axis=-1)
-    strictly_worse = (pts < pt).any(axis=-1)
-    return pts[~(weakly_worse & strictly_worse)]
-
-
-def get_pareto_undominated_by(pts1, pts2=None):
-    """
-    Return all points in pts1 that are not Pareto dominated
-    by any points in pts2
-    """
-    if pts2 is None:
-        pts2 = pts1
-    return reduce(filter_, pts2, pts1)
-
-
-def get_pareto_frontier(pts):
-    """
-    Iteratively filter points based on the convex hull heuristic
-    """
-    pareto_groups = []
-
-    # loop while there are points remaining
-    while pts.shape[0]:
-        # brute force if there are few points:
-        if pts.shape[0] < 10:
-            pareto_groups.append(get_pareto_undominated_by(pts))
-            break
-
-        # compute vertices of the convex hull
-        hull_vertices = spatial.ConvexHull(pts).vertices
-
-        # get corresponding points
-        hull_pts = pts[hull_vertices]
-
-        # get points in pts that are not convex hull vertices
-        nonhull_mask = np.ones(pts.shape[0], dtype=bool)
-        nonhull_mask[hull_vertices] = False
-        pts = pts[nonhull_mask]
-
-        # get points in the convex hull that are on the Pareto frontier
-        pareto = get_pareto_undominated_by(hull_pts)
-        pareto_groups.append(pareto)
-
-        # filter remaining points to keep those not dominated by
-        # Pareto points of the convex hull
-        pts = get_pareto_undominated_by(pts, pareto)
-
-    return np.vstack(pareto_groups)
-
-
-def pareto_curve(Xs, Ys):
-    # in our case faster is better so invert the X axis
-    pts = np.array(list(zip(Xs, Ys))) * np.array((-1, 1))
-    result = get_pareto_frontier(pts)
-    indices = []
-    for pt in result:
-        i = np.where(pts == pt)[0][0]
-        indices += [i]
-    return indices
-
-
-def get_pareto_edges(xs, ys):
-    left = None
-    top = None
-    bottom = None
-    right = None
-    for i in range(len(xs)):
-        x = xs[i]
-        y = ys[i]
-        if left is None or x < left:
-            left = x
-        if bottom is None or y < bottom:
-            bottom = y
-        if top is None or y > top:
-            top = y
-        if right is None or x > right:
-            right = x
-
-    return (left, bottom, right, top)
--- a/tasks/face_segmentation/aml/notebooks/results.ipynb
+++ b/tasks/face_segmentation/aml/notebooks/results.ipynb
--- a/tasks/face_segmentation/aml/readme.md
+++ b/tasks/face_segmentation/aml/readme.md
@ -24,9 +24,8 @@ creating a docker image for running in an Azure Kubernetes cluster to do model q
 the Qualcomm Neural Processing SDK. Quantization is time consuming so having an elastic scale speeds
 things up a lot.

-1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the
-results from your Azure "status" table.
-
+1. [Notebook](notebooks/results.ipynb) is a simple Jupyter notebook for visualizing the
+results in your Azure table.

 ## Workflow

--- a/tasks/face_segmentation/aml/training/onnx_latency.py
+++ b/tasks/face_segmentation/aml/training/onnx_latency.py
@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+from typing import Any, Dict, List, Optional, Tuple, Union
+from overrides import overrides
+from archai.discrete_search.evaluators import AvgOnnxLatency
+from archai.discrete_search.api.archai_model import ArchaiModel
+from archai.common.store import ArchaiStore
+
+
+class AvgOnnxLatencyEvaluator(AvgOnnxLatency):
+    """Evaluate the average ONNX Latency (in seconds) of an architecture and store the result.
+    The latency is measured by running the model on random inputs and averaging the latency over
+    `num_trials` trials.
+
+    """
+
+    def __init__(
+        self,
+        input_shape: Union[Tuple[int, ...], List[Tuple[int, ...]]],
+        num_trials: Optional[int] = 1,
+        input_dtype: Optional[str] = "torch.FloatTensor",
+        rand_range: Optional[Tuple[float, float]] = (0.0, 1.0),
+        export_kwargs: Optional[Dict[str, Any]] = None,
+        device: Optional[str] = 'cpu',
+        inf_session_kwargs: Optional[Dict[str, Any]] = None,
+        store: ArchaiStore = None,
+        metric_key: str = 'onnx_latency'
+    ) -> None:
+        super(AvgOnnxLatencyEvaluator, self).__init__(
+            input_shape,
+            num_trials,
+            input_dtype,
+            rand_range,
+            export_kwargs,
+            device,
+            inf_session_kwargs)
+        self.store = store
+        self.metric_key = metric_key
+
+    @overrides
+    def evaluate(self, model: ArchaiModel, budget: Optional[float] = None) -> float:
+        result = super(AvgOnnxLatencyEvaluator, self).evaluate(model, budget)
+        if self.store is not None:
+            archid = f'id_{model.archid}'
+            e = self.store.get_status(archid)
+            e['status'] = 'complete'
+            e[self.metric_key] = result
+            self.store.merge_status_entity(e)
+        return result
--- a/tasks/face_segmentation/confs/aml_search.yaml
+++ b/tasks/face_segmentation/confs/aml_search.yaml
@ -92,9 +92,6 @@ aml:
        - matplotlib
        - mldesigner
        - mlflow
-        - onnx>=1.10.2
-        - onnxruntime>=1.10.0
-        - psutil
        - torch
        - torchvision
        - torchaudio
--- a/tasks/face_segmentation/search.py
+++ b/tasks/face_segmentation/search.py
@ -16,13 +16,14 @@ from archai.discrete_search.algos import (
    MoBananasSearch, EvolutionParetoSearch, LocalSearch,
    RandomSearch, RegularizedEvolutionSearch
 )
-from archai.discrete_search.evaluators import TorchNumParameters, AvgOnnxLatency, RayParallelEvaluator
+from archai.discrete_search.evaluators import TorchNumParameters, RayParallelEvaluator
 from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator

 from search_space.hgnet import HgnetSegmentationSearchSpace
 from training.partial_training_evaluator import PartialTrainingValIOU
 from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
 from aml.util.setup import configure_store
+from aml.training.onnx_latency import AvgOnnxLatencyEvaluator

 AVAILABLE_ALGOS = {
    'mo_bananas': MoBananasSearch,
@ -100,24 +101,27 @@ def main():
        constraint=(1e6, max_parameters)
    )

+    aml_training = False
+    store = None
+    if 'aml' in config:
+        aml_config = config['aml']
+        experiment_name = aml_config.get('experiment_name', 'facesynthetics')
+        store: ArchaiStore = configure_store(aml_config)
+        aml_training = 'training_cluster' in aml_config
+
    # Adds a constrained objective on model latency so we don't pick models that are too slow.
    so.add_objective(
        'CPU ONNX Latency (s)',
-        AvgOnnxLatency(
-            input_shape=input_shape, export_kwargs={'opset_version': 11}
+        AvgOnnxLatencyEvaluator(
+            input_shape=input_shape, export_kwargs={'opset_version': 11}, store=store
        ),
        higher_is_better=False,
        compute_intensive=False,
        constraint=[0, max_latency]
    )

-    aml_training = False
-
    if target_name == 'snp':
        # Gets connection string from env variable
-        aml_config = config['aml']
-        experiment_name = aml_config.get('experiment_name', 'facesynthetics')
-        store: ArchaiStore = configure_store(aml_config)
        evaluator = RemoteAzureBenchmarkEvaluator(
            input_shape=input_shape,
            store=store,
@ -133,8 +137,6 @@ def main():
            compute_intensive=True
        )

-        aml_training = 'training_cluster' in aml_config
-
    if aml_training:
        # do the partial training on an AML gpu cluster
        partial_tr_obj = AmlPartialTrainingEvaluator(