Add script to do full training of the final set of models for the face segmentation task (#238)

* Add a --test option that runs only the data prep step to verify the environment is working.
* Force train.py to grab the lock on the row (removing a rare failure case).
* Fix SNPE Kubernetes scaling using the anti-node affinity pattern.
* Publish a new docker image.
* Add MLflow integration to train.py.
* Add a script that runs the full training pipeline for the final pareto models.
* Switch to Bokeh so I can get nice tooltips on each dot in the scatter plot.
* Add axis titles.
* Add device F1 scoring to train_pareto.
* Add more to the readmes.
* Add image.
* Add a helper script to do final F1 scoring on Qualcomm devices.
* Fix lint errors.
* Fix bugs.
* Rev the environment version.
* Fix lint error.
* Rename the snp_test script and fix bugs.
* Add iteration 20.
* Fix bug.
* Add GIF animations.
* Fix bugs in snp_test.
* Fix bugs: snp_test needs to reset the .dlc files.
* Make loop.sh executable.
* Only reset the models we are actually going to test.
* Add the final SNPE F1 score chart.
* Improve the calc_pareto_frontier helper.
* Show final dots that fell off the pareto frontier in gray.
* Full training is complete; these are the final results.
Chris Lovett 2023-05-05 20:40:26 -07:00 (committed by GitHub)
Parent: bff0b3eb49
Commit: f807260cf4
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
14 changed files: 466 additions and 91 deletions

.gitignore (vendored)

@@ -167,3 +167,5 @@ android-ndk-r25c-linux.zip
tasks/face_segmentation/aml/docker/quantizer/quantizer.yaml
tasks/face_segmentation/.vscode/launch.json
tasks/face_segmentation/conda.yaml
tasks/face_segmentation/aml/notebooks/*.gif
tasks/face_segmentation/aml/notebooks/*.png


@@ -48,6 +48,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
overwrite: Whether to overwrite existing models.
max_retries: Maximum number of retries in `fetch_all`.
retry_interval: Interval between each retry attempt.
reset: Whether to reset the metrics.
onnx_export_kwargs: Dictionary containing key-value arguments for `torch.onnx.export`.
verbose: Whether to print debug messages.
"""
@@ -99,7 +100,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
print(f"Entry for {archid} already exists with {self.metric_key} = {value}")
return
else:
# complete but missing the mean, so reset the benchmark metrics so we can try again.
# force quantization to happen again in case the model has been retrained.
self._reset(entity)
else:
# job is still running, let it continue
@@ -111,8 +112,8 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
entity = self.store.get_status(archid) # this is a get or create operation.
if self.benchmark_only:
entity["benchmark_only"] = 1
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
elif 'benchmark_only' in entity:
del entity['benchmark_only']
self.store.update_status_entity(entity) # must be an update, not a merge.
self.store.lock_entity(entity, "uploading")
@@ -134,14 +135,20 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
)
self.store.upload_blob(f'{self.experiment_name}/{archid}', file_name, "model.onnx")
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
entity["status"] = "new"
except Exception as e:
entity["error"] = str(e)
entity["status"] = "error"
else:
# then the blob store must already have a model.onnx file!
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}')
if 'model.onnx' not in blobs:
entity["error"] = "model.onnx is missing"
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}/model.onnx')
if len(blobs) < 1:
print(f"model.onnx is missing for architecture {archid}")
return
else:
entity['status'] = 'ready'
self.store.unlock_entity(entity)
self.archids.append(archid)
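For orientation, the sketch below shows how this evaluator might be constructed. Only parameter names visible in this diff are used; the exact constructor signature, required arguments, and defaults are assumptions and should be checked against the class itself.

# Construction sketch only -- parameter names come from the docstring above; the
# exact signature, required arguments, and defaults are assumptions.
from archai.common.config import Config
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store   # same helper snp_test.py uses further down

config = Config('aml_search.yaml', resolve_env_vars=True)   # config file name is hypothetical
store = configure_store(config['aml'])                      # wraps the Azure table + blob store

evaluator = RemoteAzureBenchmarkEvaluator(
    input_shape=(1, 3, 256, 256),   # (batch, channels, height, width) used for the ONNX export
    store=store,
    experiment_name=config['aml']['experiment_name'],
    metric_key='f1_1k',             # status-table column read back as the objective value
    benchmark_only=False,           # False: upload a freshly trained model.onnx, then benchmark it
    overwrite=True,                 # overwrite any existing model blob
    max_retries=10,                 # fetch_all retry policy
    retry_interval=60,
    verbose=True,
)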


@@ -129,7 +129,7 @@ def main(output_dir: Path, experiment_name: str, seed: int, data_prep_only: bool
ml_client,
image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
conda_file="conda.yaml",
version='1.0.25')
version='1.0.26')
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
# Register the datastore with AML
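The environment-creation call above (the one taking image, conda_file, and version) presumably wraps the Azure ML SDK v2 environment registration; the helper itself is not shown in this diff. A rough sketch of what it likely does, with the helper name and wiring assumed:

# Sketch of conda-based AML environment registration with the azure-ai-ml SDK.
# The actual helper called above is not shown here, so treat this function as an
# illustration of what it likely does, not the real implementation.
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment

def register_conda_environment(ml_client: MLClient, name: str, image: str,
                               conda_file: str, version: str) -> Environment:
    env = Environment(
        name=name,
        image=image,            # e.g. the openmpi/ubuntu base image used above
        conda_file=conda_file,  # conda.yaml listing the python dependencies
        version=version,        # bumped to 1.0.26 in this change
    )
    # create_or_update registers the environment (or a new version of it) in the workspace
    return ml_client.environments.create_or_update(env)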

tasks/face_segmentation/aml/azure/loop.sh Normal file → Executable file


@@ -718,8 +718,10 @@ def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_qua
# other jobs were added/completed in parallel while this was executing.
priority, entity = queue.dequeue()
name = entity['name']
locked = False
try:
entity = lock_job(entity)
locked = True
benchmark_only_flag = is_benchmark_only(entity, benchmark_only)
gc.collect()
tracemalloc.start()
@@ -741,7 +743,8 @@
else:
# bug in the script somewhere... don't leave the node locked.
log_error(error_type, value, stack)
unlock_job(entity)
if locked:
unlock_job(entity)
sys.exit(1)
time.sleep(10) # give other machines a chance to grab work so we don't get stuck in retry loops.
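The `locked` flag added above ensures the error path only releases a lock this process actually acquired. Stripped down, the pattern looks like this; `process_entity` is a hypothetical stand-in for the real benchmarking work:

# Illustrative sketch of the "only unlock what you locked" pattern used above.
import sys

locked = False
try:
    entity = lock_job(entity)     # may raise if another node grabbed the row first
    locked = True
    process_entity(entity)        # hypothetical stand-in for the real benchmarking work
except Exception:
    log_error(*sys.exc_info())
    if locked:
        unlock_job(entity)        # never release a lock this process does not hold
    sys.exit(1)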

Binary data: tasks/face_segmentation/aml/images/animation.gif (new file, 295 KiB; binary file not shown)
Binary data: tasks/face_segmentation/aml/images/final_results.png (new file, 22 KiB; binary file not shown)
Binary data: tasks/face_segmentation/aml/images/pareto.gif (new file, 218 KiB; binary file not shown)

File diff hidden because one or more lines are too long.


@@ -10,7 +10,7 @@ The code is organized into:
of selected models on a GPU cluster in Azure ML.
1. [SNPE Device](snpe/readme.md) code that uses [Microsoft
Olive](https://github.com/microsoft/olive) to drive the the
Olive](https://github.com/microsoft/olive) to drive the
[Qualcomm Neural Processing SDK](https://developer.qualcomm.com/software/qualcomm-neural-processing-sdk) to talk
to the device, convert ONNX models to .dlc, quantize them, and test them on one or more
[Qualcomm 888 dev kits](https://developer.qualcomm.com/hardware/snapdragon-888-hdk).
@@ -27,23 +27,42 @@ things up a lot.
1. [Notebook](notebooks/results.ipynb) a simple Jupyter notebook for visualizing the
results found in your Azure table.
The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is a snapshot after 10 iterations are completed where the darker colors
are the early iterations and the brighter colors are the most recent iterations. The pareto frontier
models are in yellow. This clearly shows the general trend of model improvement over time on each new
iteration.
## Results
![snapshot](images/iteration10.png)
The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is an animation of the complete 20 search iterations where the darker
colors are the early iterations and the brighter colors are the most recent iterations. The pareto
frontier models are highlighted in yellow. This clearly shows the general trend of model improvement
over time on each new iteration.
![snapshot](images/animation.gif)
The following animation shows only the pareto models from each search iteration. These are the
models that get mutated during the evolutionary pareto search; all the other models have lower
validation scores and are discarded:
![snapshot](images/pareto.gif)
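To make the loop concrete, here is a tiny self-contained toy of the evolutionary pareto search idea described above (pure numpy, for illustration only; this is not the Archai implementation used by this task): keep the pareto frontier, mutate those models, re-evaluate, and repeat.

# Toy illustration of an evolutionary pareto search loop (NOT the Archai code).
import numpy as np

rng = np.random.default_rng(42)

def pareto_frontier(points: np.ndarray) -> list:
    """Indexes of non-dominated points, treating column 0 as a cost (lower is better,
    e.g. latency) and column 1 as a score (higher is better, e.g. val_iou)."""
    order = points[:, 0].argsort()          # cheapest first
    frontier, best_score = [], -np.inf
    for i in order:
        if points[i, 1] > best_score:       # beats every cheaper point seen so far
            frontier.append(i)
            best_score = points[i, 1]
    return frontier

population = rng.uniform(size=(16, 2))      # columns: (latency, val_iou) stand-ins
for iteration in range(20):
    parents = population[pareto_frontier(population)]
    mutants = np.clip(parents + rng.normal(scale=0.05, size=parents.shape), 0.0, 1.0)
    population = np.vstack([parents, mutants])
print(f"{len(pareto_frontier(population))} models on the final pareto frontier")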
When the search completes you can run [train_pareto.py](../../train_pareto.py) to fully train the
pareto models, then run [snp_test.py](../../snp_test.py) to compute the F1 scores for these fully
trained models on your Qualcomm hardware. The following is a plot you can get from the notebook
showing the final results. Notice that the Qualcomm hardware mostly matches our earlier `val_iou`
pareto curve, but not exactly; the dots shown in gray have fallen off the pareto frontier. This is
why it is always good to test your models on the target hardware. It is even better if that testing
can be done in the search loop so that the search finds models that work well on the target
hardware, as we have done in this face segmentation example:
![errors](images/final_results.png)
## Workflow
The overall workflow begins with the top level [aml.py](../../aml.py) script which
starts with an Archai Search that contains an `AmlPartialTrainingEvaluator` and a
`RemoteAzureBenchmarkEvaluator`. The remote benchmark evaluator performs inference latency testing
on Qualcomm hardware. The `AmlPartialTrainingEvaluator` then kicks off one new Azure ML
training pipeline for each batch of new model architectures that need to be partially trained, it
stores the validation IOU results in an Azure blob store and an Azure table so the search can get
those results and use them to figure out the next iteration of the search algorithm:
The overall workflow begins with the top level [aml.py](../../aml.py) script which starts with an
Archai Search that contains an `AmlPartialTrainingEvaluator` and a `RemoteAzureBenchmarkEvaluator`.
The remote benchmark evaluator performs inference latency testing on Qualcomm hardware. The
`AmlPartialTrainingEvaluator` then kicks off one new Azure ML training pipeline for each batch of
new model architectures that need to be partially trained; it stores the validation IOU results in
an Azure blob store and an Azure table so the search can get those results and use them to figure
out the next iteration of the search algorithm:
![system](images/system.png)
@@ -51,18 +70,20 @@ See [AML Training Readme](training/readme.md) for more information.
## Remote Inference Testing
The remote inference testing workflow looks like this, the `RemoteAzureBenchmarkEvaluator` uploads models to the same
Azure blob store, and adds a row to the status table. This triggers remote instances of the [runner.py](azure/runner.py) script
to process these new models on an attached Qualcomm device. Optionally some of the work can be done in the cloud
using a Kubernetes cluster, this includes model quantization and accuracy testing using the ONNX runtime.
The workflow looks like this:
The remote inference testing workflow works as follows: the `RemoteAzureBenchmarkEvaluator` uploads
models to the same Azure blob store and adds a row to the status table. This triggers remote
instances of the [runner.py](azure/runner.py) script to process these new models on an attached
Qualcomm device. Optionally, some of the work can be done in the cloud using a Kubernetes cluster;
this includes model quantization and accuracy testing using the ONNX runtime. The workflow looks
like this:
![snpe](images/snpe.png)
Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
the `find_work_prioritized` function in the runner. This script is completely restartable, and can distribute the work
across multiple instances of the runner script. Each instance will pick up where a previous one left off based on what
it finds in your Azure status table. The prioritization maps to the columns of the status table as follows:
Each instance of `runner.py` looks for work, and executes it in priority order where the
prioritization is defined by the `find_work_prioritized` function in the runner. This script is
completely restartable, and can distribute the work across multiple instances of the runner script.
Each instance will pick up where a previous one left off based on what it finds in your Azure status
table. The prioritization maps to the columns of the status table as follows:
1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30


@@ -1,17 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
def calc_pareto_frontier(points):
""" Given an array of points where the first 2 coordinates define a 2D point
return a list of array indexes that define the pareto frontier for these points """
return a sorted version of those points and a list of array indexes into that
sorted list that define the pareto frontier for these points """
points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = []
pareto += [0]
p1 = points[0]
for i in range(1, len(points)):
p2 = points[i]
p1 = sorted[0]
for i in range(1, len(sorted)):
p2 = sorted[i]
if p2[1] > p1[1]:
pareto += [i]
p1 = p2
return pareto
return (sorted, pareto)
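With the new signature the caller gets back the sorted points plus frontier indexes into that sorted array. A quick usage sketch with made-up values follows; the import path for the helper is an assumption:

# Usage sketch for the updated helper; the module path below is an assumption.
import numpy as np
from aml.util.pareto import calc_pareto_frontier

points = np.array([[1.2, 0.74], [2.0, 0.70], [3.1, 0.81], [4.5, 0.78]])  # made-up (latency, val_iou) pairs
sorted_points, pareto = calc_pareto_frontier(points)
frontier = sorted_points[pareto]
# frontier == [[1.2, 0.74], [3.1, 0.81]]: each frontier point improves the second
# coordinate relative to every point with a smaller first coordinate.
print(frontier)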


@@ -91,6 +91,7 @@ aml:
- matplotlib
- mldesigner
- mlflow
- tqdm
- tensorwatch
- torch
- torchvision


@@ -4,12 +4,26 @@ import argparse
import sys
from archai.discrete_search.api import ArchaiModel
from archai.common.config import Config
from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
from search_space.hgnet import HgnetSegmentationSearchSpace
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store
def reset_dlc(store, experiment_name, entity):
""" Reset the qualcomm dlc files and associated metrics for the given entity."""
changed = False
name = entity['name']
prefix = f'{experiment_name}/{name}'
print(f"Resetting .dlc files for model {name}")
store.delete_blobs(prefix, 'model.dlc')
store.delete_blobs(prefix, 'model.quant.dlc')
for k in ['mean', 'macs', 'params', 'stdev', 'total_inference_avg', 'error', 'f1_1k', 'f1_10k', 'f1_1k_f', 'f1_onnx', 'pipeline_id']:
if k in entity:
del entity[k]
changed = True
if changed:
store.update_status_entity(entity)
def main():
# input and output arguments
parser = argparse.ArgumentParser(description="Runs Snapdragon F1 scoring on the final fully trained models produced by train_pareto.py.")
@@ -23,7 +37,12 @@ def main():
metric_key = 'final_val_iou'
search_config = config['search']
ss_config = search_config['search_space']
ss_params = ss_config['params']
in_channels = ss_params['in_channels']
img_size = ss_params['img_size']
target_config = search_config.get('target', {})
# change the metric key to the one used for Snapdragon F1 scoring
target_config['metric_key'] = 'f1_1k'
target_name = target_config.pop('name', 'cpu')
device_evaluator = None
@@ -38,28 +57,23 @@ def main():
fully_trained += [e]
if len(fully_trained) == 0:
print(f"No fully trained models found with required metric '{metric_key}'")
print(f"No 'complete' models found with required metric '{metric_key}'")
sys.exit(1)
# the RemoteAzureBenchmarkEvaluator only needs the archid actually, doesn't need the nn.Module.
models = []
for i in fully_trained:
id = e['name']
e['status'] = 'preparing'
for e in fully_trained:
name = e['name']
# if this has not been F1 scored yet then add it to our list.
if 'benchmark_only' in e:
del e['benchmark_only']
store.update_status_entity(e)
models += [ArchaiModel(None, archid=id[3:])]
models += [ArchaiModel(None, archid=name[3:])]
# make sure we re-quantize the new fully trained model.
reset_dlc(store, experiment_name, e)
# kick off remote device training without the benchmark_only flag so we get the
# F1 scores for these fully trained models. Note the above results_path ensures the trained
# models are uploaded back to our models blob store.
search_space = HgnetSegmentationSearchSpace(
seed=42, # not important in this case.
**ss_config.get('params', {}),
)
input_shape = (1, search_space.in_channels, *search_space.img_size[::-1])
input_shape = (1, in_channels, *img_size[::-1])
device_evaluator = RemoteAzureBenchmarkEvaluator(
input_shape=input_shape,
store=store,


@@ -28,8 +28,8 @@ def main():
config = Config(args.config, resolve_env_vars=True)
aml_config = config['aml']
store = configure_store(aml_config)
evaluator = AmlPartialTrainingEvaluator(config, args.output, args.epochs, args.timeout)
output_path = Path(os.path.realpath(args.output))
evaluator = AmlPartialTrainingEvaluator(config, output_path, args.epochs, args.timeout)
store = evaluator.store
experiment_name = aml_config['experiment_name']
@@ -54,9 +54,7 @@ def main():
print(f"No models found with required metrics '{metric_key}' and '{target_metric_key}'")
sys.exit(1)
points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = calc_pareto_frontier(sorted)
sorted, pareto = calc_pareto_frontier(points)
print(f'Found {len(pareto)} models on pareto frontier')
# change the key so the evaluator updates a different field this time and