Add script to do full training of the final set of models for the face segmentation task (#238)

* Add a --test option that runs only the data prep step to verify the environment is working.
* Force train.py to grab the lock on the row (removing a rare failure case).
* Fix SNPE Kubernetes scaling using the anti-node affinity pattern.
* Publish a new docker image.
* Add MLflow integration to train.py.
* Add a script that runs the full training pipeline for the final pareto models.
* Switch to Bokeh so I can get nice tooltips on each dot in the scatter plot.
* Add axis titles.
* Add device F1 scoring to train_pareto.
* Add more to the readmes.
* Add image.
* Add a helper script to do final F1 scoring on Qualcomm devices.
* Fix lint errors.
* Fix bugs.
* Rev the environment version.
* Fix lint error.
* Rename the snp_test script and fix bugs.
* Add iteration 20.
* Fix bug.
* Add GIF animations.
* Fix bugs in snp_test.
* Fix bugs: snp_test needs to reset the .dlc files.
* Make loop.sh executable.
* Only reset the models we are actually going to test.
* Add the final SNPE F1 score chart.
* Improve the calc_pareto_frontier helper.
* Show final dots that fell off the pareto frontier in gray.
* Full training is complete; these are the final results.
Chris Lovett 2023-05-05 20:40:26 -07:00 (committed by GitHub)
Parent: bff0b3eb49
Commit: f807260cf4
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
14 changed files: 466 additions and 91 deletions

.gitignore (vendored)

@@ -167,3 +167,5 @@ android-ndk-r25c-linux.zip
tasks/face_segmentation/aml/docker/quantizer/quantizer.yaml
tasks/face_segmentation/.vscode/launch.json
tasks/face_segmentation/conda.yaml
tasks/face_segmentation/aml/notebooks/*.gif
tasks/face_segmentation/aml/notebooks/*.png


@@ -48,6 +48,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
overwrite: Whether to overwrite existing models.
max_retries: Maximum number of retries in `fetch_all`.
retry_interval: Interval between each retry attempt.
reset: Whether to reset the metrics.
onnx_export_kwargs: Dictionary containing key-value arguments for `torch.onnx.export`.
verbose: Whether to print debug messages.
"""
@@ -99,7 +100,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
print(f"Entry for {archid} already exists with {self.metric_key} = {value}")
return
else:
# complete but missing the mean, so reset the benchmark metrics so we can try again.
# force quantization to happen again in case the model has been retrained.
self._reset(entity)
else:
# job is still running, let it continue
@@ -111,8 +112,8 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
entity = self.store.get_status(archid) # this is a get or create operation.
if self.benchmark_only:
entity["benchmark_only"] = 1
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
elif 'benchmark_only' in entity:
del entity['benchmark_only']
self.store.update_status_entity(entity) # must be an update, not a merge.
self.store.lock_entity(entity, "uploading")
@@ -134,14 +135,20 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
)
self.store.upload_blob(f'{self.experiment_name}/{archid}', file_name, "model.onnx")
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
entity["status"] = "new"
except Exception as e:
entity["error"] = str(e)
entity["status"] = "error"
else:
# then the blob store must already have a model.onnx file!
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}')
if 'model.onnx' not in blobs:
entity["error"] = "model.onnx is missing"
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}/model.onnx')
if len(blobs) < 1:
print(f"model.onnx is missing for architecture {archid}")
return
else:
entity['status'] = 'ready'
self.store.unlock_entity(entity)
self.archids.append(archid)
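For orientation, the sketch below shows how this evaluator might be constructed. Only parameter names visible in this diff are used; the exact constructor signature, required arguments, and defaults are assumptions and should be checked against the class itself.

# Construction sketch only -- parameter names come from the docstring above; the
# exact signature, required arguments, and defaults are assumptions.
from archai.common.config import Config
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store   # same helper snp_test.py uses further down

config = Config('aml_search.yaml', resolve_env_vars=True)   # config file name is hypothetical
store = configure_store(config['aml'])                      # wraps the Azure table + blob store

evaluator = RemoteAzureBenchmarkEvaluator(
    input_shape=(1, 3, 256, 256),   # (batch, channels, height, width) used for the ONNX export
    store=store,
    experiment_name=config['aml']['experiment_name'],
    metric_key='f1_1k',             # status-table column read back as the objective value
    benchmark_only=False,           # False: upload a freshly trained model.onnx, then benchmark it
    overwrite=True,                 # overwrite any existing model blob
    max_retries=10,                 # fetch_all retry policy
    retry_interval=60,
    verbose=True,
)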


@@ -129,7 +129,7 @@ def main(output_dir: Path, experiment_name: str, seed: int, data_prep_only: bool
ml_client,
image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
conda_file="conda.yaml",
version='1.0.25')
version='1.0.26')
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
# Register the datastore with AML
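The environment-creation call above (the one taking image, conda_file, and version) presumably wraps the Azure ML SDK v2 environment registration; the helper itself is not shown in this diff. A rough sketch of what it likely does, with the helper name and wiring assumed:

# Sketch of conda-based AML environment registration with the azure-ai-ml SDK.
# The actual helper called above is not shown here, so treat this function as an
# illustration of what it likely does, not the real implementation.
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment

def register_conda_environment(ml_client: MLClient, name: str, image: str,
                               conda_file: str, version: str) -> Environment:
    env = Environment(
        name=name,
        image=image,            # e.g. the openmpi/ubuntu base image used above
        conda_file=conda_file,  # conda.yaml listing the python dependencies
        version=version,        # bumped to 1.0.26 in this change
    )
    # create_or_update registers the environment (or a new version of it) in the workspace
    return ml_client.environments.create_or_update(env)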

tasks/face_segmentation/aml/azure/loop.sh Normal file → Executable file


@@ -718,8 +718,10 @@ def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_qua
# other jobs were added/completed in parallel while this was executing.
priority, entity = queue.dequeue()
name = entity['name']
locked = False
try:
entity = lock_job(entity)
locked = True
benchmark_only_flag = is_benchmark_only(entity, benchmark_only)
gc.collect()
tracemalloc.start()
@@ -741,7 +743,8 @@
else:
# bug in the script somewhere... don't leave the node locked.
log_error(error_type, value, stack)
unlock_job(entity)
if locked:
unlock_job(entity)
sys.exit(1)
time.sleep(10) # give other machines a chance to grab work so we don't get stuck in retry loops.
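The `locked` flag added above ensures the error path only releases a lock this process actually acquired. Stripped down, the pattern looks like this; `process_entity` is a hypothetical stand-in for the real benchmarking work:

# Illustrative sketch of the "only unlock what you locked" pattern used above.
import sys

locked = False
try:
    entity = lock_job(entity)     # may raise if another node grabbed the row first
    locked = True
    process_entity(entity)        # hypothetical stand-in for the real benchmarking work
except Exception:
    log_error(*sys.exc_info())
    if locked:
        unlock_job(entity)        # never release a lock this process does not hold
    sys.exit(1)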

Binary data: tasks/face_segmentation/aml/images/animation.gif (new file, 295 KiB; binary file not shown)
Binary data: tasks/face_segmentation/aml/images/final_results.png (new file, 22 KiB; binary file not shown)
Binary data: tasks/face_segmentation/aml/images/pareto.gif (new file, 218 KiB; binary file not shown)

File diff hidden because one or more lines are too long.


@@ -10,7 +10,7 @@ The code is organized into:
of selected models on a GPU cluster in Azure ML.
1. [SNPE Device](snpe/readme.md) code that uses [Microsoft
Olive](https://github.com/microsoft/olive) to drive the the
Olive](https://github.com/microsoft/olive) to drive the
[Qualcomm Neural Processing SDK](https://developer.qualcomm.com/software/qualcomm-neural-processing-sdk) to talk
to the device, convert ONNX models to .dlc, quantize them, and test them on one or more
[Qualcomm 888 dev kits](https://developer.qualcomm.com/hardware/snapdragon-888-hdk).
@@ -27,23 +27,42 @@ things up a lot.
1. [Notebook](notebooks/results.ipynb) a simple Jupyter notebook for visualizing the
results found in your Azure table.
The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is a snapshot after 10 iterations are completed where the darker colors
are the early iterations and the brighter colors are the most recent iterations. The pareto frontier
models are in yellow. This clearly shows the general trend of model improvement over time on each new
iteration.
## Results
![snapshot](images/iteration10.png)
The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is an animation of the complete 20 search iterations where the darker
colors are the early iterations and the brighter colors are the most recent iterations. The pareto
frontier models are highlighted in yellow. This clearly shows the general trend of model improvement
over time on each new iteration.
![snapshot](images/animation.gif)
The following animation shows only the pareto models from each search iteration. These are the
models that get mutated during the evolutionary pareto search; all the other models have lower
validation scores and are discarded:
![snapshot](images/pareto.gif)
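To make the loop concrete, here is a tiny self-contained toy of the evolutionary pareto search idea described above (pure numpy, for illustration only; this is not the Archai implementation used by this task): keep the pareto frontier, mutate those models, re-evaluate, and repeat.

# Toy illustration of an evolutionary pareto search loop (NOT the Archai code).
import numpy as np

rng = np.random.default_rng(42)

def pareto_frontier(points: np.ndarray) -> list:
    """Indexes of non-dominated points, treating column 0 as a cost (lower is better,
    e.g. latency) and column 1 as a score (higher is better, e.g. val_iou)."""
    order = points[:, 0].argsort()          # cheapest first
    frontier, best_score = [], -np.inf
    for i in order:
        if points[i, 1] > best_score:       # beats every cheaper point seen so far
            frontier.append(i)
            best_score = points[i, 1]
    return frontier

population = rng.uniform(size=(16, 2))      # columns: (latency, val_iou) stand-ins
for iteration in range(20):
    parents = population[pareto_frontier(population)]
    mutants = np.clip(parents + rng.normal(scale=0.05, size=parents.shape), 0.0, 1.0)
    population = np.vstack([parents, mutants])
print(f"{len(pareto_frontier(population))} models on the final pareto frontier")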
When the search completes you can run [train_pareto.py](../../train_pareto.py) to fully train the
pareto models, then run [snp_test.py](../../snp_test.py) to compute the F1 scores for these fully
trained models on your Qualcomm hardware. The following is a plot you can get from the notebook
showing the final results. Notice that the Qualcomm hardware mostly matches our earlier `val_iou`
pareto curve, but not exactly; the dots shown in gray have fallen off the pareto frontier. This is
why it is always good to test your models on the target hardware. It is even better if that testing
can be done in the search loop so that the search finds models that work well on the target
hardware, as we have done in this face segmentation example:
![errors](images/final_results.png)
## Workflow
The overall workflow begins with the top level [aml.py](../../aml.py) script which
starts with an Archai Search that contains an `AmlPartialTrainingEvaluator` and a
`RemoteAzureBenchmarkEvaluator`. The remote benchmark evaluator performs inference latency testing
on Qualcomm hardware. The `AmlPartialTrainingEvaluator` then kicks off one new Azure ML
training pipeline for each batch of new model architectures that need to be partially trained, it
stores the validation IOU results in an Azure blob store and an Azure table so the search can get
those results and use them to figure out the next iteration of the search algorithm:
The overall workflow begins with the top level [aml.py](../../aml.py) script which starts with an
Archai Search that contains an `AmlPartialTrainingEvaluator` and a `RemoteAzureBenchmarkEvaluator`.
The remote benchmark evaluator performs inference latency testing on Qualcomm hardware. The
`AmlPartialTrainingEvaluator` then kicks off one new Azure ML training pipeline for each batch of
new model architectures that need to be partially trained; it stores the validation IOU results in
an Azure blob store and an Azure table so the search can get those results and use them to figure
out the next iteration of the search algorithm:
![system](images/system.png)
@@ -51,18 +70,20 @@ See [AML Training Readme](training/readme.md) for more information.
## Remote Inference Testing
The remote inference testing workflow looks like this, the `RemoteAzureBenchmarkEvaluator` uploads models to the same
Azure blob store, and adds a row to the status table. This triggers remote instances of the [runner.py](azure/runner.py) script
to process these new models on an attached Qualcomm device. Optionally some of the work can be done in the cloud
using a Kubernetes cluster, this includes model quantization and accuracy testing using the ONNX runtime.
The workflow looks like this:
The remote inference testing workflow works as follows: the `RemoteAzureBenchmarkEvaluator` uploads
models to the same Azure blob store and adds a row to the status table. This triggers remote
instances of the [runner.py](azure/runner.py) script to process these new models on an attached
Qualcomm device. Optionally, some of the work can be done in the cloud using a Kubernetes cluster;
this includes model quantization and accuracy testing using the ONNX runtime. The workflow looks
like this:
![snpe](images/snpe.png)
Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
the `find_work_prioritized` function in the runner. This script is completely restartable, and can distribute the work
across multiple instances of the runner script. Each instance will pick up where a previous one left off based on what
it finds in your Azure status table. The prioritization maps to the columns of the status table as follows:
Each instance of `runner.py` looks for work, and executes it in priority order where the
prioritization is defined by the `find_work_prioritized` function in the runner. This script is
completely restartable, and can distribute the work across multiple instances of the runner script.
Each instance will pick up where a previous one left off based on what it finds in your Azure status
table. The prioritization maps to the columns of the status table as follows:
1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30


@@ -1,17 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
def calc_pareto_frontier(points):
""" Given an array of points where the first 2 coordinates define a 2D point
return a list of array indexes that define the pareto frontier for these points """
return a sorted version of those points and a list of array indexes into that
sorted list that define the pareto frontier for these points """
points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = []
pareto += [0]
p1 = points[0]
for i in range(1, len(points)):
p2 = points[i]
p1 = sorted[0]
for i in range(1, len(sorted)):
p2 = sorted[i]
if p2[1] > p1[1]:
pareto += [i]
p1 = p2
return pareto
return (sorted, pareto)
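With the new signature the caller gets back the sorted points plus frontier indexes into that sorted array. A quick usage sketch with made-up values follows; the import path for the helper is an assumption:

# Usage sketch for the updated helper; the module path below is an assumption.
import numpy as np
from aml.util.pareto import calc_pareto_frontier

points = np.array([[1.2, 0.74], [2.0, 0.70], [3.1, 0.81], [4.5, 0.78]])  # made-up (latency, val_iou) pairs
sorted_points, pareto = calc_pareto_frontier(points)
frontier = sorted_points[pareto]
# frontier == [[1.2, 0.74], [3.1, 0.81]]: each frontier point improves the second
# coordinate relative to every point with a smaller first coordinate.
print(frontier)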


@@ -91,6 +91,7 @@ aml:
- matplotlib
- mldesigner
- mlflow
- tqdm
- tensorwatch
- torch
- torchvision


@@ -4,12 +4,26 @@ import argparse
import sys
from archai.discrete_search.api import ArchaiModel
from archai.common.config import Config
from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
from search_space.hgnet import HgnetSegmentationSearchSpace
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store
def reset_dlc(store, experiment_name, entity):
""" Reset the qualcomm dlc files and associated metrics for the given entity."""
changed = False
name = entity['name']
prefix = f'{experiment_name}/{name}'
print(f"Resetting .dlc files for model {name}")
store.delete_blobs(prefix, 'model.dlc')
store.delete_blobs(prefix, 'model.quant.dlc')
for k in ['mean', 'macs', 'params', 'stdev', 'total_inference_avg', 'error', 'f1_1k', 'f1_10k', 'f1_1k_f', 'f1_onnx', 'pipeline_id']:
if k in entity:
del entity[k]
changed = True
if changed:
store.update_status_entity(entity)
def main():
# input and output arguments
parser = argparse.ArgumentParser(description="Runs Snapdragon F1 scoring on the final fully trained models produced by train_pareto.py.")
@@ -23,7 +37,12 @@ def main():
metric_key = 'final_val_iou'
search_config = config['search']
ss_config = search_config['search_space']
ss_params = ss_config['params']
in_channels = ss_params['in_channels']
img_size = ss_params['img_size']
target_config = search_config.get('target', {})
# change the metric key to the one used for Snapdragon F1 scoring
target_config['metric_key'] = 'f1_1k'
target_name = target_config.pop('name', 'cpu')
device_evaluator = None
@@ -38,28 +57,23 @@ def main():
fully_trained += [e]
if len(fully_trained) == 0:
print(f"No fully trained models found with required metric '{metric_key}'")
print(f"No 'complete' models found with required metric '{metric_key}'")
sys.exit(1)
# the RemoteAzureBenchmarkEvaluator only needs the archid actually, doesn't need the nn.Module.
models = []
for i in fully_trained:
id = e['name']
e['status'] = 'preparing'
for e in fully_trained:
name = e['name']
# if this has not been F1 scored yet then add it to our list.
if 'benchmark_only' in e:
del e['benchmark_only']
store.update_status_entity(e)
models += [ArchaiModel(None, archid=id[3:])]
models += [ArchaiModel(None, archid=name[3:])]
# make sure we re-quantize the new fully trained model.
reset_dlc(store, experiment_name, e)
# kick off remote device training without the benchmark_only flag so we get the
# F1 scores for these fully trained models. Note the above results_path ensures the trained
# models are uploaded back to our models blob store.
search_space = HgnetSegmentationSearchSpace(
seed=42, # not important in this case.
**ss_config.get('params', {}),
)
input_shape = (1, search_space.in_channels, *search_space.img_size[::-1])
input_shape = (1, in_channels, *img_size[::-1])
device_evaluator = RemoteAzureBenchmarkEvaluator(
input_shape=input_shape,
store=store,


@@ -28,8 +28,8 @@ def main():
config = Config(args.config, resolve_env_vars=True)
aml_config = config['aml']
store = configure_store(aml_config)
evaluator = AmlPartialTrainingEvaluator(config, args.output, args.epochs, args.timeout)
output_path = Path(os.path.realpath(args.output))
evaluator = AmlPartialTrainingEvaluator(config, output_path, args.epochs, args.timeout)
store = evaluator.store
experiment_name = aml_config['experiment_name']
@@ -54,9 +54,7 @@ def main():
print(f"No models found with required metrics '{metric_key}' and '{target_metric_key}'")
sys.exit(1)
points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = calc_pareto_frontier(sorted)
sorted, pareto = calc_pareto_frontier(points)
print(f'Found {len(pareto)} models on pareto frontier')
# change the key so the evaluator updates a different field this time and