Mirror of https://github.com/microsoft/archai.git
Add script to do full training of final set of models for the face segmentation task (#238)
* Add a --test option that runs only the data prep step, to verify the environment is working.
* Force train.py to grab the lock on the row (removing a rare failure case).
* Fix SNPE Kubernetes scaling using the anti-node affinity pattern.
* Publish new docker image.
* Add mlflow integration to train.py.
* Add script that does the full training pipeline for the final pareto models.
* Switch to bokeh to get nice tooltips on each dot in the scatter plot.
* Add axis titles.
* Add device F1 scoring to train_pareto.
* Add more to readmes.
* Add image.
* Add helper script to do final F1 scoring on Qualcomm devices.
* Fix lint errors.
* Fix bugs.
* Rev environment version.
* Fix lint error.
* Rename snp_test script and fix bugs.
* Add iteration 20.
* Fix bug.
* Add gif animations.
* Fix bugs in snp_test.
* Fix bugs - snp_test needs to reset the .dlc files.
* Make loop.sh executable.
* Only reset the models we are actually going to test.
* Add final SNPE F1 score chart.
* Improve calc_pareto_frontier helper.
* Show final dots that fell off the pareto frontier as gray.
* Full training is complete; these are the final results.
Parent: bff0b3eb49
Commit: f807260cf4
@@ -167,3 +167,5 @@ android-ndk-r25c-linux.zip
tasks/face_segmentation/aml/docker/quantizer/quantizer.yaml
tasks/face_segmentation/.vscode/launch.json
tasks/face_segmentation/conda.yaml
tasks/face_segmentation/aml/notebooks/*.gif
tasks/face_segmentation/aml/notebooks/*.png
@@ -48,6 +48,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
overwrite: Whether to overwrite existing models.
max_retries: Maximum number of retries in `fetch_all`.
retry_interval: Interval between each retry attempt.
reset: Whether to reset the metrics.
onnx_export_kwargs: Dictionary containing key-value arguments for `torch.onnx.export`.
verbose: Whether to print debug messages.
"""
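As a usage note, here is a hedged sketch of constructing the evaluator with the options documented above. `input_shape` and `store` mirror the call shown near the end of this diff; `experiment_name` and `metric_key` are inferred from the attributes the class uses in later hunks, and every concrete value below is illustrative rather than taken from the commit.

```python
# Illustrative only -- the values, experiment name, and metric key are assumptions.
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store

store = configure_store(aml_config)     # aml_config comes from the experiment's loaded Config

evaluator = RemoteAzureBenchmarkEvaluator(
    input_shape=(1, 3, 256, 256),       # example NCHW shape; the real one comes from the search space config
    store=store,
    experiment_name="facesynthetics",   # hypothetical experiment name
    metric_key="mean",                  # hypothetical metric column
    overwrite=True,
    max_retries=20,
    retry_interval=120,
    reset=False,
    verbose=True,
)
```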
@@ -99,7 +100,7 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
print(f"Entry for {archid} already exists with {self.metric_key} = {value}")
return
else:
# complete but missing the mean, so reset the benchmark metrics so we can try again.
# force quantization to happen again in case the model has been retrained.
self._reset(entity)
else:
# job is still running, let it continue
@@ -111,8 +112,8 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
entity = self.store.get_status(archid)  # this is a get or create operation.
if self.benchmark_only:
entity["benchmark_only"] = 1
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
elif 'benchmark_only' in entity:
del entity['benchmark_only']
self.store.update_status_entity(entity)  # must be an update, not a merge.
self.store.lock_entity(entity, "uploading")
@@ -134,14 +135,20 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
)

self.store.upload_blob(f'{self.experiment_name}/{archid}', file_name, "model.onnx")
entity["model_date"] = self.store.get_utc_date()
entity["model_name"] = "model.onnx"
entity["status"] = "new"
except Exception as e:
entity["error"] = str(e)
entity["status"] = "error"
else:
# then the blob store must already have a model.onnx file!
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}')
if 'model.onnx' not in blobs:
entity["error"] = "model.onnx is missing"
blobs = self.store.list_blobs(f'{self.experiment_name}/{archid}/model.onnx')
if len(blobs) < 1:
print(f"model.onnx is missing for architecture {archid}")
return
else:
entity['status'] = 'ready'

self.store.unlock_entity(entity)
self.archids.append(archid)
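Because the hunk above lost its indentation in this view, here is an indicative reconstruction of the upload flow it describes. The store and entity calls are the ones shown in the diff; the branch condition and method shape are assumptions, so treat this as a readable sketch rather than the file's exact code.

```python
# Sketch of the flow shown above (not the repository's exact method): lock the status
# row while uploading, record success or failure, then unlock and remember the archid.
def _upload_and_track(self, archid: str, file_name: str) -> None:
    entity = self.store.get_status(archid)            # get-or-create the status row
    self.store.lock_entity(entity, "uploading")
    if not self.benchmark_only:                       # assumed condition separating the two paths
        try:
            self.store.upload_blob(f"{self.experiment_name}/{archid}", file_name, "model.onnx")
            entity["model_date"] = self.store.get_utc_date()
            entity["model_name"] = "model.onnx"
            entity["status"] = "new"
        except Exception as e:
            entity["error"] = str(e)
            entity["status"] = "error"
    else:
        # benchmark-only mode: the blob store must already contain a model.onnx file.
        blobs = self.store.list_blobs(f"{self.experiment_name}/{archid}/model.onnx")
        if len(blobs) < 1:
            print(f"model.onnx is missing for architecture {archid}")
            return
        entity["status"] = "ready"
    self.store.unlock_entity(entity)
    self.archids.append(archid)
```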
@@ -129,7 +129,7 @@ def main(output_dir: Path, experiment_name: str, seed: int, data_prep_only: bool
ml_client,
image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
conda_file="conda.yaml",
version='1.0.25')
version='1.0.26')
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"

# Register the datastore with AML
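The hunk only shows the keyword arguments of the repo's environment helper with the version bumped from `1.0.25` to `1.0.26`. For context, here is a sketch of what registering such an environment looks like with the Azure ML v2 SDK directly; the environment name and workspace identifiers are placeholders, and the repository wraps this in its own helper rather than calling the SDK exactly this way.

```python
# Illustrative sketch using the azure-ai-ml SDK (placeholders marked; not the repo's helper).
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace>",
)

archai_job_env = Environment(
    name="archai-face-segmentation",   # hypothetical environment name
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
    conda_file="conda.yaml",
    version="1.0.26",
)
archai_job_env = ml_client.environments.create_or_update(archai_job_env)
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
```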
@@ -718,8 +718,10 @@ def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_qua
# other jobs were add/completed in parallel while this was executing.
priority, entity = queue.dequeue()
name = entity['name']
locked = False
try:
entity = lock_job(entity)
locked = True
benchmark_only_flag = is_benchmark_only(entity, benchmark_only)
gc.collect()
tracemalloc.start()
@@ -741,7 +743,8 @@ def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_qua
else:
# bug in the script somewhere... don't leave the node locked.
log_error(error_type, value, stack)
unlock_job(entity)
if locked:
unlock_job(entity)
sys.exit(1)

time.sleep(10)  # give other machines a chance to grab work so we don't get stuck in retry loops.
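The change above replaces an unconditional `unlock_job(entity)` with one guarded by the `locked` flag that is set only after `lock_job` succeeds. A minimal sketch of the resulting pattern follows; `lock_job`, `unlock_job`, and `log_error` are the runner helpers named in the hunks, while `process_entity` is a hypothetical stand-in for the actual benchmarking work.

```python
import sys

def process_entity(entity):
    """Placeholder for the work the monitor loop performs on a locked row."""
    ...

locked = False
try:
    entity = lock_job(entity)          # runner helper shown in the hunk above
    locked = True
    process_entity(entity)
except Exception:
    error_type, value, stack = sys.exc_info()
    log_error(error_type, value, stack)
    if locked:
        unlock_job(entity)             # only unlock if we actually acquired the lock
    sys.exit(1)
```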
Binary file not shown.
After | Width: | Height: | Size: 295 KiB |
Binary file not shown.
After | Width: | Height: | Size: 22 KiB |
Binary file not shown.
After | Width: | Height: | Size: 218 KiB |
File diff suppressed because one or more lines are too long
@@ -10,7 +10,7 @@ The code is organized into:
of selected models on a GPU cluster in Azure ML.

1. [SNPE Device](snpe/readme.md) code that uses [Microsoft
Olive](https://github.com/microsoft/olive) to drive the the
Olive](https://github.com/microsoft/olive) to drive the
[Qualcomm Neural Processing SDK](https://developer.qualcomm.com/software/qualcomm-neural-processing-sdk) to talk
to the device, convert ONNX models to .dlc, quantize them, and test them on one or more
[Qualcomm 888 dev kits](https://developer.qualcomm.com/hardware/snapdragon-888-hdk).
@@ -27,23 +27,42 @@ things up a lot.
1. [Notebook](notebooks/results.ipynb) a simple Jupyter notebook for visualizing the
results found in your Azure table.

The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is a snapshot after 10 iterations are completed where the darker colors
are the early iterations and the brighter colors are the most recent iterations. The pareto frontier
models are in yellow. This clearly shows the general trend of model improvement over time on each new
iteration.
## Results

![snapshot](images/iteration10.png)
The jupyter notebook can be used to visualize the results of the search iterations as they are
happening. The following is an animation of the complete 20 search iterations where the darker
colors are the early iterations and the brighter colors are the most recent iterations. The pareto
frontier models are highlighted in yellow. This clearly shows the general trend of model improvement
over time on each new iteration.

![snapshot](images/animation.gif)

The following animation shows only the pareto models from each search iteration. These are the
models that get mutated during the evolutionary pareto search; all the other models have lower
validation scores and are discarded:

![snapshot](images/pareto.gif)

When the search completes you can run [train_pareto.py](../../train_pareto.py) to fully train the
pareto models, then you can run [snp_test.py](../../snp_test.py) to compute the F1 scores for these
fully trained models on your Qualcomm hardware. The following is a plot you can get from the
notebook showing the final results. Notice that the Qualcomm hardware mostly matches our earlier
`val_iou` pareto curve, but not exactly. The dots shown in gray have fallen off the pareto frontier.
This is why it is always good to test your models on the target hardware. Even better if that
testing can be done in the search loop so that the search finds models that work well on the target
hardware, as we have done in this face segmentation example:

![errors](images/final_results.png)

## Workflow

The overall workflow begins with the top level [aml.py](../../aml.py) script which
starts with an Archai Search that contains an `AmlPartialTrainingEvaluator` and a
`RemoteAzureBenchmarkEvaluator`. The remote benchmark evaluator performs inference latency testing
on Qualcomm hardware. The `AmlPartialTrainingEvaluator` then kicks off one new Azure ML
training pipeline for each batch of new model architectures that need to be partially trained, it
stores the validation IOU results in an Azure blob store and an Azure table so the search can get
those results and use them to figure out the next iteration of the search algorithm:
The overall workflow begins with the top level [aml.py](../../aml.py) script which starts with an
Archai Search that contains an `AmlPartialTrainingEvaluator` and a `RemoteAzureBenchmarkEvaluator`.
The remote benchmark evaluator performs inference latency testing on Qualcomm hardware. The
`AmlPartialTrainingEvaluator` then kicks off one new Azure ML training pipeline for each batch of
new model architectures that need to be partially trained; it stores the validation IOU results in
an Azure blob store and an Azure table so the search can get those results and use them to figure
out the next iteration of the search algorithm:

![system](images/system.png)
@@ -51,18 +70,20 @@ See [AML Training Readme](training/readme.md) for more information.

## Remote Inference Testing

The remote inference testing workflow looks like this, the `RemoteAzureBenchmarkEvaluator` uploads models to the same
Azure blob store, and adds a row to the status table. This triggers remote instances of the [runner.py](azure/runner.py) script
to process these new models on an attached Qualcomm device. Optionally some of the work can be done in the cloud
using a Kubernetes cluster, this includes model quantization and accuracy testing using the ONNX runtime.
The workflow looks like this:
The remote inference testing workflow looks like this: the `RemoteAzureBenchmarkEvaluator` uploads
models to the same Azure blob store, and adds a row to the status table. This triggers remote
instances of the [runner.py](azure/runner.py) script to process these new models on an attached
Qualcomm device. Optionally some of the work can be done in the cloud using a Kubernetes cluster;
this includes model quantization and accuracy testing using the ONNX runtime. The workflow looks
like this:

![snpe](images/snpe.png)

Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
the `find_work_prioritized` function in the runner. This script is completely restartable, and can distribute the work
across multiple instances of the runner script. Each instance will pick up where a previous one left off based on what
it finds in your Azure status table. The prioritization maps to the columns of the status table as follows:
Each instance of `runner.py` looks for work, and executes it in priority order where the
prioritization is defined by the `find_work_prioritized` function in the runner. This script is
completely restartable, and can distribute the work across multiple instances of the runner script.
Each instance will pick up where a previous one left off based on what it finds in your Azure status
table. The prioritization maps to the columns of the status table as follows:

1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30
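To make the priority ordering concrete, here is a small illustrative sketch. The numeric priorities (20 and 30) come from the list above, and the `priority, entity` shape mirrors the `priority, entity = queue.dequeue()` call seen in the `monitor` hunk earlier in this diff; the entity data itself is made up.

```python
# Illustrative only: lower numbers are processed first, so .dlc conversion and
# quantization work (priority 20) is picked up before on-device benchmarking (priority 30).
import heapq

work = []
heapq.heappush(work, (30, {"name": "id_abc123", "needs": "total_inference_avg"}))
heapq.heappush(work, (20, {"name": "id_def456", "needs": "macs"}))

while work:
    priority, entity = heapq.heappop(work)
    print(priority, entity["needs"])   # prints the macs job first, then the benchmark job
```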
@@ -1,17 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np


def calc_pareto_frontier(points):
""" Given an array of points where the first 2 coordinates define a 2D point
return a list of array indexes that define the pareto frontier for these points """
return a sorted version of those points and a list of array indexes into that
sorted list that define the pareto frontier for these points """
points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = []
pareto += [0]
p1 = points[0]
for i in range(1, len(points)):
p2 = points[i]
p1 = sorted[0]
for i in range(1, len(sorted)):
p2 = sorted[i]
if p2[1] > p1[1]:
pareto += [i]
p1 = p2

return pareto
return (sorted, pareto)
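A small usage sketch of the new return shape follows. The input points are made up; the first column is the axis used for sorting and the second is the value being maximized, matching the `p2[1] > p1[1]` test in the function above.

```python
# Hypothetical (latency, val_iou) pairs; calc_pareto_frontier is the helper defined above.
points = [(12.0, 0.81), (8.0, 0.78), (15.0, 0.85), (9.0, 0.70)]
sorted_points, pareto = calc_pareto_frontier(points)
frontier = [sorted_points[i] for i in pareto]   # frontier points, in ascending x order
```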
@@ -91,6 +91,7 @@ aml:
- matplotlib
- mldesigner
- mlflow
- tqdm
- tensorwatch
- torch
- torchvision
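The `mlflow` dependency added here supports the "Add mlflow integration to train.py" item in the commit message. The snippet below is not the repository's training code, just a minimal sketch of that style of per-epoch metric logging; the `validate` helper and epoch count are placeholders.

```python
# Minimal mlflow logging sketch (placeholder validation step, not the repo's API).
import mlflow

def validate(epoch: int) -> float:
    """Placeholder for the real validation step in train.py."""
    return 0.5 + 0.01 * epoch

with mlflow.start_run():
    mlflow.log_param("epochs", 30)
    for epoch in range(30):
        mlflow.log_metric("val_iou", validate(epoch), step=epoch)
```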
@@ -4,12 +4,26 @@ import argparse
import sys
from archai.discrete_search.api import ArchaiModel
from archai.common.config import Config
from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
from search_space.hgnet import HgnetSegmentationSearchSpace
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from aml.util.setup import configure_store


def reset_dlc(store, experiment_name, entity):
""" Reset the qualcomm dlc files and associated metrics for the given entity."""
changed = False
name = entity['name']
prefix = f'{experiment_name}/{name}'
print(f"Resetting .dlc files for model {name}")
store.delete_blobs(prefix, 'model.dlc')
store.delete_blobs(prefix, 'model.quant.dlc')
for k in ['mean', 'macs', 'params', 'stdev', 'total_inference_avg', 'error', 'f1_1k', 'f1_10k', 'f1_1k_f', 'f1_onnx', 'pipeline_id']:
if k in entity:
del entity[k]
changed = True
if changed:
store.update_status_entity(entity)


def main():
# input and output arguments
parser = argparse.ArgumentParser(description="Runs Snapdragon F1 scoring on the final fully trained models produced by train_pareto.py.")
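A hedged usage sketch for `reset_dlc` follows. `configure_store` and `experiment_name` mirror names used elsewhere in this diff, while the config path and the `get_all_status_entities` helper on the returned store are assumptions; check your archai version before relying on them. The script's own `main` (shown in the later hunks) wires this up against the fully trained pareto models.

```python
# Usage sketch (not the script's exact code): clear quantized artifacts for every
# completed model before re-scoring, so the .dlc files get regenerated.
from archai.common.config import Config
from aml.util.setup import configure_store

config = Config("confs/aml_search.yaml", resolve_env_vars=True)   # path is illustrative
aml_config = config["aml"]
experiment_name = aml_config["experiment_name"]
store = configure_store(aml_config)

for entity in store.get_all_status_entities():   # assumed store helper
    if entity.get("status") == "complete":
        reset_dlc(store, experiment_name, entity)
```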
@@ -23,7 +37,12 @@ def main():
metric_key = 'final_val_iou'
search_config = config['search']
ss_config = search_config['search_space']
ss_params = ss_config['params']
in_channels = ss_params['in_channels']
img_size = ss_params['img_size']
target_config = search_config.get('target', {})
# change the metric key to the one used for Snapdragon F1 scoring
target_config['metric_key'] = 'f1_1k'
target_name = target_config.pop('name', 'cpu')
device_evaluator = None
@@ -38,28 +57,23 @@ def main():
fully_trained += [e]

if len(fully_trained) == 0:
print(f"No fully trained models found with required metric '{metric_key}'")
print(f"No 'complete' models found with required metric '{metric_key}'")
sys.exit(1)

# the RemoteAzureBenchmarkEvaluator only needs the archid actually, doesn't need the nn.Module.
models = []
for i in fully_trained:
id = e['name']
e['status'] = 'preparing'
for e in fully_trained:
name = e['name']
# if this has not been F1 scored yet then add it to our list.
if 'benchmark_only' in e:
del e['benchmark_only']
store.update_status_entity(e)
models += [ArchaiModel(None, archid=id[3:])]
models += [ArchaiModel(None, archid=name[3:])]
# make sure we re-quantize the new fully trained model.
reset_dlc(store, experiment_name, e)

# kick off remote device training without the benchmark_only flag so we get the
# F1 scores for these fully trained models. Note the above results_path ensures the trained
# models are uploaded back to our models blob store.
search_space = HgnetSegmentationSearchSpace(
seed=42,  # not important in this case.
**ss_config.get('params', {}),
)

input_shape = (1, search_space.in_channels, *search_space.img_size[::-1])
input_shape = (1, in_channels, *img_size[::-1])
device_evaluator = RemoteAzureBenchmarkEvaluator(
input_shape=input_shape,
store=store,
@@ -28,8 +28,8 @@ def main():
config = Config(args.config, resolve_env_vars=True)
aml_config = config['aml']
store = configure_store(aml_config)

evaluator = AmlPartialTrainingEvaluator(config, args.output, args.epochs, args.timeout)
output_path = Path(os.path.realpath(args.output))
evaluator = AmlPartialTrainingEvaluator(config, output_path, args.epochs, args.timeout)
store = evaluator.store

experiment_name = aml_config['experiment_name']
@@ -54,9 +54,7 @@ def main():
print(f"No models found with required metrics '{metric_key}' and '{target_metric_key}'")
sys.exit(1)

points = np.array(points)
sorted = points[points[:, 0].argsort()]
pareto = calc_pareto_frontier(sorted)
sorted, pareto = calc_pareto_frontier(points)
print(f'Found {len(pareto)} models on pareto frontier')

# change the key so the evaluator updates a different field this time and