Add Azure ML running to the face segmentation task. (#217)

* add code owners

* initial commit, beginnings of AML version of face synthetics search pipeline.

* Add download_and_extract_zip
Add download capability to FaceSyntheticsDataset
Fix face segmentation data prep script.

* fix bugs

* cleanup launch.json

* cleanup launch.json
add download capability to FaceSyntheticsDataset
add download_and_extract_zip helper

* fix file count test

* work in progress

* work in progress

* unify snpe status table and aml training table.

* fix experiment referencing

* fix experiment referencing

* work in progress

* fix complete status

* fix bugs

* fix bug

* fix metric key, we have 2, one for remote snpe, and another for aml training pipelines.

* pass seed through to the search.py script.

* fix use of AzureMLOnBehalfOfCredential

* fix bugs

* fix bugs

* publish new image

* fix bugs

* fix bugs

* fix bug

* maerge

* revert

* new version

* fix bugs

* rename the top level folder from 'snpe' to 'aml' and move all AML code into this folder except the top level entry point 'aml.py'
make the keys returned from the JobCompletionMonitor wait method configurable
Rename AmlPartialTrainingEvaluator and make it restartable.
Turn off save_pareto_model_weights
Remove redundant copy of JobCompletionMonitor

* rev the versions.

* updates to readme information.

* only inference testing targets are 'cpu' and 'snp', trigger the aml partial training by a different key in the config file.

* add iteration info

* new version.

* fix ordering of results from AmlPartialTrainingEvaluator

* change AML batch size default to 64 for faster training
don't store MODEL_STORAGE_CONNECTION_STRING

* Fix bug in merge_status_entity, add more unit test coverage

* new version

* Store training time in status table.

* improve diagram.

* save iteration in status table.

* pick up new version of archai to fix randomness bug in the EvolutionParetoSearch so that these search jobs are restartable.
This commit is contained in:
Chris Lovett 2023-04-21 10:47:58 -07:00 коммит произвёл GitHub
Родитель 4fc5a6d068
Коммит aae38db1a5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
73 изменённых файлов: 1332 добавлений и 511 удалений

3
.gitignore поставляемый
Просмотреть файл

@ -164,5 +164,6 @@ output/
snpe-2.5.0.4052.zip
android-ndk-r25b-linux.zip
android-ndk-r25c-linux.zip
tasks/face_segmentation/snpe/docker/quantizer/quantizer.yaml
tasks/face_segmentation/aml/docker/quantizer/quantizer.yaml
tasks/face_segmentation/.vscode/launch.json
tasks/face_segmentation/conda.yaml

Просмотреть файл

@ -30,4 +30,6 @@
/research/lm_eval_harness @gugarosa
# Tasks
/tasks/face_segmentation @piero2c
/tasks/face_segmentation/aml @lovettchris
/tasks/text_generation @gugarosa

Просмотреть файл

@ -56,8 +56,12 @@ class JobCompletionMonitor:
failed += 1
self.store.update_status_entity(e)
else:
msg = f"{e['val_acc']}" if 'val_acc' in e else ''
print(f'Training job {id} completed with validation accuracy: {msg}')
if len(self.metric_keys) > 0 and self.metric_keys[0] in e:
key = self.metric_keys[0]
metric = e[key]
print(f'Training job {id} completed with {key} = {metric}')
else:
print(f'Training job {id} completed')
if len(waiting) == 0:
break
@ -131,12 +135,16 @@ def main():
parser.add_argument('--config', help='bin hexed config json info for MLClient')
parser.add_argument('--timeout', type=int, help='pipeline timeout in seconds (default 1 hour)', default=3600)
parser.add_argument('--model_path', required=True, help='mounted path containing the pending.json file')
parser.add_argument('--output', required=True, help='folder to write the results to)')
parser.add_argument('--output', required=True, help='folder to write the results to')
parser.add_argument('--metrics', type=str, help='metrics to return from the azure table')
args = parser.parse_args()
output = args.output
timeout = args.timeout
model_path = args.model_path
metrics = []
if args.metrics:
metrics = [x.strip() for x in args.metrics.split(',')]
print(f"Monitor running with model_path={model_path}")
if not os.path.isdir(model_path):
@ -177,7 +185,7 @@ def main():
store = ArchaiStore(storage_account_name, storage_account_key)
monitor = JobCompletionMonitor(store, ml_client, timeout=timeout)
monitor = JobCompletionMonitor(store, ml_client, metrics, timeout=timeout)
results = monitor.wait(model_ids)
if output is not None:
# save the results with updated validation accuracies to models.json

Просмотреть файл

@ -455,7 +455,8 @@
" \"Full Training Pipeline\", hex_config, scripts_dir, gpu_compute_name, \n",
" datastore_path, output_path, experiment_name, environment_name, full_epochs, save_models=True)\n",
" \n",
"monitor_component = make_monitor_command(hex_config, scripts_dir, results_path, environment_name, timeout)"
"keys = ['val_acc'] \n",
"monitor_component = make_monitor_command(hex_config, scripts_dir, results_path, environment_name, keys, timeout)"
]
},
{

Просмотреть файл

@ -11,7 +11,7 @@ from archai.discrete_search.api.model_evaluator import AsyncModelEvaluator
from azure.ai.ml import MLClient, command, Input, Output, dsl
from archai.common.store import ArchaiStore
from shutil import copyfile
from monitor import JobCompletionMonitor
from archai.common.monitor import JobCompletionMonitor
from training_pipeline import start_training_pipeline
from utils import copy_code_folder
@ -73,7 +73,9 @@ class AmlTrainingValAccuracy(AsyncModelEvaluator):
print(f'AmlTrainingValAccuracy: Started training pipeline: {job_id}')
# wait for all the parallel training jobs to finish
monitor = JobCompletionMonitor(self.store, self.ml_client, job_id, self.timeout)
metric_key = 'vac_acc'
keys = [metric_key]
monitor = JobCompletionMonitor(self.store, self.ml_client, keys, job_id, self.timeout)
results = monitor.wait(model_names)
# save the results to the output folder (which is mapped by the AML pipeline to our
@ -93,7 +95,7 @@ class AmlTrainingValAccuracy(AsyncModelEvaluator):
# not so good.
accuracies = []
for i, m in enumerate(results['models']):
val_acc = m['val_acc']
val_acc = m[metric_key]
accuracies += [val_acc]
print(f'AmlTrainingValAccuracy: fetch_all returning : {accuracies}')

Просмотреть файл

@ -46,12 +46,13 @@ def make_train_model_command(output_path, code_dir, environment_name, id,
)
def make_monitor_command(hex_config, code_dir, results_uri, environment_name, timeout=3600):
def make_monitor_command(hex_config, code_dir, results_uri, environment_name, metrics=[], timeout=3600):
""" This command waits up to some timeout for all the given training jobs to complete
and returns the validation accuracy results """
fixed_args = f'--config "{hex_config}" ' + \
f'--timeout {timeout} '
f'--timeout {timeout} ' + \
f'--metrics {",".join(metrics)}'
# this node depends on training and is therefore also not deterministic, so we set is_deterministic=False
# which disables pipeline caching.

Просмотреть файл

@ -1,188 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import time
import argparse
import json
import os
from typing import List, Dict
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.identity import DefaultAzureCredential
from archai.common.store import ArchaiStore
from azure.ai.ml import MLClient
from azure.ai.ml import dsl
class JobCompletionMonitor:
""" This helper class uses the ArchaiStore to monitor the status of some long running
training operations and the status of the Azure ML pipeline those jobs are running in
and waits for them to finish (either successfully or with a failure)"""
def __init__(self, store : ArchaiStore, ml_client : MLClient, pipeline_id=None, timeout=3600):
"""
Initialize a JobCompletionMonitor instance.
:param store: an instance of ArchaiStore to monitor the status of some long running training operations
:param ml_client: an instance of MLClient to check the status of the Azure ML pipeline those jobs are running in
:param pipeline_id: (optional) the ID of the Azure ML pipeline to monitor, if not provided we can get this from the ArchaiStore.
:param timeout: (optional) the timeout in seconds
"""
self.store = store
self.ml_client = ml_client
self.timeout = timeout
self.pipeline_id = pipeline_id
def wait(self, model_ids: List[str]) -> List[Dict[str, str]]:
"""
Wait for all the training jobs to finish and return a list of dictionaries
containing details about each model, including their training validation accuracies.
:param model_ids: a list of training job IDs
:return: a list of dictionaries containing details about each model
"""
completed = {}
waiting = list(model_ids)
start = time.time()
failed = 0
while len(waiting) > 0:
for i in range(len(waiting) - 1, -1, -1):
id = waiting[i]
e = self.store.get_status(id)
if self.pipeline_id is None and 'pipeline_id' in e:
self.pipeline_id = e['pipeline_id']
if e is not None and 'status' in e and (e['status'] == 'completed' or e['status'] == 'failed'):
del waiting[i]
completed[id] = e
if e['status'] == 'failed':
error = e['error']
print(f'Training job {id} failed with error: {error}')
failed += 1
self.store.update_status_entity(e)
else:
msg = f"{e['val_acc']}" if 'val_acc' in e else ''
print(f'Training job {id} completed with validation accuracy: {msg}')
if len(waiting) == 0:
break
# check the overall pipeline status just in case training jobs failed to even start.
pipeline_status = None
try:
if self.pipeline_id is not None:
train_job = self.ml_client.jobs.get(self.pipeline_id)
if train_job is not None:
pipeline_status = train_job.status
except Exception as e:
print(f'Error getting pipeline status for pipeline {self.pipeline_id}: {e}')
if pipeline_status is not None:
if pipeline_status == 'Completed':
# ok, all jobs are done, which means if we still have waiting tasks then they failed to
# even start.
break
elif pipeline_status == 'Failed' or pipeline_status == 'Canceled':
for id in waiting:
e = self.store.get_status(id)
if 'error' not in e:
e['error'] = f'Pipeline {pipeline_status}'
if 'status' not in e or e['status'] != 'completed':
e['status'] = pipeline_status.lower()
self.store.update_status_entity(e)
raise Exception('Partial Training Pipeline failed')
if len(waiting) > 0:
if time.time() > self.timeout + start:
break
print("AmlTrainingValAccuracy: Waiting 20 seconds for partial training to complete...")
time.sleep(20)
# awesome - they all completed!
if len(completed) == 0:
if time.time() > self.timeout + start:
raise Exception(f'Partial Training Pipeline timed out after {self.timeout} seconds')
else:
raise Exception('Partial Training Pipeline failed to start')
if failed == len(completed):
raise Exception('Partial Training Pipeline failed all jobs')
# stitch together the models.json file from our status table.
print('Top model results: ')
models = []
for id in model_ids:
row = {'id': id}
e = completed[id] if id in completed else {}
for key in ['nb_layers', 'kernel_size', 'hidden_dim', 'val_acc', 'job_id', 'status', 'error', 'epochs']:
if key in e:
row[key] = e[key]
models += [row]
results = {
'models': models
}
timespan = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
print(f'Training: Distributed training completed in {timespan} ')
print(f'Training: returning {len(results)} results:')
print(json.dumps(results, indent=2))
return results
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--config', help='bin hexed config json info for MLClient')
parser.add_argument('--timeout', type=int, help='pipeline timeout in seconds (default 1 hour)', default=3600)
parser.add_argument('--model_path', required=True, help='mounted path containing the pending.json file')
parser.add_argument('--output', required=True, help='folder to write the results to)')
args = parser.parse_args()
output = args.output
timeout = args.timeout
model_path = args.model_path
print(f"Monitor running with model_path={model_path}")
if not os.path.isdir(model_path):
raise Exception("### directory not found")
models_file = os.path.join(model_path, 'pending.json')
if not os.path.isfile(models_file):
raise Exception("### 'pending.json' not found in --model_path")
models = json.load(open(models_file))
model_ids = [m['id'] for m in models['models']]
identity = AzureMLOnBehalfOfCredential()
if args.config:
print("Using AzureMLOnBehalfOfCredential...")
workspace_config = str(bytes.fromhex(args.config), encoding='utf-8')
print(f"Config: {workspace_config}")
config = json.loads(workspace_config)
else:
print("Using DefaultAzureCredential...")
config_file = "../.azureml/config.json"
print(f"Config: {config_file}")
config = json.load(open(config_file, 'r'))
identity = DefaultAzureCredential()
subscription = config['subscription_id']
resource_group = config['resource_group']
workspace_name = config['workspace_name']
storage_account_key = config['storage_account_key']
storage_account_name = config['storage_account_name']
ml_client = MLClient(
identity,
subscription,
resource_group,
workspace_name
)
store = ArchaiStore(storage_account_name, storage_account_key)
monitor = JobCompletionMonitor(store, ml_client, timeout=timeout)
results = monitor.wait(model_ids)
if output is not None:
# save the results with updated validation accuracies to models.json
with open(os.path.join(output, 'models.json'), 'w') as f:
f.write(json.dumps(results, indent=2))
if __name__ == "__main__":
main()

Просмотреть файл

@ -11,7 +11,9 @@
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args":["001c3ee6c74e05e63252b1ba3dfc58ca3cbb4f56"]
"args":[
"--search_config", "output/confs/aml_search.yaml"
]
}
]
}

Просмотреть файл

@ -38,7 +38,7 @@ configurations based on the desired target (CPU or Snapdragon processor), `searc
* [cpu_search.yaml](confs/cpu_search.yaml)
* [snp_search.yaml](confs/snp_search.yaml)
Note: to use `snp_search.yaml` you will need to follow the [SNP setup instructions](snpe/readme.md).
Note: to use `snp_search.yaml` you will need to follow the [Azure ML setup instructions](aml/readme.md).
By default, `search.py` will run multiple partial training jobs using Ray (2 jobs per GPU). To change the number of gpus
per job, set `--gpus_per_job`, or use the `--serial_training` flag to disable parallel training jobs altogether.

Просмотреть файл

@ -0,0 +1,204 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from argparse import ArgumentParser
from pathlib import Path
import os
import sys
import yaml
from typing import Optional, Dict
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import UserIdentityConfiguration
from azure.ai.ml import MLClient
from azure.ai.ml import command, Input, Output, dsl
from archai.common.config import Config
import archai.common.azureml_helper as aml_helper
from archai.common.store import ArchaiStore
from archai.common.file_utils import TemporaryFiles
from shutil import copyfile, rmtree
from aml.util.setup import register_datastore, configure_store, create_cluster, copy_code_folder
confs_path = Path(__file__).absolute().parent / 'confs'
def data_prep_component(environment_name, datastore_path):
return command(
name="data_prep",
display_name="Data preparation for training",
description="Downloads the remote dataset to our blob store.",
inputs={
"name": Input(type='string')
},
outputs={
"data": Output(type="uri_folder", path=datastore_path, mode="rw_mount")
},
# The source folder of the component
code='data_prep',
command="""python3 prep_data_store.py \
--path ${{outputs.data}} \
""",
environment=environment_name,
)
def search_component(config, environment_name, seed, modelstore_path, output_path: Path):
# we need a folder containing all the specific code we need here, which is not everything in this repo.
scripts_path = output_path / 'scripts'
os.makedirs(str(scripts_path), exist_ok=True)
config_dir = scripts_path / 'confs'
os.makedirs(str(config_dir), exist_ok=True)
copyfile('search.py', str(scripts_path / 'search.py'))
copyfile('train.py', str(scripts_path / 'train.py'))
copy_code_folder('search_space', str(scripts_path / 'search_space'))
copy_code_folder('training', str(scripts_path / 'training'))
copy_code_folder(os.path.join('aml', 'training'), str(scripts_path / 'aml' / 'training'))
copy_code_folder(os.path.join('aml', 'util'), str(scripts_path / 'aml' / 'util'))
config.save(str(config_dir / 'aml_search.yaml'))
aml_config = config['aml']
timeout = int(aml_config.get('timeout', 3600))
con_str = aml_config['connection_str']
fixed_args = f'--seed {seed} --timeout {timeout} --search_config confs/aml_search.yaml'
return command(
name="search",
display_name="Archai search job",
description="Searches for the best face segmentation model.",
is_deterministic=False,
inputs={
"data": Input(type="uri_folder")
},
outputs={
"results": Output(type="uri_folder", path=modelstore_path, mode="rw_mount")
},
identity=UserIdentityConfiguration(),
# The source folder of the component
code=str(scripts_path),
environment_variables={'MODEL_STORAGE_CONNECTION_STRING': con_str},
command="""python3 search.py \
--dataset_dir ${{inputs.data}} \
--output_dir ${{outputs.results}} \
""" + fixed_args,
environment=environment_name,
)
def main(output_dir: Path, experiment_name: str, seed: int):
if output_dir.exists():
rmtree(str(output_dir))
output_dir.mkdir(parents=True)
# Filters extra args that have the prefix `search_space`
config_file = str(confs_path / 'aml_search.yaml')
config = Config(config_file, resolve_env_vars=True)
aml_config = config['aml']
con_str = aml_config.get('connection_str', '$')
if '$' in con_str:
print("Please set environment variable MODEL_STORAGE_CONNECTION_STRING containing the Azure" +
"storage account connection string for the Azure storage account you want to use to " +
"control this experiment.")
return 1
workspace_name = aml_config['workspace_name']
subscription_id = aml_config['subscription_id']
resource_group_name = aml_config['resource_group']
# extract conda.yaml.
with open('conda.yaml', 'w') as f:
yaml.dump(aml_config['environment'].to_dict(), f)
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
print(f'Using storage account: {storage_account_name}')
ml_client = MLClient(
credential=DefaultAzureCredential(),
subscription_id=subscription_id,
resource_group_name=resource_group_name,
workspace_name=workspace_name
)
print(f'Using workspace "{ml_client.workspace_name}" in resource group "{ml_client.resource_group_name}"')
# Create aml computer clusters
cpu_compute_name = create_cluster(ml_client, aml_config, 'search_cluster')
create_cluster(ml_client, aml_config, 'training_cluster')
archai_job_env = aml_helper.create_environment_from_file(
ml_client,
image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
conda_file="conda.yaml",
version='1.0.20')
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
# Register the datastore with AML
data_store_name = 'datasets'
data_container_name = 'datasets'
model_store_name = 'models'
model_container_name = aml_config.get('blob_container_name', 'models')
# register our azure datastores
results_path = register_datastore(ml_client, model_store_name, model_container_name, storage_account_name, storage_account_key, experiment_name)
datastore_path = register_datastore(ml_client, data_store_name, data_container_name, storage_account_name, storage_account_key, experiment_name)
# save this in the output folder so it can be found by pipeline components.
aml_config['experiment_name'] = experiment_name
aml_config['environment_name'] = environment_name
aml_config['datastore_path'] = datastore_path
aml_config['results_path'] = results_path
# make sure the datasets container exists
store = configure_store(aml_config, data_container_name)
# make sure the models container exists
store = configure_store(aml_config, model_container_name)
with TemporaryFiles() as tmp_files:
filename = tmp_files.get_temp_file()
aml_config['connection_str'] = "${MODEL_STORAGE_CONNECTION_STRING}"
config.save(filename)
aml_config['connection_str'] = con_str
store.upload_blob(f"{experiment_name}/config", filename, 'aml_search.yaml')
@dsl.pipeline(
compute=cpu_compute_name,
description="FaceSynthetics Archai search pipeline",
)
def archai_search_pipeline():
data_prep_job = data_prep_component(environment_name, datastore_path)(
name=experiment_name
)
search_job = search_component(config, environment_name, seed, results_path, output_dir)(
data=data_prep_job.outputs.data
)
return {
"results": search_job.outputs.results
}
pipeline_job = ml_client.jobs.create_or_update(
archai_search_pipeline(),
# Project's name
experiment_name=experiment_name,
)
import webbrowser
webbrowser.open(pipeline_job.services["Studio"].endpoint)
job_name = pipeline_job.name
print(f'Started pipeline: {job_name}')
return 0
if __name__ == '__main__':
parser = ArgumentParser("""This script runs the search in an Azure ML workspace.""")
parser.add_argument('--output_dir', type=Path, help='Output directory for downloading results.', default='output')
parser.add_argument('--experiment_name', default='facesynthetics')
parser.add_argument('--seed', type=int, help='Random seed', default=42)
args = parser.parse_args()
rc = main(args.output_dir, args.experiment_name, args.seed)
sys.exit(rc)

Просмотреть файл

Просмотреть файл

@ -42,11 +42,12 @@ def cleanup_stale_pods(store: ArchaiStore):
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
cleanup_stale_pods(store)

Просмотреть файл

@ -15,15 +15,16 @@ def delete(con_str):
args = parser.parse_args()
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
store.delete_blobs(args.name, args.file)
if not args.file:
store.delete_status(args.name)
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
delete(con_str)
delete(con_str, experiment_name)

Просмотреть файл

@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def download(con_str):
def download(con_str, experiment_name):
parser = argparse.ArgumentParser(
description="Download assets from azure blob store using friendly name.")
parser.add_argument('--name', help='Friendly name of model to download (if not provided it downloads them all')
@ -16,7 +16,7 @@ def download(con_str):
args = parser.parse_args()
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
friendly_name = args.name
if not friendly_name:
friendly_names = [e['name'] for e in store.get_all_status_entities()]
@ -32,8 +32,9 @@ def download(con_str):
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
download(con_str)
download(con_str, experiment_name)

Просмотреть файл

@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def list_models(con_str):
def list_models(con_str, experiment_name):
parser = argparse.ArgumentParser(
description="List all azure blob store assets.")
parser.add_argument('--prefix', type=str, required=True, default=None,
@ -17,14 +17,15 @@ def list_models(con_str):
prefix = args.prefix
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
for blob in store.list_blobs(prefix):
print(blob)
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
list_models(con_str)
list_models(con_str, experiment_name)

Просмотреть файл

Просмотреть файл

@ -12,7 +12,7 @@ named `Connection string`.
In Linux you should use double quotes around the connection string like this:
```
export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=mymodels;AccountKey=...==;EndpointSuffix=core.windows.net"
export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=...EndpointSuffix=core.windows.net"
```
You'll use it a lot so it is handy if you put it in your `~/.profile`.
@ -44,13 +44,14 @@ Then to get the ball rolling create a temp folder and run this:
```
mkdir -p ~/experiment
python ~/git/archai/tasks/face_Segmentation/snpe/azure/runner.py --working ~/experiment
python ~/git/archai/tasks/face_Segmentation/aml/azure/runner.py --working ~/experiment
```
This will monitor the Azure blob store for new work to do, and run those jobs in priority order. If you also provide a
`--device` option pointing to the `adb device` for a Qualcomm 888 Dev Board then it will also run the quantized models
on that device and report the performance and F1 score results.
on that device and report the performance and F1 score results. If the row in the azure table contains the property
`benchmark_only` equal to 1, then it will skip computing any F1 scores.
If you setup a quantization only runner in the cloud using the `docker/quantizer` image, you can pass
`--no_quantization` argument when you have a `--device` so that the local runs do not do quantization. This will stop
your linux machine from getting overloaded with quantization work so it can focus on the SNPE device workloads.
your linux machine from getting overloaded with quantization work so it can focus on managing the SNPE device workloads.

Просмотреть файл

@ -8,13 +8,13 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def reset(con_str):
def reset(con_str, experiment_name):
parser = argparse.ArgumentParser(
description='Reset the named entity.')
parser.add_argument('name', help='The friendly name to reset or "*" to reset all rows', default=None)
args = parser.parse_args()
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
entities = []
if args.name == "*":
@ -29,15 +29,15 @@ def reset(con_str):
for e in entities:
name = e['name']
print(f"Resetting {name}")
store.reset(e['name'], ['benchmark_only', 'model_date'])
store.delete_blobs(name, 'model.dlc')
store.delete_blobs(name, 'model.quant.dlc')
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
reset(con_str)
reset(con_str, experiment_name)

Просмотреть файл

@ -134,7 +134,7 @@ def record_error(entity, error_message):
store.merge_status_entity(entity)
def convert(name, entity, long_name, model_path):
def convert(experiment, name, entity, long_name, model_path):
global store
log("Converting model: " + long_name)
entity['model_name'] = long_name
@ -153,12 +153,12 @@ def convert(name, entity, long_name, model_path):
store.merge_status_entity(entity)
log("Uploading converted model: " + model)
store.upload_blob(name, model)
blob_name = f'{experiment}/{name}'
store.upload_blob(blob_name, model)
return model
def quantize(name, entity, onnx_model, model):
def quantize(experiment, name, entity, onnx_model, model):
global store
log("Quantizing model: " + name + "...")
log(" (Please be patient this can take a while, up to 10 minutes or more)")
@ -176,7 +176,8 @@ def quantize(name, entity, onnx_model, model):
# save the quantized .dlc since it takes so long to produce.
log("Uploading quantized model: " + model)
store.upload_blob(name, model)
blob_name = f'{experiment}/{name}'
store.upload_blob(blob_name, model)
return model
@ -302,7 +303,7 @@ def get_avg_latency(latencies):
return sum / count
def benchmark(entity, onnx_model, model, name, test_input):
def benchmark(experiment, entity, onnx_model, model, name, test_input):
global BENCHMARK_RUN_COUNT, CLEAR_RANDOM_INPUTS, store, usage
# next highest priority is to get benchmark times
@ -329,7 +330,7 @@ def benchmark(entity, onnx_model, model, name, test_input):
add_usage(usage, get_device(), start, end)
for file in glob.glob(os.path.join(output_dir, 'perf_results*.csv')):
store.upload_blob(name, file)
store.upload_blob(f'{experiment}/{name}', file)
total_inference_avg = get_total_inference_avg(entity)
total_inference_avg += [ifs]
@ -359,7 +360,7 @@ def ensure_complete(entity):
store.merge_status_entity(entity)
def run_model(name, dataset, use_device, benchmark_only, no_quantization):
def run_model(experiment, name, dataset, use_device, benchmark_only, no_quantization):
global store, usage
log("===================================================================================================")
log(f"Checking model: {name} on node {get_unique_node_id()}")
@ -385,7 +386,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
entity = store.get_status(name)
downloaded = store.download(name, model_dir, r'.*\.onnx$')
blob_name = f'{experiment}/{name}'
downloaded = store.download(blob_name, model_dir, r'.*\.onnx$')
if len(downloaded) == 0 or not os.path.isfile(downloaded[0]):
record_error(entity, 'missing model')
log(f"### no model found for {name}")
@ -395,8 +397,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
# see if we have converted the model or not.
# do this first no matter what.
converted = len(store.list_blobs(f'{name}/model.dlc')) > 0
is_quantized = len(store.list_blobs(f'{name}/model.quant.dlc')) > 0
converted = len(store.list_blobs(f'{blob_name}/model.dlc')) > 0
is_quantized = len(store.list_blobs(f'{blob_name}/model.quant.dlc')) > 0
if not is_quantized:
# oh, the quant model disappeared so clear the flag so it gets
# quantized again by a machine that can do that.
@ -411,11 +413,11 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
converted = False
if not converted:
model = convert(name, entity, long_name, onnx_model)
model = convert(experiment, name, entity, long_name, onnx_model)
if model == 'error':
return
elif converted:
downloaded = store.download(name, snpe_model_dir, 'model.dlc')
downloaded = store.download(blob_name, snpe_model_dir, 'model.dlc')
if len(downloaded) == 0:
raise Exception('### internal error, the model.dlc download failed!')
elif not is_quantized and not converted:
@ -426,7 +428,7 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
# see if we have a quantized model or not.
model = os.path.join(snpe_model_dir, 'model.dlc')
if not is_quantized:
model = quantize(name, entity, onnx_model, model)
model = quantize(experiment, name, entity, onnx_model, model)
if model == 'error':
return
entity['quantized'] = True
@ -439,7 +441,7 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
quantized_model = os.path.join(snpe_model_dir, 'model.quant.dlc')
if not os.path.isfile(quantized_model):
downloaded = store.download(name, snpe_model_dir, 'model.quant.dlc')
downloaded = store.download(blob_name, snpe_model_dir, 'model.quant.dlc')
if len(downloaded) == 0 or not os.path.isfile(downloaded[0]):
raise Exception("??? quantized model should exist at this point...")
quantized_model = downloaded[0]
@ -453,14 +455,14 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
csv_file = os.path.join(snpe_model_dir, 'model.quant.info.csv')
with open(csv_file, 'w') as f:
f.write(csv_data)
store.upload_blob(name, csv_file)
store.upload_blob(blob_name, csv_file)
return
input_shape = eval(entity['shape'])
if use_device:
check_dataset(input_shape, 'test', 1000)
test_input = os.path.realpath(os.path.join('data', 'test'))
if benchmark(entity, onnx_model, quantized_model, name, test_input):
if benchmark(experiment, entity, onnx_model, quantized_model, name, test_input):
return
if benchmark_only:
@ -533,8 +535,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
log(f"### Saving {prop} score of {f1score}")
entity[prop] = f1score
store.merge_status_entity(entity)
store.upload_blob(name, test_results, f"test_results_{prop}.csv")
store.upload_blob(name, chart, f"pr_curve_{prop}.png")
store.upload_blob(blob_name, test_results, f"test_results_{prop}.csv")
store.upload_blob(blob_name, chart, f"pr_curve_{prop}.png")
if 'f1_1k' in entity and 'f1_10k' in entity and 'f1_1k_f' in entity and 'f1_onnx' in entity:
ensure_complete(entity)
@ -629,6 +631,7 @@ def find_work_prioritized(use_device, benchmark_only, subset_list, no_quantizati
elif use_device and (total_benchmark_runs < MAX_BENCHMARK_RUNS):
priority = 30 + total_benchmark_runs
elif is_benchmark_only(entity, benchmark_only):
# this model is done!
continue
elif not is_complete(entity, 'f1_onnx'):
priority = 60
@ -678,7 +681,7 @@ class MemoryMonitor:
return growth
def monitor(dataset, use_device, benchmark_only, subset_list, no_quantization):
def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_quantization):
global rss_start, store, usage
logging.basicConfig(filename=LOG_FILE_NAME, filemode='a',
@ -721,7 +724,7 @@ def monitor(dataset, use_device, benchmark_only, subset_list, no_quantization):
gc.collect()
tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()
run_model(name, dataset, use_device, benchmark_only_flag, no_quantization)
run_model(experiment, name, dataset, use_device, benchmark_only_flag, no_quantization)
gc.collect()
snapshot2 = tracemalloc.take_snapshot()
for i in snapshot2.compare_to(snapshot1, 'lineno')[:10]:
@ -759,12 +762,13 @@ def get_storage_account(con_str):
def setup_store():
global store, usage
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
conn_string = os.getenv(CONNECTION_NAME)
if not conn_string:
log(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(conn_string)
store = ArchaiStore(storage_account_name, storage_account_key, table_name='status')
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
usage = ArchaiStore(storage_account_name, storage_account_key, table_name='usage')
return conn_string
@ -851,5 +855,7 @@ if __name__ == '__main__':
check_stale_pods(args.cleanup_stale_pods)
dataset = os.getenv("INPUT_DATASET")
rc = monitor(dataset, device is not None, args.benchmark, subset, args.no_quantization)
experiment = os.getenv("EXPERIMENT_NAME", "facesynthetics")
rc = monitor(experiment, dataset, device is not None, args.benchmark, subset, args.no_quantization)
sys.exit(rc)

Просмотреть файл

@ -9,7 +9,7 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def status(con_str):
def status(con_str, experiment_name):
parser = argparse.ArgumentParser(description='Print status in .csv format')
parser.add_argument('--status', help='Optional match for the status column (default None).')
parser.add_argument('--name', help='Optional name of single status row to return (default None).')
@ -18,7 +18,7 @@ def status(con_str):
parser.add_argument('--cols', help='Comma separated list of columns to report (default is to print all)')
args = parser.parse_args()
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
entities = store.get_all_status_entities(args.status, args.not_equal)
if args.locked:
entities = [e for e in entities if 'node' in e and e['node']]
@ -32,8 +32,9 @@ def status(con_str):
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
status(con_str)
status(con_str, experiment_name)

Просмотреть файл

@ -8,19 +8,20 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def unlock(con_str):
def unlock(con_str, experiment_name):
parser = argparse.ArgumentParser(
description='Unlock all jobs for given node or unlock all jobs.')
parser.add_argument('--node', help='Optional node name (default None).')
args = parser.parse_args()
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
store.unlock_all(args.node)
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
unlock(con_str)
unlock(con_str, experiment_name)

Просмотреть файл

@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'
def upload(con_str, args):
def upload(con_str, experiment_name, args):
parser = argparse.ArgumentParser(description='Upload a named model (and optional accompanying files) to your ' +
'azure blob store')
parser.add_argument('name', help='Friendly name of the folder to put this in.')
@ -19,13 +19,14 @@ def upload(con_str, args):
parser.add_argument('--reset', help='Reset stats for the model if it exists already.', action="store_true")
args = parser.parse_args(args)
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key)
store.upload(args.name, args.file, args.reset, priority=args.priority)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
store.upload(f'{experiment_name}/args.name', args.file, args.reset, priority=args.priority)
if __name__ == '__main__':
experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
con_str = os.getenv(CONNECTION_NAME)
if not con_str:
print(f"Please specify your {CONNECTION_NAME} environment variable.")
sys.exit(1)
upload(con_str)
upload(con_str, experiment_name)

Просмотреть файл

@ -62,13 +62,13 @@ RUN wget -O azcopy_v10.tar.gz https://aka.ms/downloadazcopy-v10-linux && tar -xf
# this echo is a trick to bypass docker build cache.
# simply change the echo string every time you want docker build to pull down new bits.
RUN echo '04/06/2023 06:28 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
RUN cd archai && git checkout task_segmentation && pip install -e .[dev]
RUN echo '04/18/2023 12:22 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
RUN cd archai && git checkout clovett/aml && pip install -e .[dev]
RUN echo "using this pip version: " && which pip
RUN echo "using this python version: " && which python
RUN pushd /home/archai/archai/tasks/face_segmentation/snpe && \
RUN pushd /home/archai/archai/tasks/face_segmentation/aml && \
python --version && \
pip install -r requirements.txt

Просмотреть файл

@ -33,7 +33,7 @@ talk to the right azure storage account. On linux this would be an export in yo
don't forget the double quotes.
```
export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=mymodels;AccountKey=...==;EndpointSuffix=core.windows.net"
export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=...==;EndpointSuffix=core.windows.net"
```
## Dockerfile
@ -44,11 +44,9 @@ you increase your Qualcomm device utilization.
The `setup.ps1` script shows what docker commands to run to build the image, how to login to your
azure docker container registry, how to take your image for that container registry and push it
to Azure. So you do not need to use the public docker.org container registry. You will decide
what version number to attach to your image here and the same version needs to be specified in the
following `quantizer.yaml`.
to Azure. So you do not need to use the public docker.org container registry.
You can also test your docker image locally by running:
You can test your docker image locally by running:
```
docker run -e MODEL_STORAGE_CONNECTION_STRING=$MODEL_STORAGE_CONNECTION_STRING -it <image_id>
```
@ -75,7 +73,7 @@ that shows a string like this:
```
az aks get-credentials --resource-group snpe-quantizaton-rg --name snpe-quantizer-aks
```
Run that locally and then you can push docker images to this registry.:
Run that locally and then you can push docker images to this registry:
```
docker push snpecontainerregistry001.azurecr.io/quantizer:1.27
@ -86,9 +84,7 @@ increment this version number each time it runs in case you need to push new ver
## quantizer.yaml
Then you can use `kubectl apply -f quantizer.yaml` to configure the AKS custer. Note that the version
of the image to use is specified in this file so you may need to edit the file and change the
version `1.13` to whatever you just tagged and pushed to the azure container registry.
Then you can use `kubectl apply -f quantizer.yaml` to deploy this new image version to your AKS custer.
Notice this yaml configures AKS to scale up to 100 nodes if necessary and the scaling is triggered
when a given node passes 40% CPU utilization. You can tweak these numbers however you like to fit
@ -132,4 +128,6 @@ face segmentation ONNX model to do a test run. You can train one of these model
## run.sh
This little script is used as the entry point to the Docker image, you will see this in the last
`RUN` command in the Dockerfile.
`RUN` command in the Dockerfile. The reason this `run.sh` contains a loop is because the Python
script checks for memory leaks and auto-terminates itself if it sees memory usage climb too high.
This way the quantizer pod can run pretty much forever, or at least until you deploy a new version.

Просмотреть файл

@ -17,7 +17,7 @@ pushd /home/archai/experiment
while true
do
python -u /home/archai/archai/tasks/face_segmentation/snpe/azure/runner.py
python -u /home/archai/archai/tasks/face_segmentation/aml/azure/runner.py
if [ $? != 0 ]; then
echo "Script returned an error code!"
fi

Просмотреть файл

@ -246,4 +246,4 @@ Write-Host " kubectl apply -f quantizer.yaml"
Write-Host ""
Write-Host "### To run the runner script locally please set the following environment variable: "
Write-HOst "set MODEL_STORAGE_CONNECTION_STRING=$conn_str"
Write-Host "set MODEL_STORAGE_CONNECTION_STRING=$conn_str"

Двоичные данные
tasks/face_segmentation/aml/images/portal.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 64 KiB

Просмотреть файл

@ -0,0 +1,98 @@
<?xml version="1.0" encoding="utf-8"?>
<DirectedGraph GraphDirection="TopToBottom" Layout="Sugiyama" Offset="-1829.8148940022802,-883.0494917160034" ZoomLevel="1" xmlns="http://schemas.microsoft.com/vs/2009/dgml">
<Nodes>
<Node Id="..." Bounds="-891.259155273438,-398.550804903068,50,25.96" UseManualLocation="True" />
<Node Id="...1" Bounds="-996.17041015625,-122.697003871574,50,25.96" Label="..." UseManualLocation="True" />
<Node Id="AzureBlobStore" Category="storage" Bounds="-1193.71833333333,-533.830712207031,106.03,73.6213311767578" Label="azure blob store" UseManualLocation="True" />
<Node Id="AzureStatusTable" Category="storage" Bounds="-1192.64078544617,-430.209281030274,112.973333333333,73.6213311767578" Label="azure status table" UseManualLocation="True" />
<Node Id="JupyterNotebook" Category="script" Bounds="-1374.80858579,-377.363886517334,115.163333333333,62.964467010498" Label="Jupyter Notebook" UseManualLocation="True" />
<Node Id="KubernetesCluster" Category="cluster" Bounds="-1195.17041666667,-264.359319506836,119.78,62.9644670104981" Label="Kubernetes Cluster" UseManualLocation="True" />
<Node Id="QualcommDevice" Category="device" Bounds="-767.554234593709,-540.257548474121,115.2,62.96" Label="Qualcomm device" UseManualLocation="True" />
<Node Id="QualcommDevice1" Category="device" Bounds="-767.259849828084,-447.297448474121,115.2,62.96" Label="Qualcomm device" UseManualLocation="True" />
<Node Id="Runner.py" Category="script" Bounds="-905.306400171916,-584.479938924064,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py1" Category="script" Bounds="-905.012036539714,-491.515371913566,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py2" Category="script" Bounds="-901.180981852214,-342.590704903068,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py3" Category="script" Bounds="-903.170422770182,-249.62613789257,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py4" Category="script" Bounds="-901.449353434245,-156.661570882072,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
<Node Id="SnpeQuantiation" Category="quantize" Bounds="-760.452083333333,-255.710307686227,109.126666666667,62.964467010498" Label="snpe quantiation" UseManualLocation="True" />
<Node Id="SnpeQuantization" Category="quantize" Bounds="-763.723191502889,-348.674874696725,114.553333333333,62.9644670104981" Label="snpe quantization" UseManualLocation="True" />
<Node Id="Test_onnx" Category="test" Bounds="-761.731038004557,-162.745740675729,70.5999999999999,47.96" Label="test_onnx" UseManualLocation="True" />
<Node Id="Upload" Category="script" Bounds="-1358,-486,56.9533333333333,62.964467010498" Label="upload" UseManualLocation="True" />
</Nodes>
<Links>
<Link Source="AzureBlobStore" Target="Runner.py" Bounds="-1087.68836914063,-543.888146126435,173.568538449327,35.9021208699053" />
<Link Source="AzureBlobStore" Target="Runner.py1" Bounds="-1087.68836914063,-489.782197817309,173.759053025964,23.7223370430618" />
<Link Source="AzureBlobStore" Target="Runner.py2" Bounds="-1088.21890281343,-461.505920916907,179.584023885674,121.517239678322" />
<Link Source="AzureStatusTable" Target="JupyterNotebook" Bounds="-1250.94000054214,-378.575417826448,58.2992201059451,15.298820648026" />
<Link Source="AzureStatusTable" Target="Runner.py" Bounds="-1079.66748046875,-523.59619140625,166.653381347656,105.640502929688" />
<Link Source="AzureStatusTable" Target="Runner.py1" Bounds="-1079.66748046875,-451.611999511719,165.948486328125,41.5036010742188" />
<Link Source="AzureStatusTable" Target="Runner.py2" Bounds="-1079.66748046875,-379.401794433594,169.876037597656,51.8286743164063" />
<Link Source="KubernetesCluster" Target="...1" Bounds="-1093.33052927498,-201.394849319458,97.6659896150068,73.2956685371913" />
<Link Source="KubernetesCluster" Target="Runner.py2" Bounds="-1075.39039550781,-298.365678712837,165.566640952845,48.0922930284574" />
<Link Source="KubernetesCluster" Target="Runner.py3" Bounds="-1075.39039550781,-229.576487801222,163.233609160429,8.99595989414743" />
<Link Source="KubernetesCluster" Target="Runner.py4" Bounds="-1075.39039550781,-208.904464362079,165.585551579934,66.2801678235037" />
<Link Source="Runner.py" Target="AzureStatusTable" Bounds="-1072.13012695313,-535.250061035156,166.82373046875,105.237548828125" />
<Link Source="Runner.py" Target="QualcommDevice" Bounds="-834.853066838583,-543.269532104768,58.6235405861986,16.1894387709768" />
<Link Source="Runner.py1" Target="AzureStatusTable" Bounds="-1070.91955566406,-447.686828613281,165.907531738281,41.6351928710938" />
<Link Source="Runner.py1" Target="QualcommDevice1" Bounds="-834.55870320638,-450.305950587842,58.6235177587191,16.1877924281616" />
<Link Source="Runner.py2" Target="AzureStatusTable" Bounds="-1071.05505371094,-370.519165039063,169.874084472656,51.8364562988281" />
<Link Source="Runner.py2" Target="SnpeQuantization" Bounds="-830.72764851888,-314.664864642865,58.0109886197575,2.21273140591086" />
<Link Source="Runner.py3" Target="SnpeQuantiation" Bounds="-832.717089436849,-221.841899441892,63.2713423409264,2.37545015813271" />
<Link Source="Runner.py4" Target="Test_onnx" Bounds="-830.996020100911,-134.464308505795,60.3071978911451,5.86127855164511" />
<Link Source="Upload" Target="AzureBlobStore" Bounds="-1301.04666666667,-483.110272187314,98.5479867833703,22.1825747724036" />
<Link Source="Upload" Target="AzureStatusTable" Bounds="-1301.04666666667,-445.517004029011,99.824346137553,31.5519803774536" />
</Links>
<Categories>
<Category Id="cluster" />
<Category Id="device" />
<Category Id="quantize" />
<Category Id="script" />
<Category Id="storage" />
<Category Id="test" />
</Categories>
<Properties>
<Property Id="Bounds" DataType="System.Windows.Rect" />
<Property Id="Expression" DataType="System.String" />
<Property Id="GraphDirection" DataType="Microsoft.VisualStudio.Diagrams.Layout.LayoutOrientation" />
<Property Id="GroupLabel" DataType="System.String" />
<Property Id="IsEnabled" DataType="System.Boolean" />
<Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
<Property Id="Layout" DataType="System.String" />
<Property Id="Offset" DataType="System.String" />
<Property Id="TargetType" DataType="System.Type" />
<Property Id="UseManualLocation" DataType="System.Boolean" />
<Property Id="Value" DataType="System.String" />
<Property Id="ValueLabel" DataType="System.String" />
<Property Id="ZoomLevel" DataType="System.String" />
</Properties>
<Styles>
<Style TargetType="Node" GroupLabel="test" ValueLabel="True">
<Condition Expression="HasCategory('test')" />
<Setter Property="Icon" Value="CodeMap_TestProject" />
</Style>
<Style TargetType="Node" GroupLabel="storage" ValueLabel="True">
<Condition Expression="HasCategory('storage')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Table.png" />
</Style>
<Style TargetType="Node" GroupLabel="script" ValueLabel="True">
<Condition Expression="HasCategory('script')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Script.png" />
</Style>
<Style TargetType="Node" GroupLabel="quantize" ValueLabel="True">
<Condition Expression="HasCategory('quantize')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Gears.png" />
</Style>
<Style TargetType="Node" GroupLabel="device" ValueLabel="True">
<Condition Expression="HasCategory('device')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Device.png" />
</Style>
<Style TargetType="Node" GroupLabel="cluster" ValueLabel="True">
<Condition Expression="HasCategory('cluster')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Network.png" />
</Style>
<Style TargetType="Node">
<Setter Property="HorizontalAlignment" Value="Center" />
<Setter Property="IconPlacement" Value="Top" />
</Style>
</Styles>
</DirectedGraph>

Двоичные данные
tasks/face_segmentation/aml/images/snpe.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 57 KiB

Двоичные данные
tasks/face_segmentation/aml/images/store.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 58 KiB

Просмотреть файл

@ -0,0 +1,65 @@
<?xml version="1.0" encoding="utf-8"?>
<DirectedGraph GraphDirection="TopToBottom" Layout="Sugiyama" Offset="-820.8108333333334,-384.5347442626953" ZoomLevel="1" xmlns="http://schemas.microsoft.com/vs/2009/dgml">
<Nodes>
<Node Id="AmlTraining" Category="aml" Bounds="178.281669209798,-42.1863471113841,158.853333333333,78.924467010498" Label="aml training &#xD;&#xA;(AmlPartialTrainingEvaluator)" UseManualLocation="True" />
<Node Id="AzureMLPipelines" Category="aml" Bounds="455.9075,-19.9970294464111,120.336666666667,62.9644670104981" Label="Azure ML Pipelines" UseManualLocation="True" />
<Node Id="BlobStore(models)" Category="storage" Bounds="372.473597513835,72.9675375640869,116.526666666667,89.5813311767578" Label="blob store &#xD;&#xA;(models &amp; results)" UseManualLocation="True" />
<Node Id="RemoteDeviceInferenceTesting" Category="remote" Bounds="72.2816666666666,170.359651075872,206.316666666667,78.9244670104981" Label="remote device inference testing &#xD;&#xA;(RemoteAzureBenchmarkEvaluator)" UseManualLocation="True" />
<Node Id="Search" Category="search" Bounds="0,0,53.9833333333333,62.964467010498" Label="search" UseManualLocation="True" />
<Node Id="TableResults" Category="storage" Bounds="210.715582682292,66.738219899114,84.0966666666666,73.6213311767578" Label="table results" UseManualLocation="True" />
</Nodes>
<Links>
<Link Source="AmlTraining" Target="AzureMLPipelines" Bounds="337.135002543131,1.64407801566514,109.786077444833,6.03785408365997" />
<Link Source="AmlTraining" Target="BlobStore(models)" Bounds="314.38133986177,36.7381198991139,50.7064047608595,35.3076040473083" />
<Link Source="AmlTraining" Target="TableResults" Bounds="254.894834328378,36.7381198991139,0.977495669259497,21.0098250749297" />
<Link Source="AzureMLPipelines" Target="BlobStore(models)" Bounds="448.526763916016,42.9674377441406,44.100830078125,25.0390167236328" />
<Link Source="BlobStore(models)" Target="AzureMLPipelines" Bounds="451.671051025391,47.7068099975586,44.4396362304688,25.2607269287109" />
<Link Source="RemoteDeviceInferenceTesting" Target="TableResults" Bounds="204.152602969458,147.637053103474,16.5328942653301,22.7225979723975" />
<Link Source="Search" Target="AmlTraining" Bounds="53.9833333333333,10.3717085435882,115.395650767872,17.1087060975231" />
<Link Source="Search" Target="RemoteDeviceInferenceTesting" Bounds="52.7301876549027,62.403421700363,84.1039893644507,101.039033450549" />
<Link Source="TableResults" Target="Search" Bounds="62.5571377644598,42.8347555073116,148.158444917832,47.2922740456947" />
</Links>
<Categories>
<Category Id="aml" />
<Category Id="remote" />
<Category Id="search" />
<Category Id="storage" />
</Categories>
<Properties>
<Property Id="Bounds" DataType="System.Windows.Rect" />
<Property Id="Expression" DataType="System.String" />
<Property Id="GraphDirection" DataType="Microsoft.VisualStudio.Diagrams.Layout.LayoutOrientation" />
<Property Id="GroupLabel" DataType="System.String" />
<Property Id="IsEnabled" DataType="System.Boolean" />
<Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
<Property Id="Layout" DataType="System.String" />
<Property Id="Offset" DataType="System.String" />
<Property Id="TargetType" DataType="System.Type" />
<Property Id="UseManualLocation" DataType="System.Boolean" />
<Property Id="Value" DataType="System.String" />
<Property Id="ValueLabel" DataType="System.String" />
<Property Id="ZoomLevel" DataType="System.String" />
</Properties>
<Styles>
<Style TargetType="Node" GroupLabel="storage" ValueLabel="True">
<Condition Expression="HasCategory('storage')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Table.png" />
</Style>
<Style TargetType="Node" GroupLabel="remote" ValueLabel="True">
<Condition Expression="HasCategory('remote')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/CellPhone.png" />
</Style>
<Style TargetType="Node" GroupLabel="search" ValueLabel="True">
<Condition Expression="HasCategory('search')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Gears.png" />
</Style>
<Style TargetType="Node" GroupLabel="aml" ValueLabel="True">
<Condition Expression="HasCategory('aml')" />
<Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Network.png" />
</Style>
<Style TargetType="Node">
<Setter Property="HorizontalAlignment" Value="Center" />
<Setter Property="IconPlacement" Value="Top" />
</Style>
</Styles>
</DirectedGraph>

Двоичные данные
tasks/face_segmentation/aml/images/system.png Normal file

Двоичный файл не отображается.

После

Ширина:  |  Высота:  |  Размер: 32 KiB

Просмотреть файл

@ -112,12 +112,13 @@
}
],
"source": [
"experiment_name = 'facesynthetics'\n",
"con_str = os.getenv(\"MODEL_STORAGE_CONNECTION_STRING\")\n",
"if con_str is None:\n",
" print(\"Please set your MODEL_STORAGE_CONNECTION_STRING environment variable\")\n",
"else:\n",
" storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)\n",
" store = ArchaiStore(storage_account_name, storage_account_key)\n",
" store = ArchaiStore(storage_account_name, storage_account_key, 'models', experiment_name)\n",
" print(f'using {storage_account_name}')"
]
},

Просмотреть файл

@ -0,0 +1,95 @@
# Readme
This folder contains code that automates the search, partial training and inference latency testing in [Azure
ML](https://azure.microsoft.com/en-us/products/machine-learning/). The inference testing of ONNX models can be performed
across one or more machines that are connected via USB to Qualcomm 888 boards.
The code is organized into:
1. [Training](training/readme.md) code that plugs into the Archai Search to perform partial training
of selected models on a GPU cluster in Azure ML.
1. [SNPE Device](snpe/readme.md) code that uses [Microsoft
Olive](https://github.com/microsoft/olive) to drive the the
[Qualcomm Neural Processing SDK](https://developer.qualcomm.com/software/qualcomm-neural-processing-sdk) to talk
to the device, convert ONNX models to .dlc, quantize them, and test them on one or more
[Qualcomm 888 dev kits](https://developer.qualcomm.com/hardware/snapdragon-888-hdk).
1. [Azure Code](azure/readme.md) that talks to a configured Azure storage account for uploading
models to test, downloading them, uploading test results, and keeping an Azure status table that
summarizes results of all the work in progress.
1. [Docker](docker/quantizer/readme.md) contains scripts for setting up your Azure account and optionally
creating a docker image for running in an Azure Kubernetes cluster to do model quantization using
the Qualcomm Neural Processing SDK. Quantization is time consuming so having an elastic scale speeds
things up a lot.
1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the
results from your Azure "status" table.
## Workflow
The overall workflow begins with the top level [aml.py](../../aml.py) script which
starts with an Archai Search that contains an `AmlPartialTrainingEvaluator` and a
`RemoteAzureBenchmarkEvaluator`. The remote benchmark evaluator performs inference latency testing
on Qualcomm hardware. The `AmlPartialTrainingEvaluator` then kicks off one new Azure ML
training pipeline for each batch of new model architectures that need to be partially trained, it
stores the validation IOU results in an Azure blob store and an Azure table so the search can get
those results and use them to figure out the next iteration of the search algorithm:
![system](images/system.png)
See [AML Training Readme](training/readme.md) for more information.
## Remote Inference Testing
The remote inference testing workflow looks like this, the `RemoteAzureBenchmarkEvaluator` uploads models to the same
Azure blob store, and adds a row to the status table. This triggers remote instances of the [runner.py](azure/runner.py) script
to process these new models on an attached Qualcomm device. Optionally some of the work can be done in the cloud
using a Kubernetes cluster, this includes model quantization and accuracy testing using the ONNX runtime.
The workflow looks like this:
![snpe](images/snpe.png)
Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
the `find_work_prioritized` function in the runner. This script is completely restartable, and can distribute the work
across multiple instances of the runner script. Each instance will pick up where a previous one left off based on what
it finds in your Azure status table. The prioritization maps to the columns of the status table as follows:
1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30
1. **f1_onnx** compute f1 from onnxruntime on .onnx model on a 10k test set on Linux - priority 60
1. **f1_1k** compute f1 on quantized .dlc model on Qualcomm device DSP with a 1k test set - priority
is the mean f1 score so that quicker models are prioritized.
1. **f1_1k_f** compute f1 on floating point .dlc model on on Qualcomm device CPU with a 1k test set
- priority 10 * the mean f1 score so that quicker models are prioritized.
1. **f1_10k** compute f1 on quantized model on a 10k test set - priority = 100 * the mean f1 score
so that quicker models are prioritized.
Lower number means higher priority job and each machine will run the highest priority work first.
You can override the priority of a specific job by passing a `--proprity` parameter on the `upload.py` script or by
editing the Azure status table and adding a `priority` field to the JSON stored there. You can set any priority number
you want, if you specify priority 0 it will run before anything else which can be handy if you have a cool new model
that you want to bump to the top of the list.
Notice some of the above jobs can run on Linux and do not require Qualcomm device. So in order to maximize throughput on
machines that do have a Qualcomm devices you can allocate other Linux machines with no Qualcomm devices to do the other
work, namely, converting models, quantizing them, and running the `f1_onnx` test.
Folks across your team can use the `azure/upload.py` to submit jobs and let them run, or they can automate that as
shown in the `RemoteAzureBenchmarkEvaluator` in the `search.py` script.
You can use `status.py` to monitor progress or look at the Azure status table. Various status messages are posted
there so you can see which machine is doing what and is in what stage of the job.
Next you can go to the `notebook` page and get some pretty pictures of your Pareto Curves.
## Azure Portal
When everything is running you will see progress happening in your Azure status table. Here you see the snpe-quantizer
kubernetes cluster is quantizing a bunch of models while other machines are running the bench mark tests on the Qualcomm
hardware:
![portal](images/portal.png)

Просмотреть файл

@ -1,6 +1,5 @@
importlib-metadata!=4.7.0,<6,>=3.7.0
packaging<22.0,>=20.0
cryptography<39,>=38.0.0
psutil
tqdm
pandas

Просмотреть файл

@ -37,7 +37,7 @@ bits. If you plan to use Qualcomm hardware devices then set the `SNPE_ANDROID_R
1. **Install required packages including Olive **
```
pushd tasks/face_segmentation/snpe
pushd tasks/face_segmentation/aml
pip install -r requirements.txt
```

Просмотреть файл

@ -0,0 +1,149 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import json
from pathlib import Path
from typing import List, Optional, Union
from overrides import overrides
from archai.discrete_search.api.archai_model import ArchaiModel
from archai.discrete_search.api.model_evaluator import AsyncModelEvaluator
from archai.common.config import Config
from shutil import copyfile
from archai.common.monitor import JobCompletionMonitor
from aml.training.training_pipeline import start_training_pipeline
from azure.identity import DefaultAzureCredential
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.ai.ml import MLClient
from aml.util.setup import configure_store, get_valid_arch_id
def _get_entity_value(entity, key, default_value=''):
if key in entity:
return entity[key]
return default_value
class AmlPartialTrainingEvaluator(AsyncModelEvaluator):
""" The AmlPartialTrainingEvaluator launches partial training jobs"""
def __init__(self,
config : Config,
local_output: Path,
tr_epochs: int = 1,
timeout_seconds=3600):
self.config = config
self.tr_epochs = int(tr_epochs)
self.iteration = 1
aml_config = config['aml']
workspace_name = aml_config['workspace_name']
subscription_id = aml_config['subscription_id']
resource_group_name = aml_config['resource_group']
identity = DefaultAzureCredential()
if os.getenv('AZUREML_ROOT_RUN_ID'):
identity = AzureMLOnBehalfOfCredential()
self.ml_client = MLClient(
credential=identity,
subscription_id=subscription_id,
resource_group_name=resource_group_name,
workspace_name=workspace_name
)
self.local_output = local_output
self.models = []
self.timeout = timeout_seconds
self.store = configure_store(aml_config)
self.results = []
self.metric_key = self.config['training'].get('metric_key', 'val_iou')
@overrides
def send(self, arch: ArchaiModel, budget: Optional[float] = None) -> None:
self.models += [arch]
model_id = get_valid_arch_id(arch)
e = self.store.get_status(model_id)
if self.metric_key in e and e[self.metric_key]:
# seems to have already been trained then, so to make this a restartable job we pick up those results.
if 'iteration' not in e:
e['iteration'] = self.iteration
self.store.merge_status_entity(e)
metric = float(e[self.metric_key])
self.results += [{
'id': model_id,
self.metric_key: metric,
'status': _get_entity_value(e, 'status'),
'error': _get_entity_value(e, 'error')
}]
@overrides
def fetch_all(self) -> List[Union[float, None]]:
if len(self.results) > 0:
print(f'AmlPartialTrainingEvaluator: found {len(self.results)} were already trained.')
index = {}
for existing in self.results:
id = existing['id']
index[id] = existing
# pull out the models that have not yet been trained.
pending = []
for arch in self.models:
model_id = get_valid_arch_id(arch)
if model_id not in index:
pending += [arch]
if len(pending) > 0:
print(f"AmlPartialTrainingEvaluator: Starting training on {len(pending)} models")
# train all the models listed in the pending on a GPU cluster so we get much training
# happening in parallel which greatly reduces the overall Archai Search process.
description = f"AmlPartialTrainingEvaluator training {self.tr_epochs} epochs"
pipeline_job, model_names = start_training_pipeline(
description, self.iteration, self.ml_client, self.store, pending, self.config, self.tr_epochs, self.local_output)
job_id = pipeline_job.name
print(f'AmlPartialTrainingEvaluator: Started training pipeline: {job_id}')
# wait for all the parallel training jobs to finish
keys = [self.metric_key]
monitor = JobCompletionMonitor(self.store, self.ml_client, keys, job_id, self.timeout)
models = monitor.wait(model_names)['models']
for m in models:
id = m['id']
index[id] = m
# now reassemble all results in the right order (order of the send method calls)
models = []
for arch in self.models:
model_id = get_valid_arch_id(arch)
result = index[model_id]
models += [result]
results = {'models': models}
# save the results to the output folder (which is mapped by the AML pipeline to our
# blob store under the container 'models' in the folder named the same as the
# experiment_name)
results_path = f'{self.local_output}/models.json'
summary = json.dumps(results, indent=2)
with open(results_path, 'w') as f:
f.write(summary)
# save the archai log also which can be handy for debugging later.
log = 'archai.log'
if os.path.isfile(log):
copyfile(log, f'{self.local_output}/{log}')
# extract the array of results for our return value this is the metric that the
# Archai search needs to figure out which models to continue to evolve and which are
# not so good.
metrics = []
for m in results['models']:
metric = m[self.metric_key]
metrics += [metric]
self.models = [] # reset for next run.
print(f'AmlPartialTrainingEvaluator: fetch_all returning : {summary}')
self.iteration += 1
return metrics

Просмотреть файл

@ -0,0 +1,53 @@
## AML Training Readme
Two scripts in this folder provide Azure ML training pipelines to the Archai Search.
The [train.py](../../train.py) script plugs in the `AmlPartialTrainingEvaluator`
async model evaluator when invoked with the [aml_search.yaml](../../confs/aml_search.yaml) configuration.
The top level [aml.py](../../aml.py) script kicks off this process setting up all the
required Azure ML resources including:
- a conda environment used to build the docker image that Azure ML uses
- a cpu cluster for running the search
- a gpu cluster for running partial training
- the datastore for the training dataset
- the datastore for downloading config info and uploading results, including the trained models
Notice that the [aml_search.yaml](../../confs/aml_search.yaml) configuration
file requires the following environment variables be defined so that it can find your Azure subscription,
and Azure ML resource group and workspace, it also needs the connection string for your storage account.
```yaml
aml:
connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
subscription_id: ${AZURE_SUBSCRIPTION_ID}
resource_group: ${AML_RESOURCE_GROUP}
workspace_name: ${AML_WORKSPACE_NAME}
```
The storage account can be created using the [setup.ps1](../docker/quantizer/setup.ps1) powershell script
which uses the [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli).
[aml_training_evaluator](aml_training_evaluator.py) provides a class named
`AmlPartialTrainingEvaluator` which is an `AsyncModelEvaluator` that creates a new Azure ML Pipeline
for partial training each new batch of models provided by the Archai Search algorithm. It runs these partial
training jobs on a GPU cluster to maximize throughput.
[training_pipeline](training_pipeline.py) provides a helper function named `start_training_pipeline`
that uses the Azure ML Python SDK to create the Azure ML training pipeline.
As the search progresses and training is completed you will find the following files in your
Azure storage account:
![store](../images/store.png)
The name of this folder is the model id, `id_113c4c240bfc5fd2eaf2f0129439251bb754ddc4` which can also
be found in your Azure storage table. The parent folder `facesynthetics` is the `experiment name` and is also
the name of your Azure storage table. This table contains the overall summary for all the models processed so far.
You can see here that Qualcomm Snapdragon processing was also done on this model, so you see the [.dlc
files](https://developer.qualcomm.com/sites/default/files/docs/snpe/overview.html) there for the model and for the
quantized version of the model and a `.csv` file containing information about inference times for this model (including
layer by layer information which is handy).
The json file contains the `ArchConfig` for the model which can be used to recreate the model
and fully train it using the [train.py](../../train.py) script.

Просмотреть файл

@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import json
import os
from pathlib import Path
from typing import List
from archai.common.store import ArchaiStore
from azure.ai.ml import MLClient
from archai.discrete_search.api import ArchaiModel
from archai.discrete_search.search_spaces.config import ArchConfig
from azure.ai.ml import command, Input, Output, dsl
from azure.ai.ml.entities import UserIdentityConfiguration
from archai.common.config import Config
from aml.util.setup import copy_code_folder, get_valid_arch_id
from shutil import copyfile
def training_component(output_path: str, code_dir: Path, config, training_epochs: int, config_filename: str, model_id: str, arch: str):
# we need a folder containing all the specific code we need here, which is not everything in this repo.
training = config['training']
learning_rate = training['learning_rate']
batch_size = training['batch_size']
aml_config = config['aml']
environment_name = aml_config['environment_name']
con_str = aml_config['connection_str']
fixed_args = f'--lr {learning_rate} --batch_size {batch_size} ' +\
f'--epochs {int(training_epochs)} --model_id {model_id} --config {config_filename} ' +\
f'{arch}'
return command(
name="train",
display_name="Archai training job",
description="Trains a face segmentation model.",
inputs={
"data": Input(type="uri_folder", mode="download")
},
is_deterministic=False,
outputs={
"results": Output(type="uri_folder", path=output_path, mode="rw_mount")
},
environment_variables={'MODEL_STORAGE_CONNECTION_STRING': con_str},
identity=UserIdentityConfiguration(),
# The source folder of the component
code=str(code_dir),
command="""python3 train.py \
--dataset_dir ${{inputs.data}} \
--output_dir ${{outputs.results}} \
""" + fixed_args,
environment=environment_name,
)
def start_training_pipeline(description: str, iteration: int, ml_client: MLClient, store: ArchaiStore,
model_architectures: List[ArchaiModel],
config: Config, training_epochs: int, output_folder: Path):
""" Creates a new Azure ML Pipeline for training a set of models, updating the status of
these jobs in a given Azure Storage Table. This command does not wait for those jobs to
finish. For that use the monitor.py script which monitors the same Azure Storage Table
to find out when the jobs have all finished. The train.py script will update the table
when each training job completes. """
aml_config = config['aml']
training_cluster = aml_config['training_cluster']
compute_cluster_name = training_cluster['name']
datastore_path = aml_config['datastore_path']
root_uri = aml_config['results_path']
environment_name = aml_config['environment_name']
experiment_name = aml_config['experiment_name']
metric_key = config['training'].get('metric_key', 'val_iou')
print(f"Cluster: {compute_cluster_name}")
print(f"Dataset: {datastore_path}")
print(f"Output: {root_uri}")
print(f"Environment: {environment_name}")
print(f"Experiment: {experiment_name}")
print(f"Epochs: {training_epochs}")
print(f"Iteration: {iteration}")
code_dir = Path('temp_code')
os.makedirs(code_dir, exist_ok=True)
config_dir = code_dir / 'confs'
os.makedirs(config_dir, exist_ok=True)
archs_dir = code_dir / 'archs'
os.makedirs(archs_dir, exist_ok=True)
copyfile('train.py', str(code_dir / 'train.py'))
copy_code_folder('training', str(code_dir / 'training'))
copy_code_folder('search_space', str(code_dir / 'search_space'))
copy_code_folder(os.path.join('aml', 'training'), str(code_dir / 'aml' / 'training'))
copy_code_folder(os.path.join('aml', 'util'), str(code_dir / 'aml' / 'util'))
config.save(str(config_dir / 'aml_search.yaml'))
models = []
model_names = []
for arch in model_architectures:
model_id = get_valid_arch_id(arch)
model_names += [model_id]
print(f'Launching training job for model {model_id}')
# upload the model architecture to our blob store so we can find it later.
metadata: ArchConfig = arch.metadata['config']
filename = str(archs_dir / f'{model_id}.json')
metadata.to_file(filename)
store.upload_blob(f'{experiment_name}/{model_id}', filename, blob_name=f'{model_id}.json')
# create status entry in azure table
e = store.get_status(model_id)
e['experiment'] = experiment_name
e['epochs'] = training_epochs
e['iteration'] = iteration
e['status'] = 'preparing'
store.merge_status_entity(e)
models += [{
'id': model_id,
'status': 'training',
'epochs': training_epochs,
metric_key: e[metric_key] if metric_key in e else 0.0
}]
results = {
'models': models
}
@dsl.pipeline(
compute=compute_cluster_name,
description=description,
)
def parallel_training_pipeline(
data_input
):
outputs = {}
for arch in model_architectures:
model_id = get_valid_arch_id(arch)
output_path = f'{root_uri}/{model_id}'
filename = f'archs/{model_id}.json'
train_job = training_component(
output_path, code_dir, config, training_epochs, 'confs/aml_search.yaml', model_id, filename)(
data=data_input
)
outputs[model_id] = train_job.outputs.results
return outputs
training_pipeline = parallel_training_pipeline(
data_input=Input(type="uri_folder", path=datastore_path)
)
# submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
training_pipeline,
experiment_name=experiment_name,
)
# Write the new list of pending models so that the make_monitor_command
# knows what to wait for.
print("Writing pending.json: ")
print(json.dumps(results, indent=2))
results_path = output_folder / 'pending.json'
with open(results_path, 'w') as f:
f.write(json.dumps(results, indent=2))
return (pipeline_job, model_names)

Просмотреть файл

@ -0,0 +1,82 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import sys
from glob import glob
from shutil import copyfile
from archai.common.config import Config
from archai.common.store import ArchaiStore
from azure.ai.ml.entities._credentials import AccountKeyConfiguration
from azure.ai.ml.entities import AzureBlobDatastore
import archai.common.azureml_helper as aml_helper
from archai.discrete_search.api.archai_model import ArchaiModel
def configure_store(aml_config: Config, blob_container_name: str = None) -> ArchaiStore:
con_str = aml_config.get('connection_str')
if not con_str:
print("Please set environment variable 'MODEL_STORAGE_CONNECTION_STRING' containing the Azure storage account connection " +
"string for the Azure storage account you want to use to control this experiment.")
sys.exit(1)
if blob_container_name is None:
blob_container_name = aml_config.get('blob_container_name', 'models')
experiment_name = aml_config.get('experiment_name', 'facesynthetics')
partition_key = aml_config.get('partition_key', 'main')
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
return ArchaiStore(storage_account_name, storage_account_key, blob_container_name, experiment_name, partition_key)
def register_datastore(ml_client, data_store_name, blob_container_name, storage_account_name, storage_account_key, experiment_name):
try:
credentials = AccountKeyConfiguration(account_key=storage_account_key)
model_store = ml_client.datastores.get(data_store_name)
if model_store.container_name != blob_container_name:
raise Exception(f'The container name does not match. Only the credentials on {data_store_name} can be updated')
if model_store.account_name != storage_account_name:
raise Exception(f'The storage account name does not match. Only the credentials on {data_store_name} can be updated')
model_store.credentials = credentials
except:
model_store = AzureBlobDatastore(
name=data_store_name,
description="Datastore pointing to a blob container.",
account_name=storage_account_name,
container_name=blob_container_name,
credentials=credentials,
)
ml_client.create_or_update(model_store)
return f'azureml://datastores/{data_store_name}/paths/{experiment_name}'
def create_cluster(ml_client, config, key):
section = config[key]
compute_name = section['name']
size = section['size']
location = section['location']
max_instances = section.get('max_instances', 1)
aml_helper.create_compute_cluster(ml_client, compute_name, size=size, location=location, max_instances=max_instances)
return compute_name
def copy_code_folder(src_dir, target_dir):
""" Copies the code folder into a separate folder. This is needed otherwise the pipeline will fail with
UserError: The code snapshot was modified in blob storage, which could indicate tampering.
If this was unintended, you can create a new snapshot for the run. To do so, edit any
content in the local source directory and resubmit the run.
"""
os.makedirs(target_dir, exist_ok=True)
for path in glob(os.path.join(src_dir, '*.py')):
file = os.path.basename(path)
print(f"copying source file : {file} to {target_dir}")
copyfile(path, os.path.join(target_dir, file))
for name in os.listdir(src_dir):
path = os.path.join(src_dir, name)
if os.path.isdir(path):
copy_code_folder(path, os.path.join(target_dir, name))
def get_valid_arch_id(arch: ArchaiModel):
# bug in azure ml sdk requires blob store folder names not begin with digits, so we prefix with 'id_'
return f'id_{arch.archid}'

Просмотреть файл

@ -1,25 +0,0 @@
import os
from archai.discrete_search.search_spaces.config import ArchConfig
from archai.discrete_search.evaluators import TorchNumParameters
from archai.discrete_search.api.archai_model import ArchaiModel
from search_space.hgnet import StackedHourglass, HgnetSegmentationSearchSpace
from archai.common.config import Config
constraint = (1e5, 5e7)
evaluator = TorchNumParameters()
search_config = Config('confs/cpu_search.yaml')['search']
ss_config = search_config['search_space']
search_space = HgnetSegmentationSearchSpace(seed=1680312796, **ss_config.get('params', {}))
targets = os.path.join('archs', 'snp_target')
for file in os.listdir(targets):
path = os.path.join(targets, file)
if os.path.isfile(path) and path.endswith(".json"):
config = ArchConfig.from_file(path)
model = StackedHourglass(config, **search_space.model_kwargs)
archid = os.path.splitext(file)[0]
m = ArchaiModel(model, archid, config)
num_params = evaluator.evaluate(m, None)
if num_params < constraint[0] or num_params > constraint[1]:
print(f"Model {file} has {num_params} parameters and is outside the valid range.")

Просмотреть файл

@ -0,0 +1,102 @@
search:
search_space:
name: hgnet
params:
num_classes: 18
img_size: [256, 256] # (w, h)
in_channels: 3
op_subset: ['conv3x3', 'conv5x5', 'conv7x7']
stem_strides: [2]
# Number of downsampling blocks (without counting stem conv)
num_blocks: 5
# Maximum number of layers in downsampling blocks
downsample_block_max_ops: 4
# Maximum number of layers in skip blocks
skip_block_max_ops: 2
# Maximum number of layers in upsampling blocks
upsample_block_max_ops: 4
# Maximum number of layers after the final upsampling layer
post_upsample_max_ops: 2
algorithm:
name: evolution_pareto
params:
num_iters: 20
init_num_models: 20
mutations_per_parent: 5
num_crossovers: 6
max_unseen_population: 20
num_random_mix: 6
max_parameters: 5e7
# we are training elsewhere, so tell the search not to try and save them locally!
save_pareto_model_weights: false
target:
name: snp
metric_key: mean
max_retries: 15
retry_interval: 60
verbose: true
training:
# https://learn.microsoft.com/en-us/answers/questions/1215210/limited-gpu-ram
batch_size: 16
learning_rate: 2e-4
partial_training_epochs: 1
metric_key: val_iou
aml:
connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
blob_container_name: models
experiment_name: facesynthetics
partition_key: main
subscription_id: ${AZURE_SUBSCRIPTION_ID}
resource_group: ${AML_RESOURCE_GROUP}
workspace_name: ${AML_WORKSPACE_NAME}
timeout: 18000
search_cluster:
name: nas-cpu-cluster-D14-v2
size: Standard_D14_v2
location: westus2
training_cluster:
name: nas-gpu-cluster-NC6
size: Standard_NC6
location: westus2
max_instances: 20
environment:
name: facesynthetics-nas-env
channels:
- conda-forge
- pytorch
- nvidia
dependencies:
- python=3.10
- pip
- pip:
- azure-ai-ml==1.5.0
- azure-storage-blob
- azure-data-tables
- azure-identity
- azureml-mlflow
- datasets>=2.4.0
- matplotlib
- mldesigner
- mlflow
- onnx>=1.10.2
- onnxruntime>=1.10.0
- psutil
- torch
- torchvision
- torchaudio
- lightning>=2.0.0
- archai[dev] @ git+https://github.com/microsoft/archai.git

Просмотреть файл

@ -34,14 +34,17 @@ search:
num_crossovers: 6
max_unseen_population: 20
num_random_mix: 6
max_parameters: 5e7
target:
name: snp
connection_str_env_var: MODEL_STORAGE_CONNECTION_STRING
blob_container_name: models
table_name: status
partition_key: main
max_retries: 15
retry_interval: 120
metric_key: mean
max_retries: 15
retry_interval: 60
verbose: true
aml:
connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
blob_container_name: models
partition_key: main
experiment_name: facesynthetics

Просмотреть файл

@ -0,0 +1,31 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import argparse
import os
from archai.datasets.cv.face_synthetics import FaceSyntheticsDatasetProvider
def main():
""" This script downloads the Face Synthetics dataset to the specified folder. """
# input and output arguments
print("Starting prep_data_store...")
parser = argparse.ArgumentParser()
parser.add_argument("--path", type=str, help="root folder to place the downloaded dataset.")
args = parser.parse_args()
path = args.path
print(f'Writing Face Synthetics dataset to: {path}')
if not path or not os.path.exists(path):
raise ValueError(f'Missing path: {path}')
provider = FaceSyntheticsDatasetProvider(dataset_dir=path)
# now force the full download to happen to that root folder.
provider.get_train_dataset()
provider.get_val_dataset()
provider.get_test_dataset()
if __name__ == "__main__":
main()

Просмотреть файл

@ -21,6 +21,8 @@ from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzure
from search_space.hgnet import HgnetSegmentationSearchSpace
from training.partial_training_evaluator import PartialTrainingValIOU
from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
from aml.util.setup import configure_store
AVAILABLE_ALGOS = {
'mo_bananas': MoBananasSearch,
@ -47,21 +49,24 @@ def filter_extra_args(extra_args: List[str], prefix: str) -> List[str]:
def main():
parser = ArgumentParser()
parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.')
parser.add_argument('--output_dir', type=Path, help='Output directory.', default='output')
parser.add_argument('--search_config', type=Path, help='Search config file.', default=confs_path / 'cpu_search.yaml')
parser.add_argument('--serial_training', help='Search config file.', action='store_true')
parser.add_argument('--gpus_per_job', type=float, help='Number of GPUs used per job (if `serial_training` flag is disabled)',
default=0.5)
parser.add_argument('--partial_tr_epochs', type=float, help='Number of epochs to run partial training', default=1.0)
parser.add_argument('--partial_tr_epochs', type=int, help='Number of epochs to run partial training', default=1)
parser.add_argument('--seed', type=int, help='Random seed', default=42)
parser.add_argument('--max_parameters', type=float, help='Specify a maximum number of parameters in the model (default 50M or 5e7).', default=5e7)
parser.add_argument('--timeout', type=int, help='Timeout for partial training (in seconds)(default 10800)', default=10800)
args, extra_args = parser.parse_known_args()
timeout_seconds = args.timeout
# Filters extra args that have the prefix `search_space`
search_extra_args = filter_extra_args(extra_args, 'search.')
search_config = Config(str(args.search_config), search_extra_args)['search']
config = Config(str(args.search_config), search_extra_args, resolve_env_vars=True)
search_config = config['search']
# Search space
ss_config = search_config['search_space']
@ -73,6 +78,9 @@ def main():
input_shape = (1, search_space.in_channels, *search_space.img_size[::-1])
partial_training_output = args.output_dir / 'partial_training_logs'
os.makedirs(partial_training_output, exist_ok=True)
# Search objectives
so = SearchObjectives()
@ -81,12 +89,15 @@ def main():
assert target_name in ['cpu', 'snp']
max_latency = 0.3 if target_name == 'cpu' else 0.185
algo_config = search_config['algorithm']
algo_params = algo_config.get('params', {})
max_parameters = float(algo_params.pop('max_parameters', 5e7))
# Adds a constraint on number of parameters so we don't sample models that are too large
so.add_constraint(
'Model Size (b)',
TorchNumParameters(),
constraint=(1e6, args.max_parameters)
constraint=(1e6, max_parameters)
)
# Adds a constrained objective on model latency so we don't pick models that are too slow.
@ -100,24 +111,17 @@ def main():
constraint=[0, max_latency]
)
aml_training = False
if target_name == 'snp':
# Gets connection string from env variable
env_var_name = target_config.pop('connection_str_env_var')
con_str = os.getenv(env_var_name)
if not con_str:
print("Please set environment variable {env_var_name} containing the Azure storage account connection " +
"string for the Azure storage account you want to use to control this experiment.")
sys.exit(1)
blob_container_name = target_config.pop('blob_container_name', 'models')
table_name = target_config.pop('table_name', 'status')
partition_key = target_config.pop('partition_key', 'main')
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
store = ArchaiStore(storage_account_name, storage_account_key, blob_container_name, table_name, partition_key)
aml_config = config['aml']
experiment_name = aml_config.get('experiment_name', 'facesynthetics')
store: ArchaiStore = configure_store(aml_config)
evaluator = RemoteAzureBenchmarkEvaluator(
input_shape=input_shape,
store=store,
experiment_name=experiment_name,
onnx_export_kwargs={'opset_version': 11},
**target_config
)
@ -129,20 +133,34 @@ def main():
compute_intensive=True
)
# Dataset provider
dataset_provider = FaceSyntheticsDatasetProvider(args.dataset_dir)
aml_training = 'training_cluster' in aml_config
partial_tr_obj = PartialTrainingValIOU(
dataset_provider,
tr_epochs=args.partial_tr_epochs,
output_dir=args.output_dir / 'partial_training_logs'
)
if not args.serial_training:
partial_tr_obj = RayParallelEvaluator(
partial_tr_obj, num_gpus=args.gpus_per_job,
max_calls=1
if aml_training:
# do the partial training on an AML gpu cluster
partial_tr_obj = AmlPartialTrainingEvaluator(
config,
tr_epochs=int(args.partial_tr_epochs),
timeout_seconds=timeout_seconds,
local_output=partial_training_output
)
else:
if args.dataset_dir is None:
raise ValueError('--dataset_dir must be specified if target is not aml')
# Dataset provider
dataset_provider = FaceSyntheticsDatasetProvider(args.dataset_dir)
partial_tr_obj = PartialTrainingValIOU(
dataset_provider,
tr_epochs=args.partial_tr_epochs,
output_dir=partial_training_output
)
if not args.serial_training:
partial_tr_obj = RayParallelEvaluator(
partial_tr_obj, num_gpus=args.gpus_per_job,
max_calls=1
)
so.add_objective(
'Partial Training Val. IOU',
@ -152,11 +170,11 @@ def main():
)
# Search algorithm
algo_config = search_config['algorithm']
algo = AVAILABLE_ALGOS[algo_config['name']](
search_space, so,
output_dir=args.output_dir, seed=args.seed,
**algo_config.get('params', {}),
output_dir=args.output_dir,
seed=args.seed,
**algo_params,
)
algo.search()

Просмотреть файл

@ -1,49 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<DirectedGraph xmlns="http://schemas.microsoft.com/vs/2009/dgml">
<Nodes>
<Node Id="..." Bounds="-891.259155273438,-397.317448474421,50,25.96" UseManualLocation="True" />
<Node Id="...1" Bounds="-899.17041015625,-141.018983432007,50,25.96" Label="..." UseManualLocation="True" />
<Node Id="AzureBlobStore" Bounds="-1193.71833333333,-519.02,106.03,25.96" Label="azure blob store" UseManualLocation="True" />
<Node Id="AzureStatusTable" Bounds="-1193.71833333333,-445.02,112.973333333333,25.96" Label="azure status table" UseManualLocation="True" />
<Node Id="JupyterNotebook" Bounds="-1396.73103800456,-328.529480438232,115.163333333333,25.96" Label="Jupyter Notebook" UseManualLocation="True" />
<Node Id="KubernetesCluster" Bounds="-1208.17041666667,-308.193725585938,119.78,25.96" Label="Kubernetes Cluster" UseManualLocation="True" />
<Node Id="QualcommDevice" Bounds="-769.718333333333,-534.029998779297,115.2,25.96" Label="Qualcomm device" UseManualLocation="True" />
<Node Id="QualcommDevice1" Bounds="-771.540833333333,-453.524998168945,115.2,25.96" Label="Qualcomm device" UseManualLocation="True" />
<Node Id="Runner.py" Bounds="-903.436666666667,-534.04,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py1" Bounds="-900.436666666666,-453.277548474121,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py2" Bounds="-899.452100016276,-313.899282537842,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py3" Bounds="-903.170422770182,-257.939182537842,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
<Node Id="Runner.py4" Bounds="-901.449353434245,-196.979083432007,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
<Node Id="SnpeQuantiation" Bounds="-760.452083333333,-251.989580438032,109.126666666667,25.96" Label="snpe quantiation" UseManualLocation="True" />
<Node Id="SnpeQuantization" Bounds="-765.452083333333,-319.111862182617,114.553333333333,25.96" Label="snpe quantization" UseManualLocation="True" />
<Node Id="Test_onnx" Bounds="-761.731038004557,-196.029480438032,70.6,25.96" Label="test_onnx" UseManualLocation="True" />
<Node Id="Upload" Bounds="-1358,-486,56.9533333333334,25.96" Label="upload" UseManualLocation="True" />
</Nodes>
<Links>
<Link Source="AzureBlobStore" Target="Runner.py" Bounds="-1087.68830810547,-518.622952023936,175.265282619028,9.66072548941548" />
<Link Source="AzureBlobStore" Target="Runner.py1" Bounds="-1088.87310498647,-493.671486483734,179.682249037085,42.8785481723203" />
<Link Source="AzureBlobStore" Target="Runner.py2" Bounds="-1123.20784148837,-493.060008544922,234.258958940773,173.798238915231" />
<Link Source="AzureStatusTable" Target="JupyterNotebook" Bounds="-1308.85493100781,-419.059993286133,149.124540767732,86.0330250379735" />
<Link Source="AzureStatusTable" Target="Runner.py" Bounds="-1100.05297851563,-505.965637207031,188.762878417969,60.9456481933594" />
<Link Source="AzureStatusTable" Target="Runner.py1" Bounds="-1128.9501953125,-459.267211914063,219.597229003906,14.2472229003906" />
<Link Source="AzureStatusTable" Target="Runner.py2" Bounds="-1125.57153320313,-419.059997558594,218.665649414063,102.314849853516" />
<Link Source="KubernetesCluster" Target="Runner.py2" Bounds="-1088.39039550781,-300.030978319804,179.940110471371,3.61429271166003" />
<Link Source="KubernetesCluster" Target="Runner.py3" Bounds="-1088.48298803671,-284.494147681096,176.453782358855,31.6319870791329" />
<Link Source="KubernetesCluster" Target="Runner.py4" Bounds="-1115.36108846197,-282.233729858398,207.979818936008,82.0059196880921" />
<Link Source="Runner.py" Target="AzureStatusTable" Bounds="-1131.49816894531,-516.163635253906,228.0615234375,69.8030700683594" />
<Link Source="Runner.py" Target="QualcommDevice" Bounds="-832.983324788411,-521.057729225212,54.2649914734869,0.00347075341346681" />
<Link Source="Runner.py1" Target="AzureStatusTable" Bounds="-1096.17529296875,-453.341644287109,195.738647460938,11.53271484375" />
<Link Source="Runner.py1" Target="QualcommDevice1" Bounds="-829.983324788411,-440.436044683939,49.4425034983809,0.0808848954610539" />
<Link Source="Runner.py2" Target="AzureStatusTable" Bounds="-1130.11853027344,-414.711029052734,230.666442871094,107.175262451172" />
<Link Source="Runner.py2" Target="SnpeQuantization" Bounds="-828.998766682943,-303.91817535817,54.5517017881938,1.8222062802526" />
<Link Source="Runner.py3" Target="SnpeQuantiation" Bounds="-832.717089436849,-243.665883447078,63.271065419621,2.32289765526392" />
<Link Source="Runner.py4" Target="Test_onnx" Bounds="-830.996020100911,-183.759789886113,60.2651897407106,0.409381159973066" />
<Link Source="Upload" Target="AzureBlobStore" Bounds="-1301.04664876302,-495.218628273773,98.4628791895163,17.2187510083821" />
<Link Source="Upload" Target="AzureStatusTable" Bounds="-1301.04664876302,-466.951233796788,99.2138483678966,21.1438396495243" />
</Links>
<Properties>
<Property Id="Bounds" DataType="System.Windows.Rect" />
<Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
<Property Id="UseManualLocation" DataType="System.Boolean" />
</Properties>
</DirectedGraph>

Двоичные данные
tasks/face_segmentation/snpe/images/system.png

Двоичный файл не отображается.

До

Ширина:  |  Высота:  |  Размер: 28 KiB

Просмотреть файл

@ -1,70 +0,0 @@
# Readme
This folder contains code that automates the testing of ONNX models across one or more machines that are connected via
USB to Qualcomm 888 boards.
The code is organized into:
1. [SNPE Device Code](snpe/readme.md) that knows how to use the Qualcomm Neural Processing SDK to talk to the device,
convert ONNX models to .dlc, quantize them, and test them on the board using the Android `adb` tool.
1. [Azure Code](azure/readme.md) that talks to a configured Azure storage account for uploading models to test,
downloading them, uploading test results, and keeping an Azure table "status" that summarizes results of all your
models.
1. [Docker](docker/quantizer/readme.md) scripts for setting up your Azure account and optionally creating a docker image
for running in an Azure Kubernetes cluster to do model quantization using the Qualcomm Neural Processing SDK.
Quantization is time consuming so having an elastic scale speeds things up a lot.
1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the results from the
Azure "status" table.
It is best if you setup a new Conda Python environment for Python 3.10 with the `requirements.txt` included here using:
```shell
pip install -r requirements.txt
```
The SNPE SDK only works on Linux, so you need a Linux machine with this repo. Then follow additional setup in each of
the above readmes.
## Workflow
The overall workflow looks like this. One or more Linux machines are setup as above and are running `azure/runner.py`
including a Kubernetes cluster setup for quantization (see [docker/quantizer](docker/quantizer) folder).
![system](images/system.png)
Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
the `find_work_prioritized` function in the runner. This script is completely restartable, and can distribute the work
across multiple instances of the runner script. Each instance will pick up where a previous one left off based on what
it finds in your "status" Azure table. The prioritization maps to the columns of the status table as follows:
1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30
1. **f1_onnx** compute f1 from onnxruntime on .onnx model on a 10k test set on Linux - priority 60
1. **f1_1k** compute f1 on quantized .dlc model on Qualcomm device DSP with a 1k test set - priority
is the mean f1 score so that quicker models are prioritized.
1. **f1_1k_f** compute f1 on floating point .dlc model on on Qualcomm device CPU with a 1k test set
- priority 10 * the mean f1 score so that quicker models are prioritized.
1. **f1_10k** compute f1 on quantized model on a 10k test set - priority = 100 * the mean f1 score
so that quicker models are prioritized.
Lower number means higher priority job and each machine will run the highest priority work first.
You can override the priority of a specific job by passing a `--proprity` parameter on the `upload.py` script or by
editing the Azure `status` table and adding a `priority` field to the JSON stored there. You can set any priority number
you want, if you specify priority 0 it will run before anything else which can be handy if you have a cool new model
that you want to bump to the top of the list.
Notice some of the above jobs can run on Linux and do not require Qualcomm device. So in order to maximize throughput on
machines that do have a Qualcomm devices you can allocate other Linux machines with no Qualcomm devices to do the other
work, namely, converting models, quantizing them, and running the `f1_onnx` test.
Folks across your team can use the `azure/upload.py` to submit jobs and let them run, or they can automate that as
shown in the `RemoteAzureBenchmarkEvaluator` in the `search.py` script.
You can use `status.py` to monitor progress or look at the Azure `status` table. Various status messages are posted
there so you can see which machine is doing what and is in what stage of the job.
Next you can go to the `notebook` page and get some pretty pictures of your Pareto Curves.

Просмотреть файл

@ -3,7 +3,8 @@
from pathlib import Path
from argparse import ArgumentParser
import os
import time
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
@ -12,64 +13,122 @@ from archai.datasets.cv.face_synthetics import FaceSyntheticsDatasetProvider
from archai.discrete_search.search_spaces.config import ArchConfig
from search_space.hgnet import StackedHourglass
from training.pl_trainer import SegmentationTrainingLoop
from archai.common.store import ArchaiStore
from archai.common.config import Config
parser = ArgumentParser()
parser.add_argument('arch', type=Path)
parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
parser.add_argument('--lr', type=float, default=2e-4)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--epochs', type=int, default=1)
parser.add_argument('--val_check_interval', type=float, default=1)
def main():
parser = ArgumentParser()
parser.add_argument('arch', type=Path)
parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
parser.add_argument('--lr', type=float, default=2e-4)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--epochs', type=int, default=1)
parser.add_argument('--val_check_interval', type=float, default=1.0)
parser.add_argument('--model_id', type=str, default=None)
parser.add_argument('--config', type=Path, default=None)
args = parser.parse_args()
model_id = args.model_id
store: ArchaiStore = None
epochs = 1 if args.epochs < 1 else args.epochs
start_time = time.time()
storing = False
config = args.config
experiment_name = None
if config and config.is_file():
config = Config(str(config))
if 'aml' in config:
# we are running in azure ml.
aml_config = config['aml']
metric_key = config['training'].get('metric_key')
connection_str = aml_config['connection_str']
experiment_name = aml_config['experiment_name']
storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(connection_str)
store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
storing = True
try:
if storing:
print(f'Locking entity {model_id}')
e = store.lock(model_id, 'training')
if e is None:
e = store.get_status(model_id)
node = e['node']
raise Exception(f'Entity should not be locked by: "{node}"')
pipeline_id = os.getenv('AZUREML_ROOT_RUN_ID')
if pipeline_id is not None:
e['pipeline_id'] = pipeline_id
store.merge_status_entity(e)
arch_config = ArchConfig.from_file(args.arch)
model = StackedHourglass(arch_config, num_classes=18)
pl_model = SegmentationTrainingLoop(model, lr=args.lr)
dataset_prov = FaceSyntheticsDatasetProvider(args.dataset_dir)
tr_dl = torch.utils.data.DataLoader(
dataset_prov.get_train_dataset(), batch_size=args.batch_size, num_workers=8,
shuffle=True
)
val_dl = torch.utils.data.DataLoader(
dataset_prov.get_val_dataset(), batch_size=args.batch_size, num_workers=8
)
callbacks = [
ModelCheckpoint(
dirpath=str(args.output_dir / 'checkpoints'),
monitor='validation_loss', mode='min',
save_last=True, save_top_k=1, verbose=True,
filename='{epoch}-{step}-{validation_loss:.2f}'
)
]
trainer = Trainer(
default_root_dir=str(args.output_dir), accelerator='gpu',
val_check_interval=args.val_check_interval,
max_epochs=epochs,
callbacks=callbacks
)
trainer.fit(pl_model, tr_dl, val_dl)
val_result = trainer.validate(trainer.model, val_dl)
print(val_result)
end_time = time.time()
if storing:
# post updated progress to our unified status table and unlock the row.
metric = float(val_result[0]['validation_mIOU'])
print(f"Storing {metric_key}={metric} for model {model_id}")
e = store.get_status(model_id)
e[metric_key] = metric
e['status'] = 'complete'
e['training_time'] = end_time - start_time
store.unlock_entity(e)
trainer.save_checkpoint(args.output_dir / 'model.ckpt')
# Save onnx model.
input_shape = (1, 3, 256, 256)
rand_range = (0.0, 1.0)
export_kwargs = {'opset_version': 11}
rand_min, rand_max = rand_range
sample_input = ((rand_max - rand_min) * torch.rand(*input_shape) + rand_min).type("torch.FloatTensor")
onnx_file = str(args.output_dir / 'model.onnx')
torch.onnx.export(model, (sample_input,), onnx_file, input_names=["input_0"], **export_kwargs, )
except Exception as ex:
# record failed state.
if storing:
e['status'] = 'failed'
e['error'] = str(ex)
store.unlock_entity(e)
if __name__ == '__main__':
args = parser.parse_args()
arch_config = ArchConfig.from_file(args.arch)
model = StackedHourglass(arch_config, num_classes=18)
pl_model = SegmentationTrainingLoop(model, lr=args.lr)
dataset_prov = FaceSyntheticsDatasetProvider(args.dataset_dir)
tr_dl = torch.utils.data.DataLoader(
dataset_prov.get_train_dataset(), batch_size=args.batch_size, num_workers=8,
shuffle=True
)
val_dl = torch.utils.data.DataLoader(
dataset_prov.get_val_dataset(), batch_size=args.batch_size, num_workers=8
)
callbacks = [
ModelCheckpoint(
dirpath=str(args.output_dir / 'checkpoints'),
monitor='validation_loss', mode='min',
save_last=True, save_top_k=1, verbose=True,
filename='{epoch}-{step}-{validation_loss:.2f}'
)
]
trainer = Trainer(
default_root_dir=str(args.output_dir), accelerator='gpu',
val_check_interval=args.val_check_interval,
max_epochs=args.epochs,
callbacks=callbacks
)
trainer.fit(pl_model, tr_dl, val_dl)
val_result = trainer.validate(trainer.model, val_dl)
print(val_result)
trainer.save_checkpoint(args.output_dir / 'final_model.ckpt')
# Save onnx model.
input_shape = (1, 3, 256, 256)
rand_range = (0.0, 1.0)
export_kwargs = {'opset_version': 11}
rand_min, rand_max = rand_range
sample_input = ((rand_max - rand_min) * torch.rand(*input_shape) + rand_min).type("torch.FloatTensor")
onnx_file = str(args.output_dir / 'final_model.onnx')
torch.onnx.export(model, (sample_input,), onnx_file, input_names=["input_0"], **export_kwargs, )
main()

Просмотреть файл

@ -14,7 +14,7 @@ class PartialTrainingValIOU(ModelEvaluator):
output_dir: str, tr_epochs: float = 1.0,
batch_size: int = 16, lr: float = 2e-4,
tr_dl_workers: int = 8, val_dl_workers: int = 8,
val_check_interval: float = 1):
val_check_interval: float = 1.0):
self.dataset_provider = dataset_provider
self.output_dir = Path(output_dir)
self.tr_epochs = tr_epochs