Add Azure ML running to the face segmentation task. (#217)

* add code owners * initial commit, beginnings of AML version of face synthetics search pipeline. * Add download_and_extract_zip Add download capability to FaceSyntheticsDataset Fix face segmentation data prep script. * fix bugs * cleanup launch.json * cleanup launch.json add download capability to FaceSyntheticsDataset add download_and_extract_zip helper * fix file count test * work in progress * work in progress * unify snpe status table and aml training table. * fix experiment referencing * fix experiment referencing * work in progress * fix complete status * fix bugs * fix bug * fix metric key, we have 2, one for remote snpe, and another for aml training pipelines. * pass seed through to the search.py script. * fix use of AzureMLOnBehalfOfCredential * fix bugs * fix bugs * publish new image * fix bugs * fix bugs * fix bug * maerge * revert * new version * fix bugs * rename the top level folder from 'snpe' to 'aml' and move all AML code into this folder except the top level entry point 'aml.py' make the keys returned from the JobCompletionMonitor wait method configurable Rename AmlPartialTrainingEvaluator and make it restartable. Turn off save_pareto_model_weights Remove redundant copy of JobCompletionMonitor * rev the versions. * updates to readme information. * only inference testing targets are 'cpu' and 'snp', trigger the aml partial training by a different key in the config file. * add iteration info * new version. * fix ordering of results from AmlPartialTrainingEvaluator * change AML batch size default to 64 for faster training don't store MODEL_STORAGE_CONNECTION_STRING * Fix bug in merge_status_entity, add more unit test coverage * new version * Store training time in status table. * improve diagram. * save iteration in status table. * pick up new version of archai to fix randomness bug in the EvolutionParetoSearch so that these search jobs are restartable.
2023-04-21 10:47:58 -07:00 · 2023-04-21 10:47:58 -07:00 · aae38db1a5
--- a/.gitignore
+++ b/.gitignore
@ -164,5 +164,6 @@ output/
 snpe-2.5.0.4052.zip
 android-ndk-r25b-linux.zip
 android-ndk-r25c-linux.zip
-tasks/face_segmentation/snpe/docker/quantizer/quantizer.yaml
+tasks/face_segmentation/aml/docker/quantizer/quantizer.yaml
 tasks/face_segmentation/.vscode/launch.json
+tasks/face_segmentation/conda.yaml
--- a/2
+++ b/2
@ -30,4 +30,6 @@
 /research/lm_eval_harness @gugarosa

 # Tasks
+/tasks/face_segmentation @piero2c
+/tasks/face_segmentation/aml @lovettchris
 /tasks/text_generation @gugarosa
--- a/archai/common/monitor.py
+++ b/archai/common/monitor.py
@ -56,8 +56,12 @@ class JobCompletionMonitor:
                        failed += 1
                        self.store.update_status_entity(e)
                    else:
-                        msg = f"{e['val_acc']}" if 'val_acc' in e else ''
-                        print(f'Training job {id} completed with validation accuracy: {msg}')
+                        if len(self.metric_keys) > 0 and self.metric_keys[0] in e:
+                            key = self.metric_keys[0]
+                            metric = e[key]
+                            print(f'Training job {id} completed with {key} = {metric}')
+                        else:
+                            print(f'Training job {id} completed')

            if len(waiting) == 0:
                break
@ -131,12 +135,16 @@ def main():
    parser.add_argument('--config', help='bin hexed config json info for MLClient')
    parser.add_argument('--timeout', type=int, help='pipeline timeout in seconds (default 1 hour)', default=3600)
    parser.add_argument('--model_path', required=True, help='mounted path containing the pending.json file')
-    parser.add_argument('--output', required=True, help='folder to write the results to)')
+    parser.add_argument('--output', required=True, help='folder to write the results to')
+    parser.add_argument('--metrics', type=str, help='metrics to return from the azure table')

    args = parser.parse_args()
    output = args.output
    timeout = args.timeout
    model_path = args.model_path
+    metrics = []
+    if args.metrics:
+        metrics = [x.strip() for x in args.metrics.split(',')]

    print(f"Monitor running with model_path={model_path}")
    if not os.path.isdir(model_path):
@ -177,7 +185,7 @@ def main():

    store = ArchaiStore(storage_account_name, storage_account_key)

-    monitor = JobCompletionMonitor(store, ml_client, timeout=timeout)
+    monitor = JobCompletionMonitor(store, ml_client, metrics, timeout=timeout)
    results = monitor.wait(model_ids)
    if output is not None:
        # save the results with updated validation accuracies to models.json
--- a/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/multi_node_search.ipynb
+++ b/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/multi_node_search.ipynb
@ -455,7 +455,8 @@
        "    \"Full Training Pipeline\", hex_config, scripts_dir, gpu_compute_name, \n",
        "    datastore_path, output_path, experiment_name, environment_name, full_epochs, save_models=True)\n",
        "                                                 \n",
-        "monitor_component = make_monitor_command(hex_config, scripts_dir, results_path, environment_name, timeout)"
+        "keys = ['val_acc']                                                 \n",
+        "monitor_component = make_monitor_command(hex_config, scripts_dir, results_path, environment_name, keys, timeout)"
      ]
    },
    {
--- a/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/aml_training_evaluator.py
+++ b/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/aml_training_evaluator.py
@ -11,7 +11,7 @@ from archai.discrete_search.api.model_evaluator import AsyncModelEvaluator
 from azure.ai.ml import MLClient, command, Input, Output, dsl
 from archai.common.store import ArchaiStore
 from shutil import copyfile
-from monitor import JobCompletionMonitor
+from archai.common.monitor import JobCompletionMonitor
 from training_pipeline import start_training_pipeline
 from utils import copy_code_folder

@ -73,7 +73,9 @@ class AmlTrainingValAccuracy(AsyncModelEvaluator):
        print(f'AmlTrainingValAccuracy: Started training pipeline: {job_id}')

        # wait for all the parallel training jobs to finish
-        monitor = JobCompletionMonitor(self.store, self.ml_client, job_id, self.timeout)
+        metric_key = 'vac_acc'
+        keys = [metric_key]
+        monitor = JobCompletionMonitor(self.store, self.ml_client, keys, job_id, self.timeout)
        results = monitor.wait(model_names)

        # save the results to the output folder (which is mapped by the AML pipeline to our
@ -93,7 +95,7 @@ class AmlTrainingValAccuracy(AsyncModelEvaluator):
        # not so good.
        accuracies = []
        for i, m in enumerate(results['models']):
-            val_acc = m['val_acc']
+            val_acc = m[metric_key]
            accuracies += [val_acc]

        print(f'AmlTrainingValAccuracy: fetch_all returning : {accuracies}')
--- a/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/commands.py
+++ b/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/commands.py
@ -46,12 +46,13 @@ def make_train_model_command(output_path, code_dir, environment_name, id,
    )


-def make_monitor_command(hex_config, code_dir, results_uri, environment_name, timeout=3600):
+def make_monitor_command(hex_config, code_dir, results_uri, environment_name, metrics=[], timeout=3600):
    """ This command waits up to some timeout for all the given training jobs to complete
     and returns the validation accuracy results """

    fixed_args = f'--config "{hex_config}" ' + \
-                 f'--timeout {timeout} '
+                 f'--timeout {timeout} ' + \
+                 f'--metrics {",".join(metrics)}'

    # this node depends on training and is therefore also not deterministic, so we set is_deterministic=False
    # which disables pipeline caching.
--- a/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/monitor.py
+++ b/docs/advanced_guide/cloud/azure/notebooks/multi_node_search/scripts/monitor.py
@ -1,188 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-import time
-import argparse
-import json
-import os
-from typing import List, Dict
-from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
-from azure.identity import DefaultAzureCredential
-from archai.common.store import ArchaiStore
-from azure.ai.ml import MLClient
-from azure.ai.ml import dsl
-
-
-class JobCompletionMonitor:
-    """ This helper class uses the ArchaiStore to monitor the status of some long running
-    training operations and the status of the Azure ML pipeline those jobs are running in
-    and waits for them to finish (either successfully or with a failure)"""
-    def __init__(self, store : ArchaiStore, ml_client : MLClient, pipeline_id=None, timeout=3600):
-        """
-        Initialize a JobCompletionMonitor instance.
-        :param store: an instance of ArchaiStore to monitor the status of some long running training operations
-        :param ml_client: an instance of MLClient to check the status of the Azure ML pipeline those jobs are running in
-        :param pipeline_id: (optional) the ID of the Azure ML pipeline to monitor, if not provided we can get this from the ArchaiStore.
-        :param timeout: (optional) the timeout in seconds
-        """
-        self.store = store
-        self.ml_client = ml_client
-        self.timeout = timeout
-        self.pipeline_id = pipeline_id
-
-    def wait(self, model_ids: List[str]) -> List[Dict[str, str]]:
-        """
-        Wait for all the training jobs to finish and return a list of dictionaries
-        containing details about each model, including their training validation accuracies.
-        :param model_ids: a list of training job IDs
-        :return: a list of dictionaries containing details about each model
-        """
-        completed = {}
-        waiting = list(model_ids)
-        start = time.time()
-        failed = 0
-
-        while len(waiting) > 0:
-            for i in range(len(waiting) - 1, -1, -1):
-                id = waiting[i]
-                e = self.store.get_status(id)
-                if self.pipeline_id is None and 'pipeline_id' in e:
-                    self.pipeline_id = e['pipeline_id']
-                if e is not None and 'status' in e and (e['status'] == 'completed' or e['status'] == 'failed'):
-                    del waiting[i]
-                    completed[id] = e
-                    if e['status'] == 'failed':
-                        error = e['error']
-                        print(f'Training job {id} failed with error: {error}')
-                        failed += 1
-                        self.store.update_status_entity(e)
-                    else:
-                        msg = f"{e['val_acc']}" if 'val_acc' in e else ''
-                        print(f'Training job {id} completed with validation accuracy: {msg}')
-
-            if len(waiting) == 0:
-                break
-
-            # check the overall pipeline status just in case training jobs failed to even start.
-            pipeline_status = None
-            try:
-                if self.pipeline_id is not None:
-                    train_job = self.ml_client.jobs.get(self.pipeline_id)
-                    if train_job is not None:
-                        pipeline_status = train_job.status
-            except Exception as e:
-                print(f'Error getting pipeline status for pipeline {self.pipeline_id}: {e}')
-
-            if pipeline_status is not None:
-                if pipeline_status == 'Completed':
-                    # ok, all jobs are done, which means if we still have waiting tasks then they failed to
-                    # even start.
-                    break
-                elif pipeline_status == 'Failed' or pipeline_status == 'Canceled':
-                    for id in waiting:
-                        e = self.store.get_status(id)
-                        if 'error' not in e:
-                            e['error'] = f'Pipeline {pipeline_status}'
-                        if 'status' not in e or e['status'] != 'completed':
-                            e['status'] = pipeline_status.lower()
-                        self.store.update_status_entity(e)
-                    raise Exception('Partial Training Pipeline failed')
-
-            if len(waiting) > 0:
-                if time.time() > self.timeout + start:
-                    break
-                print("AmlTrainingValAccuracy: Waiting 20 seconds for partial training to complete...")
-                time.sleep(20)
-
-        # awesome - they all completed!
-        if len(completed) == 0:
-            if time.time() > self.timeout + start:
-                raise Exception(f'Partial Training Pipeline timed out after {self.timeout} seconds')
-            else:
-                raise Exception('Partial Training Pipeline failed to start')
-
-        if failed == len(completed):
-            raise Exception('Partial Training Pipeline failed all jobs')
-
-        # stitch together the models.json file from our status table.
-        print('Top model results: ')
-        models = []
-        for id in model_ids:
-            row = {'id': id}
-            e = completed[id] if id in completed else {}
-            for key in ['nb_layers', 'kernel_size', 'hidden_dim', 'val_acc', 'job_id', 'status', 'error', 'epochs']:
-                if key in e:
-                    row[key] = e[key]
-            models += [row]
-
-        results = {
-            'models': models
-        }
-
-        timespan = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
-        print(f'Training: Distributed training completed in {timespan} ')
-        print(f'Training: returning {len(results)} results:')
-        print(json.dumps(results, indent=2))
-        return results
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config', help='bin hexed config json info for MLClient')
-    parser.add_argument('--timeout', type=int, help='pipeline timeout in seconds (default 1 hour)', default=3600)
-    parser.add_argument('--model_path', required=True, help='mounted path containing the pending.json file')
-    parser.add_argument('--output', required=True, help='folder to write the results to)')
-
-    args = parser.parse_args()
-    output = args.output
-    timeout = args.timeout
-    model_path = args.model_path
-
-    print(f"Monitor running with model_path={model_path}")
-    if not os.path.isdir(model_path):
-        raise Exception("### directory not found")
-
-    models_file = os.path.join(model_path, 'pending.json')
-    if not os.path.isfile(models_file):
-        raise Exception("### 'pending.json' not found in --model_path")
-
-    models = json.load(open(models_file))
-    model_ids = [m['id'] for m in models['models']]
-
-    identity = AzureMLOnBehalfOfCredential()
-    if args.config:
-        print("Using AzureMLOnBehalfOfCredential...")
-        workspace_config = str(bytes.fromhex(args.config), encoding='utf-8')
-        print(f"Config: {workspace_config}")
-        config = json.loads(workspace_config)
-    else:
-        print("Using DefaultAzureCredential...")
-        config_file = "../.azureml/config.json"
-        print(f"Config: {config_file}")
-        config = json.load(open(config_file, 'r'))
-        identity = DefaultAzureCredential()
-
-    subscription = config['subscription_id']
-    resource_group = config['resource_group']
-    workspace_name = config['workspace_name']
-    storage_account_key = config['storage_account_key']
-    storage_account_name = config['storage_account_name']
-
-    ml_client = MLClient(
-        identity,
-        subscription,
-        resource_group,
-        workspace_name
-    )
-
-    store = ArchaiStore(storage_account_name, storage_account_key)
-
-    monitor = JobCompletionMonitor(store, ml_client, timeout=timeout)
-    results = monitor.wait(model_ids)
-    if output is not None:
-        # save the results with updated validation accuracies to models.json
-        with open(os.path.join(output, 'models.json'), 'w') as f:
-            f.write(json.dumps(results, indent=2))
-
-
-if __name__ == "__main__":
-    main()
--- a/tasks/face_segmentation/snpe/.vscode/launch.json
+++ b/tasks/face_segmentation/snpe/.vscode/launch.json
@ -11,7 +11,9 @@
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true,
-            "args":["001c3ee6c74e05e63252b1ba3dfc58ca3cbb4f56"]
+            "args":[
+                "--search_config", "output/confs/aml_search.yaml"
+            ]
        }
    ]
 }
--- a/tasks/face_segmentation/README.md
+++ b/tasks/face_segmentation/README.md
@ -38,7 +38,7 @@ configurations based on the desired target (CPU or Snapdragon processor), `searc
 * [cpu_search.yaml](confs/cpu_search.yaml)
 * [snp_search.yaml](confs/snp_search.yaml)

-Note: to use `snp_search.yaml` you will need to follow the [SNP setup instructions](snpe/readme.md).
+Note: to use `snp_search.yaml` you will need to follow the [Azure ML setup instructions](aml/readme.md).

 By default, `search.py` will run multiple partial training jobs using Ray (2 jobs per GPU). To change the number of gpus
 per job, set `--gpus_per_job`, or use the `--serial_training` flag to disable parallel training jobs altogether.
--- a/tasks/face_segmentation/aml.py
+++ b/tasks/face_segmentation/aml.py
@ -0,0 +1,204 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+from argparse import ArgumentParser
+from pathlib import Path
+import os
+import sys
+import yaml
+from typing import Optional, Dict
+from azure.identity import DefaultAzureCredential
+from azure.ai.ml.entities import UserIdentityConfiguration
+from azure.ai.ml import MLClient
+from azure.ai.ml import command, Input, Output, dsl
+from archai.common.config import Config
+import archai.common.azureml_helper as aml_helper
+from archai.common.store import ArchaiStore
+from archai.common.file_utils import TemporaryFiles
+from shutil import copyfile, rmtree
+from aml.util.setup import register_datastore, configure_store, create_cluster, copy_code_folder
+
+
+confs_path = Path(__file__).absolute().parent / 'confs'
+
+
+def data_prep_component(environment_name, datastore_path):
+    return command(
+        name="data_prep",
+        display_name="Data preparation for training",
+        description="Downloads the remote dataset to our blob store.",
+        inputs={
+            "name": Input(type='string')
+        },
+        outputs={
+            "data": Output(type="uri_folder", path=datastore_path, mode="rw_mount")
+        },
+
+        # The source folder of the component
+        code='data_prep',
+        command="""python3 prep_data_store.py \
+                --path ${{outputs.data}} \
+                """,
+        environment=environment_name,
+    )
+
+
+def search_component(config, environment_name, seed, modelstore_path, output_path: Path):
+    # we need a folder containing all the specific code we need here, which is not everything in this repo.
+    scripts_path = output_path / 'scripts'
+    os.makedirs(str(scripts_path), exist_ok=True)
+    config_dir = scripts_path / 'confs'
+    os.makedirs(str(config_dir), exist_ok=True)
+    copyfile('search.py', str(scripts_path / 'search.py'))
+    copyfile('train.py', str(scripts_path / 'train.py'))
+    copy_code_folder('search_space', str(scripts_path / 'search_space'))
+    copy_code_folder('training', str(scripts_path / 'training'))
+    copy_code_folder(os.path.join('aml', 'training'), str(scripts_path / 'aml' / 'training'))
+    copy_code_folder(os.path.join('aml', 'util'), str(scripts_path / 'aml' / 'util'))
+    config.save(str(config_dir / 'aml_search.yaml'))
+
+    aml_config = config['aml']
+    timeout = int(aml_config.get('timeout', 3600))
+    con_str = aml_config['connection_str']
+
+    fixed_args = f'--seed {seed} --timeout {timeout} --search_config confs/aml_search.yaml'
+
+    return command(
+        name="search",
+        display_name="Archai search job",
+        description="Searches for the best face segmentation model.",
+        is_deterministic=False,
+        inputs={
+            "data": Input(type="uri_folder")
+        },
+        outputs={
+            "results": Output(type="uri_folder", path=modelstore_path, mode="rw_mount")
+        },
+        identity=UserIdentityConfiguration(),
+        # The source folder of the component
+        code=str(scripts_path),
+        environment_variables={'MODEL_STORAGE_CONNECTION_STRING': con_str},
+        command="""python3 search.py \
+                --dataset_dir ${{inputs.data}} \
+                --output_dir ${{outputs.results}} \
+                """ + fixed_args,
+        environment=environment_name,
+    )
+
+
+def main(output_dir: Path, experiment_name: str, seed: int):
+    if output_dir.exists():
+        rmtree(str(output_dir))
+    output_dir.mkdir(parents=True)
+
+    # Filters extra args that have the prefix `search_space`
+    config_file = str(confs_path / 'aml_search.yaml')
+    config = Config(config_file, resolve_env_vars=True)
+
+    aml_config = config['aml']
+    con_str = aml_config.get('connection_str', '$')
+    if '$' in con_str:
+        print("Please set environment variable MODEL_STORAGE_CONNECTION_STRING containing the Azure" +
+              "storage account connection string for the Azure storage account you want to use to " +
+              "control this experiment.")
+        return 1
+
+    workspace_name = aml_config['workspace_name']
+    subscription_id = aml_config['subscription_id']
+    resource_group_name = aml_config['resource_group']
+
+    # extract conda.yaml.
+    with open('conda.yaml', 'w') as f:
+        yaml.dump(aml_config['environment'].to_dict(), f)
+
+    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
+    print(f'Using storage account: {storage_account_name}')
+
+    ml_client = MLClient(
+        credential=DefaultAzureCredential(),
+        subscription_id=subscription_id,
+        resource_group_name=resource_group_name,
+        workspace_name=workspace_name
+    )
+    print(f'Using workspace "{ml_client.workspace_name}" in resource group "{ml_client.resource_group_name}"')
+
+    # Create aml computer clusters
+    cpu_compute_name = create_cluster(ml_client, aml_config, 'search_cluster')
+    create_cluster(ml_client, aml_config, 'training_cluster')
+
+    archai_job_env = aml_helper.create_environment_from_file(
+        ml_client,
+        image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
+        conda_file="conda.yaml",
+        version='1.0.20')
+    environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
+
+    # Register the datastore with AML
+    data_store_name = 'datasets'
+    data_container_name = 'datasets'
+    model_store_name = 'models'
+    model_container_name = aml_config.get('blob_container_name', 'models')
+
+    # register our azure datastores
+    results_path = register_datastore(ml_client, model_store_name, model_container_name, storage_account_name, storage_account_key, experiment_name)
+    datastore_path = register_datastore(ml_client, data_store_name, data_container_name, storage_account_name, storage_account_key, experiment_name)
+
+    # save this in the output folder so it can be found by pipeline components.
+    aml_config['experiment_name'] = experiment_name
+    aml_config['environment_name'] = environment_name
+    aml_config['datastore_path'] = datastore_path
+    aml_config['results_path'] = results_path
+
+    # make sure the datasets container exists
+    store = configure_store(aml_config, data_container_name)
+
+    # make sure the models container exists
+    store = configure_store(aml_config, model_container_name)
+    with TemporaryFiles() as tmp_files:
+        filename = tmp_files.get_temp_file()
+        aml_config['connection_str'] = "${MODEL_STORAGE_CONNECTION_STRING}"
+        config.save(filename)
+        aml_config['connection_str'] = con_str
+        store.upload_blob(f"{experiment_name}/config", filename, 'aml_search.yaml')
+
+    @dsl.pipeline(
+        compute=cpu_compute_name,
+        description="FaceSynthetics Archai search pipeline",
+    )
+    def archai_search_pipeline():
+
+        data_prep_job = data_prep_component(environment_name, datastore_path)(
+            name=experiment_name
+        )
+
+        search_job = search_component(config, environment_name, seed, results_path, output_dir)(
+            data=data_prep_job.outputs.data
+        )
+
+        return {
+            "results": search_job.outputs.results
+        }
+
+    pipeline_job = ml_client.jobs.create_or_update(
+        archai_search_pipeline(),
+        # Project's name
+        experiment_name=experiment_name,
+    )
+
+    import webbrowser
+    webbrowser.open(pipeline_job.services["Studio"].endpoint)
+
+    job_name = pipeline_job.name
+    print(f'Started pipeline: {job_name}')
+
+    return 0
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser("""This script runs the search in an Azure ML workspace.""")
+    parser.add_argument('--output_dir', type=Path, help='Output directory for downloading results.', default='output')
+    parser.add_argument('--experiment_name', default='facesynthetics')
+    parser.add_argument('--seed', type=int, help='Random seed', default=42)
+
+    args = parser.parse_args()
+    rc = main(args.output_dir, args.experiment_name, args.seed)
+    sys.exit(rc)
--- a/tasks/face_segmentation/snpe/.flake8
+++ b/tasks/face_segmentation/snpe/.flake8
--- a/tasks/face_segmentation/aml/init.py
+++ b/tasks/face_segmentation/aml/init.py
--- a/tasks/face_segmentation/snpe/azure/cleanup_stale_pods.py
+++ b/tasks/face_segmentation/snpe/azure/cleanup_stale_pods.py
@ -42,11 +42,12 @@ def cleanup_stale_pods(store: ArchaiStore):


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)

    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    cleanup_stale_pods(store)
--- a/tasks/face_segmentation/snpe/azure/compare_status.py
+++ b/tasks/face_segmentation/snpe/azure/compare_status.py
--- a/tasks/face_segmentation/snpe/azure/delete.py
+++ b/tasks/face_segmentation/snpe/azure/delete.py
@ -15,15 +15,16 @@ def delete(con_str):
    args = parser.parse_args()

    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    store.delete_blobs(args.name, args.file)
    if not args.file:
        store.delete_status(args.name)


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    delete(con_str)
+    delete(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/download.py
+++ b/tasks/face_segmentation/snpe/azure/download.py
@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def download(con_str):
+def download(con_str, experiment_name):
    parser = argparse.ArgumentParser(
        description="Download assets from azure blob store using friendly name.")
    parser.add_argument('--name', help='Friendly name of model to download (if not provided it downloads them all')
@ -16,7 +16,7 @@ def download(con_str):
    args = parser.parse_args()

    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    friendly_name = args.name
    if not friendly_name:
        friendly_names = [e['name'] for e in store.get_all_status_entities()]
@ -32,8 +32,9 @@ def download(con_str):


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    download(con_str)
+    download(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/find_unsteady_runs.py
+++ b/tasks/face_segmentation/snpe/azure/find_unsteady_runs.py
--- a/tasks/face_segmentation/snpe/azure/list.py
+++ b/tasks/face_segmentation/snpe/azure/list.py
@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def list_models(con_str):
+def list_models(con_str, experiment_name):
    parser = argparse.ArgumentParser(
        description="List all azure blob store assets.")
    parser.add_argument('--prefix', type=str, required=True, default=None,
@ -17,14 +17,15 @@ def list_models(con_str):
    prefix = args.prefix

    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    for blob in store.list_blobs(prefix):
        print(blob)


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    list_models(con_str)
+    list_models(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/loop.sh
+++ b/tasks/face_segmentation/snpe/azure/loop.sh
--- a/tasks/face_segmentation/snpe/azure/readme.md
+++ b/tasks/face_segmentation/snpe/azure/readme.md
@ -12,7 +12,7 @@ named `Connection string`.
 In Linux you should use double quotes around the connection string like this:

 ```
-export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=mymodels;AccountKey=...==;EndpointSuffix=core.windows.net"
+export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=...EndpointSuffix=core.windows.net"
 ```

 You'll use it a lot so it is handy if you put it in your `~/.profile`.
@ -44,13 +44,14 @@ Then to get the ball rolling create a temp folder and run this:

 ```
 mkdir -p ~/experiment
-python ~/git/archai/tasks/face_Segmentation/snpe/azure/runner.py --working ~/experiment
+python ~/git/archai/tasks/face_Segmentation/aml/azure/runner.py --working ~/experiment
 ```

 This will monitor the Azure blob store for new work to do, and run those jobs in priority order.  If you also provide a
 `--device` option pointing to the `adb device` for a Qualcomm 888 Dev Board then it will also run the quantized models
-on that device and report the performance and F1 score results.
+on that device and report the performance and F1 score results.  If the row in the azure table contains the property
+`benchmark_only` equal to 1, then it will skip computing any F1 scores.

 If you setup a quantization only runner in the cloud using the `docker/quantizer` image, you can pass
 `--no_quantization` argument when you have a `--device` so that the local runs do not do quantization. This will stop
-your linux machine from getting overloaded with quantization work so it can focus on the SNPE device workloads.
+your linux machine from getting overloaded with quantization work so it can focus on managing the SNPE device workloads.
--- a/tasks/face_segmentation/snpe/azure/report_device_usage.py
+++ b/tasks/face_segmentation/snpe/azure/report_device_usage.py
--- a/tasks/face_segmentation/snpe/azure/reset.py
+++ b/tasks/face_segmentation/snpe/azure/reset.py
@ -8,13 +8,13 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def reset(con_str):
+def reset(con_str, experiment_name):
    parser = argparse.ArgumentParser(
        description='Reset the named entity.')
    parser.add_argument('name', help='The friendly name to reset or "*" to reset all rows', default=None)
    args = parser.parse_args()
    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)

    entities = []
    if args.name == "*":
@ -29,15 +29,15 @@ def reset(con_str):

    for e in entities:
        name = e['name']
-        print(f"Resetting {name}")
        store.reset(e['name'], ['benchmark_only', 'model_date'])
        store.delete_blobs(name, 'model.dlc')
        store.delete_blobs(name, 'model.quant.dlc')


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    reset(con_str)
+    reset(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/runner.py
+++ b/tasks/face_segmentation/snpe/azure/runner.py
@ -134,7 +134,7 @@ def record_error(entity, error_message):
    store.merge_status_entity(entity)


-def convert(name, entity, long_name, model_path):
+def convert(experiment, name, entity, long_name, model_path):
    global store
    log("Converting model: " + long_name)
    entity['model_name'] = long_name
@ -153,12 +153,12 @@ def convert(name, entity, long_name, model_path):
        store.merge_status_entity(entity)

    log("Uploading converted model: " + model)
-    store.upload_blob(name, model)
-
+    blob_name = f'{experiment}/{name}'
+    store.upload_blob(blob_name, model)
    return model


-def quantize(name, entity, onnx_model, model):
+def quantize(experiment, name, entity, onnx_model, model):
    global store
    log("Quantizing model: " + name + "...")
    log(" (Please be patient this can take a while, up to 10 minutes or more)")
@ -176,7 +176,8 @@ def quantize(name, entity, onnx_model, model):

    # save the quantized .dlc since it takes so long to produce.
    log("Uploading quantized model: " + model)
-    store.upload_blob(name, model)
+    blob_name = f'{experiment}/{name}'
+    store.upload_blob(blob_name, model)
    return model


@ -302,7 +303,7 @@ def get_avg_latency(latencies):
    return sum / count


-def benchmark(entity, onnx_model, model, name, test_input):
+def benchmark(experiment, entity, onnx_model, model, name, test_input):
    global BENCHMARK_RUN_COUNT, CLEAR_RANDOM_INPUTS, store, usage

    # next highest priority is to get benchmark times
@ -329,7 +330,7 @@ def benchmark(entity, onnx_model, model, name, test_input):
        add_usage(usage, get_device(), start, end)

        for file in glob.glob(os.path.join(output_dir, 'perf_results*.csv')):
-            store.upload_blob(name, file)
+            store.upload_blob(f'{experiment}/{name}', file)

        total_inference_avg = get_total_inference_avg(entity)
        total_inference_avg += [ifs]
@ -359,7 +360,7 @@ def ensure_complete(entity):
        store.merge_status_entity(entity)


-def run_model(name, dataset, use_device, benchmark_only, no_quantization):
+def run_model(experiment, name, dataset, use_device, benchmark_only, no_quantization):
    global store, usage
    log("===================================================================================================")
    log(f"Checking model: {name} on node {get_unique_node_id()}")
@ -385,7 +386,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):

    entity = store.get_status(name)

-    downloaded = store.download(name, model_dir, r'.*\.onnx$')
+    blob_name = f'{experiment}/{name}'
+    downloaded = store.download(blob_name, model_dir, r'.*\.onnx$')
    if len(downloaded) == 0 or not os.path.isfile(downloaded[0]):
        record_error(entity, 'missing model')
        log(f"### no model found for {name}")
@ -395,8 +397,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):

    # see if we have converted the model or not.
    # do this first no matter what.
-    converted = len(store.list_blobs(f'{name}/model.dlc')) > 0
-    is_quantized = len(store.list_blobs(f'{name}/model.quant.dlc')) > 0
+    converted = len(store.list_blobs(f'{blob_name}/model.dlc')) > 0
+    is_quantized = len(store.list_blobs(f'{blob_name}/model.quant.dlc')) > 0
    if not is_quantized:
        # oh, the quant model disappeared so clear the flag so it gets
        # quantized again by a machine that can do that.
@ -411,11 +413,11 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
        converted = False

    if not converted:
-        model = convert(name, entity, long_name, onnx_model)
+        model = convert(experiment, name, entity, long_name, onnx_model)
        if model == 'error':
            return
    elif converted:
-        downloaded = store.download(name, snpe_model_dir, 'model.dlc')
+        downloaded = store.download(blob_name, snpe_model_dir, 'model.dlc')
        if len(downloaded) == 0:
            raise Exception('### internal error, the model.dlc download failed!')
    elif not is_quantized and not converted:
@ -426,7 +428,7 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
    # see if we have a quantized model or not.
    model = os.path.join(snpe_model_dir, 'model.dlc')
    if not is_quantized:
-        model = quantize(name, entity, onnx_model, model)
+        model = quantize(experiment, name, entity, onnx_model, model)
        if model == 'error':
            return
        entity['quantized'] = True
@ -439,7 +441,7 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):

    quantized_model = os.path.join(snpe_model_dir, 'model.quant.dlc')
    if not os.path.isfile(quantized_model):
-        downloaded = store.download(name, snpe_model_dir, 'model.quant.dlc')
+        downloaded = store.download(blob_name, snpe_model_dir, 'model.quant.dlc')
        if len(downloaded) == 0 or not os.path.isfile(downloaded[0]):
            raise Exception("??? quantized model should exist at this point...")
        quantized_model = downloaded[0]
@ -453,14 +455,14 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
        csv_file = os.path.join(snpe_model_dir, 'model.quant.info.csv')
        with open(csv_file, 'w') as f:
            f.write(csv_data)
-        store.upload_blob(name, csv_file)
+        store.upload_blob(blob_name, csv_file)
        return

    input_shape = eval(entity['shape'])
    if use_device:
        check_dataset(input_shape, 'test', 1000)
        test_input = os.path.realpath(os.path.join('data', 'test'))
-        if benchmark(entity, onnx_model, quantized_model, name, test_input):
+        if benchmark(experiment, entity, onnx_model, quantized_model, name, test_input):
            return

    if benchmark_only:
@ -533,8 +535,8 @@ def run_model(name, dataset, use_device, benchmark_only, no_quantization):
    log(f"### Saving {prop} score of {f1score}")
    entity[prop] = f1score
    store.merge_status_entity(entity)
-    store.upload_blob(name, test_results, f"test_results_{prop}.csv")
-    store.upload_blob(name, chart, f"pr_curve_{prop}.png")
+    store.upload_blob(blob_name, test_results, f"test_results_{prop}.csv")
+    store.upload_blob(blob_name, chart, f"pr_curve_{prop}.png")

    if 'f1_1k' in entity and 'f1_10k' in entity and 'f1_1k_f' in entity and 'f1_onnx' in entity:
        ensure_complete(entity)
@ -629,6 +631,7 @@ def find_work_prioritized(use_device, benchmark_only, subset_list, no_quantizati
        elif use_device and (total_benchmark_runs < MAX_BENCHMARK_RUNS):
            priority = 30 + total_benchmark_runs
        elif is_benchmark_only(entity, benchmark_only):
+            # this model is done!
            continue
        elif not is_complete(entity, 'f1_onnx'):
            priority = 60
@ -678,7 +681,7 @@ class MemoryMonitor:
        return growth


-def monitor(dataset, use_device, benchmark_only, subset_list, no_quantization):
+def monitor(experiment, dataset, use_device, benchmark_only, subset_list, no_quantization):
    global rss_start, store, usage

    logging.basicConfig(filename=LOG_FILE_NAME, filemode='a',
@ -721,7 +724,7 @@ def monitor(dataset, use_device, benchmark_only, subset_list, no_quantization):
            gc.collect()
            tracemalloc.start()
            snapshot1 = tracemalloc.take_snapshot()
-            run_model(name, dataset, use_device, benchmark_only_flag, no_quantization)
+            run_model(experiment, name, dataset, use_device, benchmark_only_flag, no_quantization)
            gc.collect()
            snapshot2 = tracemalloc.take_snapshot()
            for i in snapshot2.compare_to(snapshot1, 'lineno')[:10]:
@ -759,12 +762,13 @@ def get_storage_account(con_str):

 def setup_store():
    global store, usage
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    conn_string = os.getenv(CONNECTION_NAME)
    if not conn_string:
        log(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(conn_string)
-    store = ArchaiStore(storage_account_name, storage_account_key, table_name='status')
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    usage = ArchaiStore(storage_account_name, storage_account_key, table_name='usage')
    return conn_string

@ -851,5 +855,7 @@ if __name__ == '__main__':
        check_stale_pods(args.cleanup_stale_pods)

    dataset = os.getenv("INPUT_DATASET")
-    rc = monitor(dataset, device is not None, args.benchmark, subset, args.no_quantization)
+    experiment = os.getenv("EXPERIMENT_NAME", "facesynthetics")
+
+    rc = monitor(experiment, dataset, device is not None, args.benchmark, subset, args.no_quantization)
    sys.exit(rc)
--- a/tasks/face_segmentation/snpe/azure/status.py
+++ b/tasks/face_segmentation/snpe/azure/status.py
@ -9,7 +9,7 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def status(con_str):
+def status(con_str, experiment_name):
    parser = argparse.ArgumentParser(description='Print status in .csv format')
    parser.add_argument('--status', help='Optional match for the status column (default None).')
    parser.add_argument('--name', help='Optional name of single status row to return (default None).')
@ -18,7 +18,7 @@ def status(con_str):
    parser.add_argument('--cols', help='Comma separated list of columns to report (default is to print all)')
    args = parser.parse_args()
    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    entities = store.get_all_status_entities(args.status, args.not_equal)
    if args.locked:
        entities = [e for e in entities if 'node' in e and e['node']]
@ -32,8 +32,9 @@ def status(con_str):


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    status(con_str)
+    status(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/unlock.py
+++ b/tasks/face_segmentation/snpe/azure/unlock.py
@ -8,19 +8,20 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def unlock(con_str):
+def unlock(con_str, experiment_name):
    parser = argparse.ArgumentParser(
        description='Unlock all jobs for given node or unlock all jobs.')
    parser.add_argument('--node', help='Optional node name (default None).')
    args = parser.parse_args()
    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
    store.unlock_all(args.node)


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    unlock(con_str)
+    unlock(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/upload.py
+++ b/tasks/face_segmentation/snpe/azure/upload.py
@ -8,7 +8,7 @@ from archai.common.store import ArchaiStore
 CONNECTION_NAME = 'MODEL_STORAGE_CONNECTION_STRING'


-def upload(con_str, args):
+def upload(con_str, experiment_name, args):
    parser = argparse.ArgumentParser(description='Upload a named model (and optional accompanying files) to your ' +
                                     'azure blob store')
    parser.add_argument('name', help='Friendly name of the folder to put this in.')
@ -19,13 +19,14 @@ def upload(con_str, args):
    parser.add_argument('--reset', help='Reset stats for the model if it exists already.', action="store_true")
    args = parser.parse_args(args)
    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-    store = ArchaiStore(storage_account_name, storage_account_key)
-    store.upload(args.name, args.file, args.reset, priority=args.priority)
+    store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
+    store.upload(f'{experiment_name}/args.name', args.file, args.reset, priority=args.priority)


 if __name__ == '__main__':
+    experiment_name = os.getenv("EXPERIMENT_NAME", "facesynthetics")
    con_str = os.getenv(CONNECTION_NAME)
    if not con_str:
        print(f"Please specify your {CONNECTION_NAME} environment variable.")
        sys.exit(1)
-    upload(con_str)
+    upload(con_str, experiment_name)
--- a/tasks/face_segmentation/snpe/azure/usage.py
+++ b/tasks/face_segmentation/snpe/azure/usage.py
--- a/tasks/face_segmentation/snpe/docker/quantizer/Dockerfile
+++ b/tasks/face_segmentation/snpe/docker/quantizer/Dockerfile
@ -62,13 +62,13 @@ RUN wget -O azcopy_v10.tar.gz https://aka.ms/downloadazcopy-v10-linux && tar -xf

 # this echo is a trick to bypass docker build cache.
 # simply change the echo string every time you want docker build to pull down new bits.
-RUN echo '04/06/2023 06:28 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
-RUN cd archai && git checkout task_segmentation && pip install -e .[dev]
+RUN echo '04/18/2023 12:22 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
+RUN cd archai && git checkout clovett/aml && pip install -e .[dev]

 RUN echo "using this pip version: " && which pip
 RUN echo "using this python version: " && which python

-RUN pushd /home/archai/archai/tasks/face_segmentation/snpe && \
+RUN pushd /home/archai/archai/tasks/face_segmentation/aml && \
    python --version && \
    pip install -r requirements.txt

--- a/tasks/face_segmentation/snpe/docker/quantizer/cleanup.ps1
+++ b/tasks/face_segmentation/snpe/docker/quantizer/cleanup.ps1
--- a/tasks/face_segmentation/snpe/docker/quantizer/quantizer.template.yaml
+++ b/tasks/face_segmentation/snpe/docker/quantizer/quantizer.template.yaml
--- a/tasks/face_segmentation/snpe/docker/quantizer/readme.md
+++ b/tasks/face_segmentation/snpe/docker/quantizer/readme.md
@ -33,7 +33,7 @@ talk to the right azure storage account.  On linux this would be an export in yo
 don't forget the double quotes.

 ```
-export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=mymodels;AccountKey=...==;EndpointSuffix=core.windows.net"
+export MODEL_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=...==;EndpointSuffix=core.windows.net"
 ```

 ## Dockerfile
@ -44,11 +44,9 @@ you increase your Qualcomm device utilization.

 The `setup.ps1` script shows what docker commands to run to build the image, how to login to your
 azure docker container registry, how to take your image for that container registry and push it
-to Azure.  So you do not need to use the public docker.org container registry.  You will decide
-what version number to attach to your image here and the same version needs to be specified in the
-following `quantizer.yaml`.
+to Azure.  So you do not need to use the public docker.org container registry.

-You can also test your docker image locally by running:
+You can test your docker image locally by running:
 ```
 docker run -e MODEL_STORAGE_CONNECTION_STRING=$MODEL_STORAGE_CONNECTION_STRING -it <image_id>
 ```
@ -75,7 +73,7 @@ that shows a string like this:
 ```
 az aks get-credentials --resource-group snpe-quantizaton-rg --name snpe-quantizer-aks
 ```
-Run that locally and then you can push docker images to this registry.:
+Run that locally and then you can push docker images to this registry:

 ```
 docker push snpecontainerregistry001.azurecr.io/quantizer:1.27
@ -86,9 +84,7 @@ increment this version number each time it runs in case you need to push new ver

 ## quantizer.yaml

-Then you can use `kubectl apply -f quantizer.yaml` to configure the AKS custer.  Note that the version
-of the image to use is specified in this file so you may need to edit the file and change the
-version `1.13` to whatever you just tagged and pushed to the azure container registry.
+Then you can use `kubectl apply -f quantizer.yaml` to deploy this new image version to your AKS custer.

 Notice this yaml configures AKS to scale up to 100 nodes if necessary and the scaling is triggered
 when a given node passes 40% CPU utilization.  You can tweak these numbers however you like to fit
@ -132,4 +128,6 @@ face segmentation ONNX model to do a test run.  You can train one of these model
 ## run.sh

 This little script is used as the entry point to the Docker image, you will see this in the last
-`RUN` command in the Dockerfile.
+`RUN` command in the Dockerfile. The reason this `run.sh` contains a loop is because the Python
+script checks for memory leaks and auto-terminates itself if it sees memory usage climb too high.
+This way the quantizer pod can run pretty much forever, or at least until you deploy a new version.
--- a/tasks/face_segmentation/snpe/docker/quantizer/requirements.txt
+++ b/tasks/face_segmentation/snpe/docker/quantizer/requirements.txt
--- a/tasks/face_segmentation/snpe/docker/quantizer/run.sh
+++ b/tasks/face_segmentation/snpe/docker/quantizer/run.sh
@ -17,7 +17,7 @@ pushd /home/archai/experiment

 while true
 do
-    python -u /home/archai/archai/tasks/face_segmentation/snpe/azure/runner.py
+    python -u /home/archai/archai/tasks/face_segmentation/aml/azure/runner.py
    if [ $? != 0 ]; then
      echo "Script returned an error code!"
    fi
--- a/tasks/face_segmentation/snpe/docker/quantizer/setup.ps1
+++ b/tasks/face_segmentation/snpe/docker/quantizer/setup.ps1
@ -246,4 +246,4 @@ Write-Host "  kubectl apply -f quantizer.yaml"

 Write-Host ""
 Write-Host "### To run the runner script locally please set the following environment variable: "
-Write-HOst "set MODEL_STORAGE_CONNECTION_STRING=$conn_str"
+Write-Host "set MODEL_STORAGE_CONNECTION_STRING=$conn_str"
--- a/tasks/face_segmentation/aml/images/portal.png
+++ b/tasks/face_segmentation/aml/images/portal.png
--- a/tasks/face_segmentation/aml/images/snpe.dgml
+++ b/tasks/face_segmentation/aml/images/snpe.dgml
@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="utf-8"?>
+<DirectedGraph GraphDirection="TopToBottom" Layout="Sugiyama" Offset="-1829.8148940022802,-883.0494917160034" ZoomLevel="1" xmlns="http://schemas.microsoft.com/vs/2009/dgml">
+  <Nodes>
+    <Node Id="..." Bounds="-891.259155273438,-398.550804903068,50,25.96" UseManualLocation="True" />
+    <Node Id="...1" Bounds="-996.17041015625,-122.697003871574,50,25.96" Label="..." UseManualLocation="True" />
+    <Node Id="AzureBlobStore" Category="storage" Bounds="-1193.71833333333,-533.830712207031,106.03,73.6213311767578" Label="azure blob store" UseManualLocation="True" />
+    <Node Id="AzureStatusTable" Category="storage" Bounds="-1192.64078544617,-430.209281030274,112.973333333333,73.6213311767578" Label="azure status table" UseManualLocation="True" />
+    <Node Id="JupyterNotebook" Category="script" Bounds="-1374.80858579,-377.363886517334,115.163333333333,62.964467010498" Label="Jupyter Notebook" UseManualLocation="True" />
+    <Node Id="KubernetesCluster" Category="cluster" Bounds="-1195.17041666667,-264.359319506836,119.78,62.9644670104981" Label="Kubernetes Cluster" UseManualLocation="True" />
+    <Node Id="QualcommDevice" Category="device" Bounds="-767.554234593709,-540.257548474121,115.2,62.96" Label="Qualcomm device" UseManualLocation="True" />
+    <Node Id="QualcommDevice1" Category="device" Bounds="-767.259849828084,-447.297448474121,115.2,62.96" Label="Qualcomm device" UseManualLocation="True" />
+    <Node Id="Runner.py" Category="script" Bounds="-905.306400171916,-584.479938924064,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
+    <Node Id="Runner.py1" Category="script" Bounds="-905.012036539714,-491.515371913566,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
+    <Node Id="Runner.py2" Category="script" Bounds="-901.180981852214,-342.590704903068,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
+    <Node Id="Runner.py3" Category="script" Bounds="-903.170422770182,-249.62613789257,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
+    <Node Id="Runner.py4" Category="script" Bounds="-901.449353434245,-156.661570882072,70.4533333333334,62.9644670104981" Label="runner.py" UseManualLocation="True" />
+    <Node Id="SnpeQuantiation" Category="quantize" Bounds="-760.452083333333,-255.710307686227,109.126666666667,62.964467010498" Label="snpe quantiation" UseManualLocation="True" />
+    <Node Id="SnpeQuantization" Category="quantize" Bounds="-763.723191502889,-348.674874696725,114.553333333333,62.9644670104981" Label="snpe quantization" UseManualLocation="True" />
+    <Node Id="Test_onnx" Category="test" Bounds="-761.731038004557,-162.745740675729,70.5999999999999,47.96" Label="test_onnx" UseManualLocation="True" />
+    <Node Id="Upload" Category="script" Bounds="-1358,-486,56.9533333333333,62.964467010498" Label="upload" UseManualLocation="True" />
+  </Nodes>
+  <Links>
+    <Link Source="AzureBlobStore" Target="Runner.py" Bounds="-1087.68836914063,-543.888146126435,173.568538449327,35.9021208699053" />
+    <Link Source="AzureBlobStore" Target="Runner.py1" Bounds="-1087.68836914063,-489.782197817309,173.759053025964,23.7223370430618" />
+    <Link Source="AzureBlobStore" Target="Runner.py2" Bounds="-1088.21890281343,-461.505920916907,179.584023885674,121.517239678322" />
+    <Link Source="AzureStatusTable" Target="JupyterNotebook" Bounds="-1250.94000054214,-378.575417826448,58.2992201059451,15.298820648026" />
+    <Link Source="AzureStatusTable" Target="Runner.py" Bounds="-1079.66748046875,-523.59619140625,166.653381347656,105.640502929688" />
+    <Link Source="AzureStatusTable" Target="Runner.py1" Bounds="-1079.66748046875,-451.611999511719,165.948486328125,41.5036010742188" />
+    <Link Source="AzureStatusTable" Target="Runner.py2" Bounds="-1079.66748046875,-379.401794433594,169.876037597656,51.8286743164063" />
+    <Link Source="KubernetesCluster" Target="...1" Bounds="-1093.33052927498,-201.394849319458,97.6659896150068,73.2956685371913" />
+    <Link Source="KubernetesCluster" Target="Runner.py2" Bounds="-1075.39039550781,-298.365678712837,165.566640952845,48.0922930284574" />
+    <Link Source="KubernetesCluster" Target="Runner.py3" Bounds="-1075.39039550781,-229.576487801222,163.233609160429,8.99595989414743" />
+    <Link Source="KubernetesCluster" Target="Runner.py4" Bounds="-1075.39039550781,-208.904464362079,165.585551579934,66.2801678235037" />
+    <Link Source="Runner.py" Target="AzureStatusTable" Bounds="-1072.13012695313,-535.250061035156,166.82373046875,105.237548828125" />
+    <Link Source="Runner.py" Target="QualcommDevice" Bounds="-834.853066838583,-543.269532104768,58.6235405861986,16.1894387709768" />
+    <Link Source="Runner.py1" Target="AzureStatusTable" Bounds="-1070.91955566406,-447.686828613281,165.907531738281,41.6351928710938" />
+    <Link Source="Runner.py1" Target="QualcommDevice1" Bounds="-834.55870320638,-450.305950587842,58.6235177587191,16.1877924281616" />
+    <Link Source="Runner.py2" Target="AzureStatusTable" Bounds="-1071.05505371094,-370.519165039063,169.874084472656,51.8364562988281" />
+    <Link Source="Runner.py2" Target="SnpeQuantization" Bounds="-830.72764851888,-314.664864642865,58.0109886197575,2.21273140591086" />
+    <Link Source="Runner.py3" Target="SnpeQuantiation" Bounds="-832.717089436849,-221.841899441892,63.2713423409264,2.37545015813271" />
+    <Link Source="Runner.py4" Target="Test_onnx" Bounds="-830.996020100911,-134.464308505795,60.3071978911451,5.86127855164511" />
+    <Link Source="Upload" Target="AzureBlobStore" Bounds="-1301.04666666667,-483.110272187314,98.5479867833703,22.1825747724036" />
+    <Link Source="Upload" Target="AzureStatusTable" Bounds="-1301.04666666667,-445.517004029011,99.824346137553,31.5519803774536" />
+  </Links>
+  <Categories>
+    <Category Id="cluster" />
+    <Category Id="device" />
+    <Category Id="quantize" />
+    <Category Id="script" />
+    <Category Id="storage" />
+    <Category Id="test" />
+  </Categories>
+  <Properties>
+    <Property Id="Bounds" DataType="System.Windows.Rect" />
+    <Property Id="Expression" DataType="System.String" />
+    <Property Id="GraphDirection" DataType="Microsoft.VisualStudio.Diagrams.Layout.LayoutOrientation" />
+    <Property Id="GroupLabel" DataType="System.String" />
+    <Property Id="IsEnabled" DataType="System.Boolean" />
+    <Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
+    <Property Id="Layout" DataType="System.String" />
+    <Property Id="Offset" DataType="System.String" />
+    <Property Id="TargetType" DataType="System.Type" />
+    <Property Id="UseManualLocation" DataType="System.Boolean" />
+    <Property Id="Value" DataType="System.String" />
+    <Property Id="ValueLabel" DataType="System.String" />
+    <Property Id="ZoomLevel" DataType="System.String" />
+  </Properties>
+  <Styles>
+    <Style TargetType="Node" GroupLabel="test" ValueLabel="True">
+      <Condition Expression="HasCategory('test')" />
+      <Setter Property="Icon" Value="CodeMap_TestProject" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="storage" ValueLabel="True">
+      <Condition Expression="HasCategory('storage')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Table.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="script" ValueLabel="True">
+      <Condition Expression="HasCategory('script')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Script.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="quantize" ValueLabel="True">
+      <Condition Expression="HasCategory('quantize')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Gears.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="device" ValueLabel="True">
+      <Condition Expression="HasCategory('device')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Device.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="cluster" ValueLabel="True">
+      <Condition Expression="HasCategory('cluster')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Network.png" />
+    </Style>
+    <Style TargetType="Node">
+      <Setter Property="HorizontalAlignment" Value="Center" />
+      <Setter Property="IconPlacement" Value="Top" />
+    </Style>
+  </Styles>
+</DirectedGraph>
--- a/tasks/face_segmentation/aml/images/snpe.png
+++ b/tasks/face_segmentation/aml/images/snpe.png
--- a/tasks/face_segmentation/aml/images/store.png
+++ b/tasks/face_segmentation/aml/images/store.png
--- a/tasks/face_segmentation/aml/images/system.dgml
+++ b/tasks/face_segmentation/aml/images/system.dgml
@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="utf-8"?>
+<DirectedGraph GraphDirection="TopToBottom" Layout="Sugiyama" Offset="-820.8108333333334,-384.5347442626953" ZoomLevel="1" xmlns="http://schemas.microsoft.com/vs/2009/dgml">
+  <Nodes>
+    <Node Id="AmlTraining" Category="aml" Bounds="178.281669209798,-42.1863471113841,158.853333333333,78.924467010498" Label="aml training &#xD;&#xA;(AmlPartialTrainingEvaluator)" UseManualLocation="True" />
+    <Node Id="AzureMLPipelines" Category="aml" Bounds="455.9075,-19.9970294464111,120.336666666667,62.9644670104981" Label="Azure ML Pipelines" UseManualLocation="True" />
+    <Node Id="BlobStore(models)" Category="storage" Bounds="372.473597513835,72.9675375640869,116.526666666667,89.5813311767578" Label="blob store &#xD;&#xA;(models &amp; results)" UseManualLocation="True" />
+    <Node Id="RemoteDeviceInferenceTesting" Category="remote" Bounds="72.2816666666666,170.359651075872,206.316666666667,78.9244670104981" Label="remote device inference testing &#xD;&#xA;(RemoteAzureBenchmarkEvaluator)" UseManualLocation="True" />
+    <Node Id="Search" Category="search" Bounds="0,0,53.9833333333333,62.964467010498" Label="search" UseManualLocation="True" />
+    <Node Id="TableResults" Category="storage" Bounds="210.715582682292,66.738219899114,84.0966666666666,73.6213311767578" Label="table results" UseManualLocation="True" />
+  </Nodes>
+  <Links>
+    <Link Source="AmlTraining" Target="AzureMLPipelines" Bounds="337.135002543131,1.64407801566514,109.786077444833,6.03785408365997" />
+    <Link Source="AmlTraining" Target="BlobStore(models)" Bounds="314.38133986177,36.7381198991139,50.7064047608595,35.3076040473083" />
+    <Link Source="AmlTraining" Target="TableResults" Bounds="254.894834328378,36.7381198991139,0.977495669259497,21.0098250749297" />
+    <Link Source="AzureMLPipelines" Target="BlobStore(models)" Bounds="448.526763916016,42.9674377441406,44.100830078125,25.0390167236328" />
+    <Link Source="BlobStore(models)" Target="AzureMLPipelines" Bounds="451.671051025391,47.7068099975586,44.4396362304688,25.2607269287109" />
+    <Link Source="RemoteDeviceInferenceTesting" Target="TableResults" Bounds="204.152602969458,147.637053103474,16.5328942653301,22.7225979723975" />
+    <Link Source="Search" Target="AmlTraining" Bounds="53.9833333333333,10.3717085435882,115.395650767872,17.1087060975231" />
+    <Link Source="Search" Target="RemoteDeviceInferenceTesting" Bounds="52.7301876549027,62.403421700363,84.1039893644507,101.039033450549" />
+    <Link Source="TableResults" Target="Search" Bounds="62.5571377644598,42.8347555073116,148.158444917832,47.2922740456947" />
+  </Links>
+  <Categories>
+    <Category Id="aml" />
+    <Category Id="remote" />
+    <Category Id="search" />
+    <Category Id="storage" />
+  </Categories>
+  <Properties>
+    <Property Id="Bounds" DataType="System.Windows.Rect" />
+    <Property Id="Expression" DataType="System.String" />
+    <Property Id="GraphDirection" DataType="Microsoft.VisualStudio.Diagrams.Layout.LayoutOrientation" />
+    <Property Id="GroupLabel" DataType="System.String" />
+    <Property Id="IsEnabled" DataType="System.Boolean" />
+    <Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
+    <Property Id="Layout" DataType="System.String" />
+    <Property Id="Offset" DataType="System.String" />
+    <Property Id="TargetType" DataType="System.Type" />
+    <Property Id="UseManualLocation" DataType="System.Boolean" />
+    <Property Id="Value" DataType="System.String" />
+    <Property Id="ValueLabel" DataType="System.String" />
+    <Property Id="ZoomLevel" DataType="System.String" />
+  </Properties>
+  <Styles>
+    <Style TargetType="Node" GroupLabel="storage" ValueLabel="True">
+      <Condition Expression="HasCategory('storage')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Table.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="remote" ValueLabel="True">
+      <Condition Expression="HasCategory('remote')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/CellPhone.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="search" ValueLabel="True">
+      <Condition Expression="HasCategory('search')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Gears.png" />
+    </Style>
+    <Style TargetType="Node" GroupLabel="aml" ValueLabel="True">
+      <Condition Expression="HasCategory('aml')" />
+      <Setter Property="Icon" Value="pack://application:,,,/Microsoft.VisualStudio.Progression.GraphControl;component/Icons/Network.png" />
+    </Style>
+    <Style TargetType="Node">
+      <Setter Property="HorizontalAlignment" Value="Center" />
+      <Setter Property="IconPlacement" Value="Top" />
+    </Style>
+  </Styles>
+</DirectedGraph>
--- a/tasks/face_segmentation/aml/images/system.png
+++ b/tasks/face_segmentation/aml/images/system.png
--- a/tasks/face_segmentation/snpe/notebooks/gallery/gallery_performance.ipynb
+++ b/tasks/face_segmentation/snpe/notebooks/gallery/gallery_performance.ipynb
@ -112,12 +112,13 @@
    }
   ],
   "source": [
+    "experiment_name = 'facesynthetics'\n",
    "con_str = os.getenv(\"MODEL_STORAGE_CONNECTION_STRING\")\n",
    "if con_str is None:\n",
    "    print(\"Please set your MODEL_STORAGE_CONNECTION_STRING environment variable\")\n",
    "else:\n",
    "    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)\n",
-    "    store = ArchaiStore(storage_account_name, storage_account_key)\n",
+    "    store = ArchaiStore(storage_account_name, storage_account_key, 'models', experiment_name)\n",
    "    print(f'using {storage_account_name}')"
   ]
  },
--- a/tasks/face_segmentation/snpe/notebooks/gallery/pareto.py
+++ b/tasks/face_segmentation/snpe/notebooks/gallery/pareto.py
--- a/tasks/face_segmentation/aml/readme.md
+++ b/tasks/face_segmentation/aml/readme.md
@ -0,0 +1,95 @@
+# Readme
+
+This folder contains code that automates the search, partial training and inference latency testing in [Azure
+ML](https://azure.microsoft.com/en-us/products/machine-learning/). The inference testing of ONNX models can be performed
+across one or more machines that are connected via USB to Qualcomm 888 boards.
+
+The code is organized into:
+
+1. [Training](training/readme.md) code that plugs into the Archai Search to perform partial training
+of selected models on a GPU cluster in Azure ML.
+
+1. [SNPE Device](snpe/readme.md) code that uses [Microsoft
+Olive](https://github.com/microsoft/olive) to drive the the
+[Qualcomm Neural Processing SDK](https://developer.qualcomm.com/software/qualcomm-neural-processing-sdk) to talk
+to the device, convert ONNX models to .dlc, quantize them, and test them on one or more
+[Qualcomm 888 dev kits](https://developer.qualcomm.com/hardware/snapdragon-888-hdk).
+
+1. [Azure Code](azure/readme.md) that talks to a configured Azure storage account for uploading
+models to test, downloading them, uploading test results, and keeping an Azure status table that
+summarizes results of all the work in progress.
+
+1. [Docker](docker/quantizer/readme.md) contains scripts for setting up your Azure account and optionally
+creating a docker image for running in an Azure Kubernetes cluster to do model quantization using
+the Qualcomm Neural Processing SDK. Quantization is time consuming so having an elastic scale speeds
+things up a lot.
+
+1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the
+results from your Azure "status" table.
+
+
+## Workflow
+
+The overall workflow begins with the top level [aml.py](../../aml.py) script which
+starts with an Archai Search that contains an `AmlPartialTrainingEvaluator` and a
+`RemoteAzureBenchmarkEvaluator`.  The remote benchmark evaluator performs inference latency testing
+on Qualcomm hardware.  The `AmlPartialTrainingEvaluator` then kicks off one new Azure ML
+training pipeline for each batch of new model architectures that need to be partially trained, it
+stores the validation IOU results in an Azure blob store and an Azure table so the search can get
+those results and use them to figure out the next iteration of the search algorithm:
+
+![system](images/system.png)
+
+See [AML Training Readme](training/readme.md) for more information.
+
+## Remote Inference Testing
+
+The remote inference testing workflow looks like this, the `RemoteAzureBenchmarkEvaluator` uploads models to the same
+Azure blob store, and adds a row to the status table.  This triggers remote instances of the [runner.py](azure/runner.py) script
+to process these new models on an attached Qualcomm device.  Optionally some of the work can be done in the cloud
+using a Kubernetes cluster, this includes model quantization and accuracy testing using the ONNX runtime.
+The workflow looks like this:
+
+![snpe](images/snpe.png)
+
+Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
+the `find_work_prioritized` function in the runner.  This script is completely restartable, and can distribute the work
+across multiple instances of the runner script.  Each instance will pick up where a previous one left off based on what
+it finds in your Azure status table. The prioritization maps to the columns of the status table as follows:
+
+1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
+1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30
+1. **f1_onnx** compute f1 from onnxruntime on .onnx model on a 10k test set on Linux - priority 60
+1. **f1_1k** compute f1 on quantized .dlc model on Qualcomm device DSP with a 1k test set - priority
+is the mean f1 score so that quicker models are prioritized.
+1. **f1_1k_f** compute f1 on floating point .dlc model on on Qualcomm device CPU with a 1k test set
+   - priority 10 * the mean f1 score so that quicker models are prioritized.
+1. **f1_10k** compute f1 on quantized model on a 10k test set - priority = 100 * the mean f1 score
+   so that quicker models are prioritized.
+
+Lower number means higher priority job and each machine will run the highest priority work first.
+
+You can override the priority of a specific job by passing a `--proprity` parameter on the `upload.py` script or by
+editing the Azure status table and adding a `priority` field to the JSON stored there. You can set any priority number
+you want, if you specify priority 0 it will run before anything else which can be handy if you have a cool new model
+that you want to bump to the top of the list.
+
+Notice some of the above jobs can run on Linux and do not require Qualcomm device. So in order to maximize throughput on
+machines that do have a Qualcomm devices you can allocate other Linux machines with no Qualcomm devices to do the other
+work, namely, converting models, quantizing them, and running the `f1_onnx` test.
+
+Folks across your team can use the `azure/upload.py` to submit jobs and let them run, or they can automate that as
+shown in the `RemoteAzureBenchmarkEvaluator` in the `search.py` script.
+
+You can use `status.py` to monitor progress or look at the Azure status table.  Various status messages are posted
+there so you can see which machine is doing what and is in what stage of the job.
+
+Next you can go to the `notebook` page and get some pretty pictures of your Pareto Curves.
+
+## Azure Portal
+
+When everything is running you will see progress happening in your Azure status table.  Here you see the snpe-quantizer
+kubernetes cluster is quantizing a bunch of models while other machines are running the bench mark tests on the Qualcomm
+hardware:
+
+![portal](images/portal.png)
--- a/tasks/face_segmentation/snpe/requirements.txt
+++ b/tasks/face_segmentation/snpe/requirements.txt
@ -1,6 +1,5 @@
 importlib-metadata!=4.7.0,<6,>=3.7.0
 packaging<22.0,>=20.0
-cryptography<39,>=38.0.0
 psutil
 tqdm
 pandas
--- a/tasks/face_segmentation/snpe/setup/launch.py
+++ b/tasks/face_segmentation/snpe/setup/launch.py
--- a/tasks/face_segmentation/snpe/setup/readme.md
+++ b/tasks/face_segmentation/snpe/setup/readme.md
--- a/tasks/face_segmentation/snpe/snpe/cleanup_results.py
+++ b/tasks/face_segmentation/snpe/snpe/cleanup_results.py
--- a/tasks/face_segmentation/snpe/snpe/convert_tf.sh
+++ b/tasks/face_segmentation/snpe/snpe/convert_tf.sh
--- a/tasks/face_segmentation/snpe/snpe/create_data.py
+++ b/tasks/face_segmentation/snpe/snpe/create_data.py
--- a/tasks/face_segmentation/snpe/snpe/dlc_helper.py
+++ b/tasks/face_segmentation/snpe/snpe/dlc_helper.py
--- a/tasks/face_segmentation/snpe/snpe/fetch_results.py
+++ b/tasks/face_segmentation/snpe/snpe/fetch_results.py
--- a/tasks/face_segmentation/snpe/snpe/readme.md
+++ b/tasks/face_segmentation/snpe/snpe/readme.md
@ -37,7 +37,7 @@ bits.  If you plan to use Qualcomm hardware devices then set the `SNPE_ANDROID_R
 1. **Install required packages including Olive **

    ```
-    pushd tasks/face_segmentation/snpe
+    pushd tasks/face_segmentation/aml
    pip install -r requirements.txt
    ```

--- a/tasks/face_segmentation/snpe/snpe/snpe_setup.sh
+++ b/tasks/face_segmentation/snpe/snpe/snpe_setup.sh
--- a/tasks/face_segmentation/snpe/snpe/test_onnx.py
+++ b/tasks/face_segmentation/snpe/snpe/test_onnx.py
--- a/tasks/face_segmentation/snpe/snpe/test_snpe.py
+++ b/tasks/face_segmentation/snpe/snpe/test_snpe.py
--- a/tasks/face_segmentation/aml/training/aml_training_evaluator.py
+++ b/tasks/face_segmentation/aml/training/aml_training_evaluator.py
@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import os
+import json
+from pathlib import Path
+from typing import List, Optional, Union
+from overrides import overrides
+from archai.discrete_search.api.archai_model import ArchaiModel
+from archai.discrete_search.api.model_evaluator import AsyncModelEvaluator
+from archai.common.config import Config
+from shutil import copyfile
+from archai.common.monitor import JobCompletionMonitor
+from aml.training.training_pipeline import start_training_pipeline
+from azure.identity import DefaultAzureCredential
+from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
+from azure.ai.ml import MLClient
+from aml.util.setup import configure_store, get_valid_arch_id
+
+
+def _get_entity_value(entity, key, default_value=''):
+    if key in entity:
+        return entity[key]
+    return default_value
+
+
+class AmlPartialTrainingEvaluator(AsyncModelEvaluator):
+    """ The AmlPartialTrainingEvaluator launches partial training jobs"""
+    def __init__(self,
+                 config : Config,
+                 local_output: Path,
+                 tr_epochs: int = 1,
+                 timeout_seconds=3600):
+        self.config = config
+        self.tr_epochs = int(tr_epochs)
+        self.iteration = 1
+        aml_config = config['aml']
+        workspace_name = aml_config['workspace_name']
+        subscription_id = aml_config['subscription_id']
+        resource_group_name = aml_config['resource_group']
+
+        identity = DefaultAzureCredential()
+        if os.getenv('AZUREML_ROOT_RUN_ID'):
+            identity = AzureMLOnBehalfOfCredential()
+
+        self.ml_client = MLClient(
+            credential=identity,
+            subscription_id=subscription_id,
+            resource_group_name=resource_group_name,
+            workspace_name=workspace_name
+        )
+        self.local_output = local_output
+        self.models = []
+        self.timeout = timeout_seconds
+        self.store = configure_store(aml_config)
+        self.results = []
+        self.metric_key = self.config['training'].get('metric_key', 'val_iou')
+
+    @overrides
+    def send(self, arch: ArchaiModel, budget: Optional[float] = None) -> None:
+        self.models += [arch]
+        model_id = get_valid_arch_id(arch)
+        e = self.store.get_status(model_id)
+        if self.metric_key in e and e[self.metric_key]:
+            # seems to have already been trained then, so to make this a restartable job we pick up those results.
+            if 'iteration' not in e:
+                e['iteration'] = self.iteration
+                self.store.merge_status_entity(e)
+            metric = float(e[self.metric_key])
+            self.results += [{
+                'id': model_id,
+                self.metric_key: metric,
+                'status': _get_entity_value(e, 'status'),
+                'error': _get_entity_value(e, 'error')
+            }]
+
+    @overrides
+    def fetch_all(self) -> List[Union[float, None]]:
+
+        if len(self.results) > 0:
+            print(f'AmlPartialTrainingEvaluator: found {len(self.results)} were already trained.')
+
+        index = {}
+        for existing in self.results:
+            id = existing['id']
+            index[id] = existing
+
+        # pull out the models that have not yet been trained.
+        pending = []
+        for arch in self.models:
+            model_id = get_valid_arch_id(arch)
+            if model_id not in index:
+                pending += [arch]
+
+        if len(pending) > 0:
+            print(f"AmlPartialTrainingEvaluator: Starting training on {len(pending)} models")
+
+            # train all the models listed in the pending on a GPU cluster so we get much training
+            # happening in parallel which greatly reduces the overall Archai Search process.
+            description = f"AmlPartialTrainingEvaluator training {self.tr_epochs} epochs"
+            pipeline_job, model_names = start_training_pipeline(
+                description,  self.iteration, self.ml_client, self.store, pending, self.config, self.tr_epochs, self.local_output)
+
+            job_id = pipeline_job.name
+            print(f'AmlPartialTrainingEvaluator: Started training pipeline: {job_id}')
+
+            # wait for all the parallel training jobs to finish
+            keys = [self.metric_key]
+            monitor = JobCompletionMonitor(self.store, self.ml_client, keys, job_id, self.timeout)
+            models = monitor.wait(model_names)['models']
+            for m in models:
+                id = m['id']
+                index[id] = m
+
+        # now reassemble all results in the right order (order of the send method calls)
+        models = []
+        for arch in self.models:
+            model_id = get_valid_arch_id(arch)
+            result = index[model_id]
+            models += [result]
+
+        results = {'models': models}
+
+        # save the results to the output folder (which is mapped by the AML pipeline to our
+        # blob store under the container 'models' in the folder named the same as the
+        # experiment_name)
+        results_path = f'{self.local_output}/models.json'
+        summary = json.dumps(results, indent=2)
+        with open(results_path, 'w') as f:
+            f.write(summary)
+
+        # save the archai log also which can be handy for debugging later.
+        log = 'archai.log'
+        if os.path.isfile(log):
+            copyfile(log, f'{self.local_output}/{log}')
+
+        # extract the array of results for our return value this is the metric that the
+        # Archai search needs to figure out which models to continue to evolve and which are
+        # not so good.
+        metrics = []
+        for m in results['models']:
+            metric = m[self.metric_key]
+            metrics += [metric]
+
+        self.models = []  # reset for next run.
+        print(f'AmlPartialTrainingEvaluator: fetch_all returning : {summary}')
+
+        self.iteration += 1
+        return metrics
--- a/tasks/face_segmentation/aml/training/readme.md
+++ b/tasks/face_segmentation/aml/training/readme.md
@ -0,0 +1,53 @@
+## AML Training Readme
+
+Two scripts in this folder provide Azure ML training pipelines to the Archai Search.
+The [train.py](../../train.py) script plugs in the `AmlPartialTrainingEvaluator`
+async model evaluator when invoked with the [aml_search.yaml](../../confs/aml_search.yaml) configuration.
+
+The top level [aml.py](../../aml.py) script kicks off this process setting up all the
+required Azure ML resources including:
+- a conda environment used to build the docker image that Azure ML uses
+- a cpu cluster for running the search
+- a gpu cluster for running partial training
+- the datastore for the training dataset
+- the datastore for downloading config info and uploading results, including the trained models
+
+Notice that the [aml_search.yaml](../../confs/aml_search.yaml) configuration
+file requires the following environment variables be defined so that it can find your Azure subscription,
+and Azure ML resource group and workspace, it also needs the connection string for your storage account.
+
+```yaml
+aml:
+  connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
+  subscription_id: ${AZURE_SUBSCRIPTION_ID}
+  resource_group: ${AML_RESOURCE_GROUP}
+  workspace_name: ${AML_WORKSPACE_NAME}
+```
+
+The storage account can be created using the [setup.ps1](../docker/quantizer/setup.ps1) powershell script
+which uses the [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli).
+
+[aml_training_evaluator](aml_training_evaluator.py) provides a class named
+`AmlPartialTrainingEvaluator` which is an `AsyncModelEvaluator` that creates a new Azure ML Pipeline
+for partial training each new batch of models provided by the Archai Search algorithm.  It runs these partial
+training jobs on a GPU cluster to maximize throughput.
+
+[training_pipeline](training_pipeline.py) provides a helper function named `start_training_pipeline`
+that uses the Azure ML Python SDK to create the Azure ML training pipeline.
+
+As the search progresses and training is completed you will find the following files in your
+Azure storage account:
+
+![store](../images/store.png)
+
+The name of this folder is the model id, `id_113c4c240bfc5fd2eaf2f0129439251bb754ddc4` which can also
+be found in your Azure storage table.  The parent folder `facesynthetics` is the `experiment name` and is also
+the name of your Azure storage table. This table contains the overall summary for all the models processed so far.
+
+You can see here that Qualcomm Snapdragon processing was also done on this model, so you see the [.dlc
+files](https://developer.qualcomm.com/sites/default/files/docs/snpe/overview.html) there for the model and for the
+quantized version of the model and a `.csv` file containing information about inference times for this model (including
+layer by layer information which is handy).
+
+The json file contains the `ArchConfig` for the model which can be used to recreate the model
+and fully train it using the [train.py](../../train.py) script.
--- a/tasks/face_segmentation/aml/training/training_pipeline.py
+++ b/tasks/face_segmentation/aml/training/training_pipeline.py
@ -0,0 +1,165 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import json
+import os
+from pathlib import Path
+from typing import List
+from archai.common.store import ArchaiStore
+from azure.ai.ml import MLClient
+from archai.discrete_search.api import ArchaiModel
+from archai.discrete_search.search_spaces.config import ArchConfig
+from azure.ai.ml import command, Input, Output, dsl
+from azure.ai.ml.entities import UserIdentityConfiguration
+from archai.common.config import Config
+from aml.util.setup import copy_code_folder, get_valid_arch_id
+from shutil import copyfile
+
+
+def training_component(output_path: str, code_dir: Path, config, training_epochs: int, config_filename: str, model_id: str, arch: str):
+    # we need a folder containing all the specific code we need here, which is not everything in this repo.
+    training = config['training']
+    learning_rate = training['learning_rate']
+    batch_size = training['batch_size']
+    aml_config = config['aml']
+    environment_name = aml_config['environment_name']
+    con_str = aml_config['connection_str']
+
+    fixed_args = f'--lr {learning_rate} --batch_size {batch_size} ' +\
+                 f'--epochs {int(training_epochs)} --model_id {model_id} --config {config_filename} ' +\
+                 f'{arch}'
+
+    return command(
+        name="train",
+        display_name="Archai training job",
+        description="Trains a face segmentation model.",
+        inputs={
+            "data": Input(type="uri_folder", mode="download")
+        },
+        is_deterministic=False,
+        outputs={
+            "results": Output(type="uri_folder", path=output_path, mode="rw_mount")
+        },
+        environment_variables={'MODEL_STORAGE_CONNECTION_STRING': con_str},
+        identity=UserIdentityConfiguration(),
+        # The source folder of the component
+        code=str(code_dir),
+        command="""python3 train.py \
+                --dataset_dir ${{inputs.data}} \
+                --output_dir ${{outputs.results}} \
+                """ + fixed_args,
+        environment=environment_name,
+    )
+
+
+def start_training_pipeline(description: str, iteration: int, ml_client: MLClient, store: ArchaiStore,
+                            model_architectures: List[ArchaiModel],
+                            config: Config, training_epochs: int, output_folder: Path):
+    """ Creates a new Azure ML Pipeline for training a set of models, updating the status of
+    these jobs in a given Azure Storage Table.  This command does not wait for those jobs to
+    finish.  For that use the monitor.py script which monitors the same Azure Storage Table
+    to find out when the jobs have all finished.  The train.py script will update the table
+    when each training job completes. """
+
+    aml_config = config['aml']
+    training_cluster = aml_config['training_cluster']
+    compute_cluster_name = training_cluster['name']
+    datastore_path = aml_config['datastore_path']
+    root_uri = aml_config['results_path']
+    environment_name = aml_config['environment_name']
+    experiment_name = aml_config['experiment_name']
+    metric_key = config['training'].get('metric_key', 'val_iou')
+
+    print(f"Cluster: {compute_cluster_name}")
+    print(f"Dataset: {datastore_path}")
+    print(f"Output: {root_uri}")
+    print(f"Environment: {environment_name}")
+    print(f"Experiment: {experiment_name}")
+    print(f"Epochs: {training_epochs}")
+    print(f"Iteration: {iteration}")
+
+    code_dir = Path('temp_code')
+    os.makedirs(code_dir, exist_ok=True)
+    config_dir = code_dir / 'confs'
+    os.makedirs(config_dir, exist_ok=True)
+    archs_dir = code_dir / 'archs'
+    os.makedirs(archs_dir, exist_ok=True)
+    copyfile('train.py', str(code_dir / 'train.py'))
+    copy_code_folder('training', str(code_dir / 'training'))
+    copy_code_folder('search_space', str(code_dir / 'search_space'))
+    copy_code_folder(os.path.join('aml', 'training'), str(code_dir / 'aml' / 'training'))
+    copy_code_folder(os.path.join('aml', 'util'), str(code_dir / 'aml' / 'util'))
+    config.save(str(config_dir / 'aml_search.yaml'))
+
+    models = []
+    model_names = []
+    for arch in model_architectures:
+
+        model_id = get_valid_arch_id(arch)
+        model_names += [model_id]
+        print(f'Launching training job for model {model_id}')
+
+        # upload the model architecture to our blob store so we can find it later.
+        metadata: ArchConfig = arch.metadata['config']
+        filename = str(archs_dir / f'{model_id}.json')
+        metadata.to_file(filename)
+        store.upload_blob(f'{experiment_name}/{model_id}', filename, blob_name=f'{model_id}.json')
+
+        # create status entry in azure table
+        e = store.get_status(model_id)
+        e['experiment'] = experiment_name
+        e['epochs'] = training_epochs
+        e['iteration'] = iteration
+        e['status'] = 'preparing'
+        store.merge_status_entity(e)
+        models += [{
+            'id': model_id,
+            'status': 'training',
+            'epochs': training_epochs,
+            metric_key: e[metric_key] if metric_key in e else 0.0
+        }]
+
+    results = {
+        'models': models
+    }
+
+    @dsl.pipeline(
+        compute=compute_cluster_name,
+        description=description,
+    )
+    def parallel_training_pipeline(
+        data_input
+    ):
+        outputs = {}
+        for arch in model_architectures:
+            model_id = get_valid_arch_id(arch)
+            output_path = f'{root_uri}/{model_id}'
+            filename = f'archs/{model_id}.json'
+            train_job = training_component(
+                output_path, code_dir, config, training_epochs, 'confs/aml_search.yaml', model_id, filename)(
+                data=data_input
+            )
+
+            outputs[model_id] = train_job.outputs.results
+
+        return outputs
+
+    training_pipeline = parallel_training_pipeline(
+        data_input=Input(type="uri_folder", path=datastore_path)
+    )
+
+    # submit the pipeline job
+    pipeline_job = ml_client.jobs.create_or_update(
+        training_pipeline,
+        experiment_name=experiment_name,
+    )
+
+    # Write the new list of pending models so that the make_monitor_command
+    # knows what to wait for.
+    print("Writing pending.json: ")
+    print(json.dumps(results, indent=2))
+    results_path = output_folder / 'pending.json'
+    with open(results_path, 'w') as f:
+        f.write(json.dumps(results, indent=2))
+
+    return (pipeline_job, model_names)
--- a/tasks/face_segmentation/snpe/util/priority_queue.py
+++ b/tasks/face_segmentation/snpe/util/priority_queue.py
--- a/tasks/face_segmentation/aml/util/setup.py
+++ b/tasks/face_segmentation/aml/util/setup.py
@ -0,0 +1,82 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import os
+import sys
+from glob import glob
+from shutil import copyfile
+from archai.common.config import Config
+from archai.common.store import ArchaiStore
+from azure.ai.ml.entities._credentials import AccountKeyConfiguration
+from azure.ai.ml.entities import AzureBlobDatastore
+import archai.common.azureml_helper as aml_helper
+from archai.discrete_search.api.archai_model import ArchaiModel
+
+
+def configure_store(aml_config: Config, blob_container_name: str = None) -> ArchaiStore:
+    con_str = aml_config.get('connection_str')
+    if not con_str:
+        print("Please set environment variable 'MODEL_STORAGE_CONNECTION_STRING' containing the Azure storage account connection " +
+              "string for the Azure storage account you want to use to control this experiment.")
+        sys.exit(1)
+
+    if blob_container_name is None:
+        blob_container_name = aml_config.get('blob_container_name', 'models')
+    experiment_name = aml_config.get('experiment_name', 'facesynthetics')
+    partition_key = aml_config.get('partition_key', 'main')
+    storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
+    return ArchaiStore(storage_account_name, storage_account_key, blob_container_name, experiment_name, partition_key)
+
+
+def register_datastore(ml_client, data_store_name, blob_container_name, storage_account_name, storage_account_key, experiment_name):
+    try:
+        credentials = AccountKeyConfiguration(account_key=storage_account_key)
+        model_store = ml_client.datastores.get(data_store_name)
+        if model_store.container_name != blob_container_name:
+            raise Exception(f'The container name does not match. Only the credentials on {data_store_name} can be updated')
+        if model_store.account_name != storage_account_name:
+            raise Exception(f'The storage account name does not match. Only the credentials on {data_store_name} can be updated')
+        model_store.credentials = credentials
+    except:
+        model_store = AzureBlobDatastore(
+            name=data_store_name,
+            description="Datastore pointing to a blob container.",
+            account_name=storage_account_name,
+            container_name=blob_container_name,
+            credentials=credentials,
+        )
+
+    ml_client.create_or_update(model_store)
+    return f'azureml://datastores/{data_store_name}/paths/{experiment_name}'
+
+
+def create_cluster(ml_client, config, key):
+    section = config[key]
+    compute_name = section['name']
+    size = section['size']
+    location = section['location']
+    max_instances = section.get('max_instances', 1)
+    aml_helper.create_compute_cluster(ml_client, compute_name, size=size, location=location, max_instances=max_instances)
+    return compute_name
+
+
+def copy_code_folder(src_dir, target_dir):
+    """ Copies the code folder into a separate folder.  This is needed otherwise the pipeline will fail with
+    UserError: The code snapshot was modified in blob storage, which could indicate tampering.
+    If this was unintended, you can create a new snapshot for the run. To do so, edit any
+    content in the local source directory and resubmit the run.
+    """
+    os.makedirs(target_dir, exist_ok=True)
+    for path in glob(os.path.join(src_dir, '*.py')):
+        file = os.path.basename(path)
+        print(f"copying source file : {file} to {target_dir}")
+        copyfile(path, os.path.join(target_dir, file))
+    for name in os.listdir(src_dir):
+        path = os.path.join(src_dir, name)
+        if os.path.isdir(path):
+            copy_code_folder(path, os.path.join(target_dir, name))
+
+
+def get_valid_arch_id(arch: ArchaiModel):
+    # bug in azure ml sdk requires blob store folder names not begin with digits, so we prefix with 'id_'
+    return f'id_{arch.archid}'
--- a/tasks/face_segmentation/snpe/util/shell.py
+++ b/tasks/face_segmentation/snpe/util/shell.py
--- a/tasks/face_segmentation/snpe/vision/collect_metrics.py
+++ b/tasks/face_segmentation/snpe/vision/collect_metrics.py
--- a/tasks/face_segmentation/snpe/vision/readme.md
+++ b/tasks/face_segmentation/snpe/vision/readme.md
--- a/tasks/face_segmentation/check.py
+++ b/tasks/face_segmentation/check.py
@ -1,25 +0,0 @@
-import os
-from archai.discrete_search.search_spaces.config import ArchConfig
-from archai.discrete_search.evaluators import TorchNumParameters
-from archai.discrete_search.api.archai_model import ArchaiModel
-from search_space.hgnet import StackedHourglass, HgnetSegmentationSearchSpace
-from archai.common.config import Config
-
-
-constraint = (1e5, 5e7)
-evaluator = TorchNumParameters()
-search_config = Config('confs/cpu_search.yaml')['search']
-ss_config = search_config['search_space']
-search_space = HgnetSegmentationSearchSpace(seed=1680312796, **ss_config.get('params', {}))
-targets = os.path.join('archs', 'snp_target')
-
-for file in os.listdir(targets):
-    path = os.path.join(targets, file)
-    if os.path.isfile(path) and path.endswith(".json"):
-        config = ArchConfig.from_file(path)
-        model = StackedHourglass(config, **search_space.model_kwargs)
-        archid = os.path.splitext(file)[0]
-        m = ArchaiModel(model, archid, config)
-        num_params = evaluator.evaluate(m, None)
-        if num_params < constraint[0] or num_params > constraint[1]:
-            print(f"Model {file} has {num_params} parameters and is outside the valid range.")
--- a/tasks/face_segmentation/confs/aml_search.yaml
+++ b/tasks/face_segmentation/confs/aml_search.yaml
@ -0,0 +1,102 @@
+search:
+  search_space:
+    name: hgnet
+
+    params:
+      num_classes: 18
+      img_size: [256, 256] # (w, h)
+      in_channels: 3
+      op_subset: ['conv3x3', 'conv5x5', 'conv7x7']
+      stem_strides: [2]
+
+      # Number of downsampling blocks (without counting stem conv)
+      num_blocks: 5
+
+      # Maximum number of layers in downsampling blocks
+      downsample_block_max_ops: 4
+
+      # Maximum number of layers in skip blocks
+      skip_block_max_ops: 2
+
+      # Maximum number of layers in upsampling blocks
+      upsample_block_max_ops: 4
+
+      # Maximum number of layers after the final upsampling layer
+      post_upsample_max_ops: 2
+
+  algorithm:
+    name: evolution_pareto
+
+    params:
+      num_iters: 20
+      init_num_models: 20
+      mutations_per_parent: 5
+      num_crossovers: 6
+      max_unseen_population: 20
+      num_random_mix: 6
+      max_parameters: 5e7
+      # we are training elsewhere, so tell the search not to try and save them locally!
+      save_pareto_model_weights: false
+
+  target:
+    name: snp
+    metric_key: mean
+    max_retries: 15
+    retry_interval: 60
+    verbose: true
+
+training:
+  # https://learn.microsoft.com/en-us/answers/questions/1215210/limited-gpu-ram
+  batch_size: 16
+  learning_rate: 2e-4
+  partial_training_epochs: 1
+  metric_key: val_iou
+
+aml:
+  connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
+  blob_container_name: models
+  experiment_name: facesynthetics
+  partition_key: main
+  subscription_id: ${AZURE_SUBSCRIPTION_ID}
+  resource_group: ${AML_RESOURCE_GROUP}
+  workspace_name: ${AML_WORKSPACE_NAME}
+  timeout: 18000
+
+  search_cluster:
+    name: nas-cpu-cluster-D14-v2
+    size: Standard_D14_v2
+    location: westus2
+
+  training_cluster:
+    name: nas-gpu-cluster-NC6
+    size: Standard_NC6
+    location: westus2
+    max_instances: 20
+
+  environment:
+    name: facesynthetics-nas-env
+    channels:
+      - conda-forge
+      - pytorch
+      - nvidia
+    dependencies:
+      - python=3.10
+      - pip
+      - pip:
+        - azure-ai-ml==1.5.0
+        - azure-storage-blob
+        - azure-data-tables
+        - azure-identity
+        - azureml-mlflow
+        - datasets>=2.4.0
+        - matplotlib
+        - mldesigner
+        - mlflow
+        - onnx>=1.10.2
+        - onnxruntime>=1.10.0
+        - psutil
+        - torch
+        - torchvision
+        - torchaudio
+        - lightning>=2.0.0
+        - archai[dev] @ git+https://github.com/microsoft/archai.git
--- a/tasks/face_segmentation/confs/snp_search.yaml
+++ b/tasks/face_segmentation/confs/snp_search.yaml
@ -34,14 +34,17 @@ search:
      num_crossovers: 6
      max_unseen_population: 20
      num_random_mix: 6
+      max_parameters: 5e7

  target:
    name: snp
-    connection_str_env_var: MODEL_STORAGE_CONNECTION_STRING
-    blob_container_name: models
-    table_name: status
-    partition_key: main
-    max_retries: 15
-    retry_interval: 120
    metric_key: mean
+    max_retries: 15
+    retry_interval: 60
    verbose: true
+
+aml:
+  connection_str: ${MODEL_STORAGE_CONNECTION_STRING}
+  blob_container_name: models
+  partition_key: main
+  experiment_name: facesynthetics
--- a/tasks/face_segmentation/data_prep/prep_data_store.py
+++ b/tasks/face_segmentation/data_prep/prep_data_store.py
@ -0,0 +1,31 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+import argparse
+import os
+from archai.datasets.cv.face_synthetics import FaceSyntheticsDatasetProvider
+
+
+def main():
+    """ This script downloads the Face Synthetics dataset to the specified folder. """
+    # input and output arguments
+    print("Starting prep_data_store...")
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--path", type=str, help="root folder to place the downloaded dataset.")
+    args = parser.parse_args()
+
+    path = args.path
+
+    print(f'Writing Face Synthetics dataset to: {path}')
+
+    if not path or not os.path.exists(path):
+        raise ValueError(f'Missing path: {path}')
+
+    provider = FaceSyntheticsDatasetProvider(dataset_dir=path)
+    # now force the full download to happen to that root folder.
+    provider.get_train_dataset()
+    provider.get_val_dataset()
+    provider.get_test_dataset()
+
+
+if __name__ == "__main__":
+    main()
--- a/tasks/face_segmentation/search.py
+++ b/tasks/face_segmentation/search.py
@ -21,6 +21,8 @@ from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzure

 from search_space.hgnet import HgnetSegmentationSearchSpace
 from training.partial_training_evaluator import PartialTrainingValIOU
+from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
+from aml.util.setup import configure_store

 AVAILABLE_ALGOS = {
    'mo_bananas': MoBananasSearch,
@ -47,21 +49,24 @@ def filter_extra_args(extra_args: List[str], prefix: str) -> List[str]:

 def main():
    parser = ArgumentParser()
-    parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
-    parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
+    parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.')
+    parser.add_argument('--output_dir', type=Path, help='Output directory.', default='output')
    parser.add_argument('--search_config', type=Path, help='Search config file.', default=confs_path / 'cpu_search.yaml')
    parser.add_argument('--serial_training', help='Search config file.', action='store_true')
    parser.add_argument('--gpus_per_job', type=float, help='Number of GPUs used per job (if `serial_training` flag is disabled)',
                        default=0.5)
-    parser.add_argument('--partial_tr_epochs', type=float, help='Number of epochs to run partial training', default=1.0)
+    parser.add_argument('--partial_tr_epochs', type=int, help='Number of epochs to run partial training', default=1)
    parser.add_argument('--seed', type=int, help='Random seed', default=42)
-    parser.add_argument('--max_parameters', type=float, help='Specify a maximum number of parameters in the model (default 50M or 5e7).', default=5e7)
+    parser.add_argument('--timeout', type=int, help='Timeout for partial training (in seconds)(default 10800)', default=10800)

    args, extra_args = parser.parse_known_args()

+    timeout_seconds = args.timeout
+
    # Filters extra args that have the prefix `search_space`
    search_extra_args = filter_extra_args(extra_args, 'search.')
-    search_config = Config(str(args.search_config), search_extra_args)['search']
+    config = Config(str(args.search_config), search_extra_args, resolve_env_vars=True)
+    search_config = config['search']

    # Search space
    ss_config = search_config['search_space']
@ -73,6 +78,9 @@ def main():

    input_shape = (1, search_space.in_channels, *search_space.img_size[::-1])

+    partial_training_output = args.output_dir / 'partial_training_logs'
+    os.makedirs(partial_training_output, exist_ok=True)
+
    # Search objectives
    so = SearchObjectives()

@ -81,12 +89,15 @@ def main():
    assert target_name in ['cpu', 'snp']

    max_latency = 0.3 if target_name == 'cpu' else 0.185
+    algo_config = search_config['algorithm']
+    algo_params = algo_config.get('params', {})
+    max_parameters = float(algo_params.pop('max_parameters', 5e7))

    # Adds a constraint on number of parameters so we don't sample models that are too large
    so.add_constraint(
        'Model Size (b)',
        TorchNumParameters(),
-        constraint=(1e6, args.max_parameters)
+        constraint=(1e6, max_parameters)
    )

    # Adds a constrained objective on model latency so we don't pick models that are too slow.
@ -100,24 +111,17 @@ def main():
        constraint=[0, max_latency]
    )

+    aml_training = False
+
    if target_name == 'snp':
        # Gets connection string from env variable
-        env_var_name = target_config.pop('connection_str_env_var')
-        con_str = os.getenv(env_var_name)
-        if not con_str:
-            print("Please set environment variable {env_var_name} containing the Azure storage account connection " +
-                  "string for the Azure storage account you want to use to control this experiment.")
-            sys.exit(1)
-
-        blob_container_name = target_config.pop('blob_container_name', 'models')
-        table_name = target_config.pop('table_name', 'status')
-        partition_key = target_config.pop('partition_key', 'main')
-        storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(con_str)
-        store = ArchaiStore(storage_account_name, storage_account_key, blob_container_name, table_name, partition_key)
-
+        aml_config = config['aml']
+        experiment_name = aml_config.get('experiment_name', 'facesynthetics')
+        store: ArchaiStore = configure_store(aml_config)
        evaluator = RemoteAzureBenchmarkEvaluator(
            input_shape=input_shape,
            store=store,
+            experiment_name=experiment_name,
            onnx_export_kwargs={'opset_version': 11},
            **target_config
        )
@ -129,20 +133,34 @@ def main():
            compute_intensive=True
        )

-    # Dataset provider
-    dataset_provider = FaceSyntheticsDatasetProvider(args.dataset_dir)
+        aml_training = 'training_cluster' in aml_config

-    partial_tr_obj = PartialTrainingValIOU(
-        dataset_provider,
-        tr_epochs=args.partial_tr_epochs,
-        output_dir=args.output_dir / 'partial_training_logs'
-    )
-
-    if not args.serial_training:
-        partial_tr_obj = RayParallelEvaluator(
-            partial_tr_obj, num_gpus=args.gpus_per_job,
-            max_calls=1
+    if aml_training:
+        # do the partial training on an AML gpu cluster
+        partial_tr_obj = AmlPartialTrainingEvaluator(
+            config,
+            tr_epochs=int(args.partial_tr_epochs),
+            timeout_seconds=timeout_seconds,
+            local_output=partial_training_output
        )
+    else:
+        if args.dataset_dir is None:
+            raise ValueError('--dataset_dir must be specified if target is not aml')
+
+        # Dataset provider
+        dataset_provider = FaceSyntheticsDatasetProvider(args.dataset_dir)
+
+        partial_tr_obj = PartialTrainingValIOU(
+            dataset_provider,
+            tr_epochs=args.partial_tr_epochs,
+            output_dir=partial_training_output
+        )
+
+        if not args.serial_training:
+            partial_tr_obj = RayParallelEvaluator(
+                partial_tr_obj, num_gpus=args.gpus_per_job,
+                max_calls=1
+            )

    so.add_objective(
        'Partial Training Val. IOU',
@ -152,11 +170,11 @@ def main():
    )

    # Search algorithm
-    algo_config = search_config['algorithm']
    algo = AVAILABLE_ALGOS[algo_config['name']](
        search_space, so,
-        output_dir=args.output_dir, seed=args.seed,
-        **algo_config.get('params', {}),
+        output_dir=args.output_dir,
+        seed=args.seed,
+        **algo_params,
    )

    algo.search()
--- a/tasks/face_segmentation/snpe/images/system.dgml
+++ b/tasks/face_segmentation/snpe/images/system.dgml
@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<DirectedGraph xmlns="http://schemas.microsoft.com/vs/2009/dgml">
-  <Nodes>
-    <Node Id="..." Bounds="-891.259155273438,-397.317448474421,50,25.96" UseManualLocation="True" />
-    <Node Id="...1" Bounds="-899.17041015625,-141.018983432007,50,25.96" Label="..." UseManualLocation="True" />
-    <Node Id="AzureBlobStore" Bounds="-1193.71833333333,-519.02,106.03,25.96" Label="azure blob store" UseManualLocation="True" />
-    <Node Id="AzureStatusTable" Bounds="-1193.71833333333,-445.02,112.973333333333,25.96" Label="azure status table" UseManualLocation="True" />
-    <Node Id="JupyterNotebook" Bounds="-1396.73103800456,-328.529480438232,115.163333333333,25.96" Label="Jupyter Notebook" UseManualLocation="True" />
-    <Node Id="KubernetesCluster" Bounds="-1208.17041666667,-308.193725585938,119.78,25.96" Label="Kubernetes Cluster" UseManualLocation="True" />
-    <Node Id="QualcommDevice" Bounds="-769.718333333333,-534.029998779297,115.2,25.96" Label="Qualcomm device" UseManualLocation="True" />
-    <Node Id="QualcommDevice1" Bounds="-771.540833333333,-453.524998168945,115.2,25.96" Label="Qualcomm device" UseManualLocation="True" />
-    <Node Id="Runner.py" Bounds="-903.436666666667,-534.04,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
-    <Node Id="Runner.py1" Bounds="-900.436666666666,-453.277548474121,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
-    <Node Id="Runner.py2" Bounds="-899.452100016276,-313.899282537842,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
-    <Node Id="Runner.py3" Bounds="-903.170422770182,-257.939182537842,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
-    <Node Id="Runner.py4" Bounds="-901.449353434245,-196.979083432007,70.4533333333334,25.96" Label="runner.py" UseManualLocation="True" />
-    <Node Id="SnpeQuantiation" Bounds="-760.452083333333,-251.989580438032,109.126666666667,25.96" Label="snpe quantiation" UseManualLocation="True" />
-    <Node Id="SnpeQuantization" Bounds="-765.452083333333,-319.111862182617,114.553333333333,25.96" Label="snpe quantization" UseManualLocation="True" />
-    <Node Id="Test_onnx" Bounds="-761.731038004557,-196.029480438032,70.6,25.96" Label="test_onnx" UseManualLocation="True" />
-    <Node Id="Upload" Bounds="-1358,-486,56.9533333333334,25.96" Label="upload" UseManualLocation="True" />
-  </Nodes>
-  <Links>
-    <Link Source="AzureBlobStore" Target="Runner.py" Bounds="-1087.68830810547,-518.622952023936,175.265282619028,9.66072548941548" />
-    <Link Source="AzureBlobStore" Target="Runner.py1" Bounds="-1088.87310498647,-493.671486483734,179.682249037085,42.8785481723203" />
-    <Link Source="AzureBlobStore" Target="Runner.py2" Bounds="-1123.20784148837,-493.060008544922,234.258958940773,173.798238915231" />
-    <Link Source="AzureStatusTable" Target="JupyterNotebook" Bounds="-1308.85493100781,-419.059993286133,149.124540767732,86.0330250379735" />
-    <Link Source="AzureStatusTable" Target="Runner.py" Bounds="-1100.05297851563,-505.965637207031,188.762878417969,60.9456481933594" />
-    <Link Source="AzureStatusTable" Target="Runner.py1" Bounds="-1128.9501953125,-459.267211914063,219.597229003906,14.2472229003906" />
-    <Link Source="AzureStatusTable" Target="Runner.py2" Bounds="-1125.57153320313,-419.059997558594,218.665649414063,102.314849853516" />
-    <Link Source="KubernetesCluster" Target="Runner.py2" Bounds="-1088.39039550781,-300.030978319804,179.940110471371,3.61429271166003" />
-    <Link Source="KubernetesCluster" Target="Runner.py3" Bounds="-1088.48298803671,-284.494147681096,176.453782358855,31.6319870791329" />
-    <Link Source="KubernetesCluster" Target="Runner.py4" Bounds="-1115.36108846197,-282.233729858398,207.979818936008,82.0059196880921" />
-    <Link Source="Runner.py" Target="AzureStatusTable" Bounds="-1131.49816894531,-516.163635253906,228.0615234375,69.8030700683594" />
-    <Link Source="Runner.py" Target="QualcommDevice" Bounds="-832.983324788411,-521.057729225212,54.2649914734869,0.00347075341346681" />
-    <Link Source="Runner.py1" Target="AzureStatusTable" Bounds="-1096.17529296875,-453.341644287109,195.738647460938,11.53271484375" />
-    <Link Source="Runner.py1" Target="QualcommDevice1" Bounds="-829.983324788411,-440.436044683939,49.4425034983809,0.0808848954610539" />
-    <Link Source="Runner.py2" Target="AzureStatusTable" Bounds="-1130.11853027344,-414.711029052734,230.666442871094,107.175262451172" />
-    <Link Source="Runner.py2" Target="SnpeQuantization" Bounds="-828.998766682943,-303.91817535817,54.5517017881938,1.8222062802526" />
-    <Link Source="Runner.py3" Target="SnpeQuantiation" Bounds="-832.717089436849,-243.665883447078,63.271065419621,2.32289765526392" />
-    <Link Source="Runner.py4" Target="Test_onnx" Bounds="-830.996020100911,-183.759789886113,60.2651897407106,0.409381159973066" />
-    <Link Source="Upload" Target="AzureBlobStore" Bounds="-1301.04664876302,-495.218628273773,98.4628791895163,17.2187510083821" />
-    <Link Source="Upload" Target="AzureStatusTable" Bounds="-1301.04664876302,-466.951233796788,99.2138483678966,21.1438396495243" />
-  </Links>
-  <Properties>
-    <Property Id="Bounds" DataType="System.Windows.Rect" />
-    <Property Id="Label" Label="Label" Description="Displayable label of an Annotatable object" DataType="System.String" />
-    <Property Id="UseManualLocation" DataType="System.Boolean" />
-  </Properties>
-</DirectedGraph>
--- a/tasks/face_segmentation/snpe/images/system.png
+++ b/tasks/face_segmentation/snpe/images/system.png
--- a/tasks/face_segmentation/snpe/readme.md
+++ b/tasks/face_segmentation/snpe/readme.md
@ -1,70 +0,0 @@
-# Readme
-
-This folder contains code that automates the testing of ONNX models across one or more machines that are connected via
-USB to Qualcomm 888 boards.
-
-The code is organized into:
-
-1. [SNPE Device Code](snpe/readme.md) that knows how to use the Qualcomm Neural Processing SDK to talk to the device,
-convert ONNX models to .dlc, quantize them, and test them on the board using the Android `adb` tool.
-
-1. [Azure Code](azure/readme.md) that talks to a configured Azure storage account for uploading models to test,
-downloading them, uploading test results, and keeping an Azure table "status" that summarizes results of all your
-models.
-
-1. [Docker](docker/quantizer/readme.md) scripts for setting up your Azure account and optionally creating a docker image
-for running in an Azure Kubernetes cluster to do model quantization using the Qualcomm Neural Processing SDK.
-Quantization is time consuming so having an elastic scale speeds things up a lot.
-
-1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the results from the
-Azure "status" table.
-
-It is best if you setup a new Conda Python environment for Python 3.10 with the `requirements.txt` included here using:
-
-```shell
-pip install -r requirements.txt
-```
-
-The SNPE SDK only works on Linux, so you need a Linux machine with this repo. Then follow additional setup in each of
-the above readmes.
-
-## Workflow
-
-The overall workflow looks like this. One or more Linux machines are setup as above and are running `azure/runner.py`
-including a Kubernetes cluster setup for quantization (see [docker/quantizer](docker/quantizer) folder).
-
-![system](images/system.png)
-
-Each instance of `runner.py` looks for work, and executes it in priority order where the prioritization is defined by
-the `find_work_prioritized` function in the runner.  This script is completely restartable, and can distribute the work
-across multiple instances of the runner script.  Each instance will pick up where a previous one left off based on what
-it finds in your "status" Azure table. The prioritization maps to the columns of the status table as follows:
-
-1. **macs:** convert to .dlc and post Macs score and `snpe-dlc-viewer` output and do model quantization (runs on Linux) - priority 20
-1. **total_inference_avg** run `snpe_bench.py` with quantized model on Qualcomm device DSP - priority 30
-1. **f1_onnx** compute f1 from onnxruntime on .onnx model on a 10k test set on Linux - priority 60
-1. **f1_1k** compute f1 on quantized .dlc model on Qualcomm device DSP with a 1k test set - priority
-is the mean f1 score so that quicker models are prioritized.
-1. **f1_1k_f** compute f1 on floating point .dlc model on on Qualcomm device CPU with a 1k test set
-   - priority 10 * the mean f1 score so that quicker models are prioritized.
-1. **f1_10k** compute f1 on quantized model on a 10k test set - priority = 100 * the mean f1 score
-   so that quicker models are prioritized.
-
-Lower number means higher priority job and each machine will run the highest priority work first.
-
-You can override the priority of a specific job by passing a `--proprity` parameter on the `upload.py` script or by
-editing the Azure `status` table and adding a `priority` field to the JSON stored there. You can set any priority number
-you want, if you specify priority 0 it will run before anything else which can be handy if you have a cool new model
-that you want to bump to the top of the list.
-
-Notice some of the above jobs can run on Linux and do not require Qualcomm device. So in order to maximize throughput on
-machines that do have a Qualcomm devices you can allocate other Linux machines with no Qualcomm devices to do the other
-work, namely, converting models, quantizing them, and running the `f1_onnx` test.
-
-Folks across your team can use the `azure/upload.py` to submit jobs and let them run, or they can automate that as
-shown in the `RemoteAzureBenchmarkEvaluator` in the `search.py` script.
-
-You can use `status.py` to monitor progress or look at the Azure `status` table.  Various status messages are posted
-there so you can see which machine is doing what and is in what stage of the job.
-
-Next you can go to the `notebook` page and get some pretty pictures of your Pareto Curves.
--- a/tasks/face_segmentation/train.py
+++ b/tasks/face_segmentation/train.py
@ -3,7 +3,8 @@

 from pathlib import Path
 from argparse import ArgumentParser
-
+import os
+import time
 import torch
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
@ -12,64 +13,122 @@ from archai.datasets.cv.face_synthetics import FaceSyntheticsDatasetProvider
 from archai.discrete_search.search_spaces.config import ArchConfig
 from search_space.hgnet import StackedHourglass
 from training.pl_trainer import SegmentationTrainingLoop
+from archai.common.store import ArchaiStore
+from archai.common.config import Config


-parser = ArgumentParser()
-parser.add_argument('arch', type=Path)
-parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
-parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
-parser.add_argument('--lr', type=float, default=2e-4)
-parser.add_argument('--batch_size', type=int, default=16)
-parser.add_argument('--epochs', type=int, default=1)
-parser.add_argument('--val_check_interval', type=float, default=1)
+def main():
+    parser = ArgumentParser()
+    parser.add_argument('arch', type=Path)
+    parser.add_argument('--dataset_dir', type=Path, help='Face Synthetics dataset directory.', required=True)
+    parser.add_argument('--output_dir', type=Path, help='Output directory.', required=True)
+    parser.add_argument('--lr', type=float, default=2e-4)
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--epochs', type=int, default=1)
+    parser.add_argument('--val_check_interval', type=float, default=1.0)
+    parser.add_argument('--model_id', type=str, default=None)
+    parser.add_argument('--config', type=Path, default=None)
+    args = parser.parse_args()
+
+    model_id = args.model_id
+    store: ArchaiStore = None
+    epochs = 1 if args.epochs < 1 else args.epochs
+
+    start_time = time.time()
+    storing = False
+    config = args.config
+    experiment_name = None
+    if config and config.is_file():
+        config = Config(str(config))
+        if 'aml' in config:
+            # we are running in azure ml.
+            aml_config = config['aml']
+            metric_key = config['training'].get('metric_key')
+            connection_str = aml_config['connection_str']
+            experiment_name = aml_config['experiment_name']
+            storage_account_name, storage_account_key = ArchaiStore.parse_connection_string(connection_str)
+            store = ArchaiStore(storage_account_name, storage_account_key, table_name=experiment_name)
+            storing = True
+
+    try:
+        if storing:
+            print(f'Locking entity {model_id}')
+            e = store.lock(model_id, 'training')
+            if e is None:
+                e = store.get_status(model_id)
+                node = e['node']
+                raise Exception(f'Entity should not be locked by: "{node}"')
+
+            pipeline_id = os.getenv('AZUREML_ROOT_RUN_ID')
+            if pipeline_id is not None:
+                e['pipeline_id'] = pipeline_id
+                store.merge_status_entity(e)
+
+        arch_config = ArchConfig.from_file(args.arch)
+        model = StackedHourglass(arch_config, num_classes=18)
+
+        pl_model = SegmentationTrainingLoop(model, lr=args.lr)
+        dataset_prov = FaceSyntheticsDatasetProvider(args.dataset_dir)
+
+        tr_dl = torch.utils.data.DataLoader(
+            dataset_prov.get_train_dataset(), batch_size=args.batch_size, num_workers=8,
+            shuffle=True
+        )
+
+        val_dl = torch.utils.data.DataLoader(
+            dataset_prov.get_val_dataset(), batch_size=args.batch_size, num_workers=8
+        )
+
+        callbacks = [
+            ModelCheckpoint(
+                dirpath=str(args.output_dir / 'checkpoints'),
+                monitor='validation_loss', mode='min',
+                save_last=True, save_top_k=1, verbose=True,
+                filename='{epoch}-{step}-{validation_loss:.2f}'
+            )
+        ]
+
+        trainer = Trainer(
+            default_root_dir=str(args.output_dir), accelerator='gpu',
+            val_check_interval=args.val_check_interval,
+            max_epochs=epochs,
+            callbacks=callbacks
+        )
+
+        trainer.fit(pl_model, tr_dl, val_dl)
+
+        val_result = trainer.validate(trainer.model, val_dl)
+        print(val_result)
+        end_time = time.time()
+
+        if storing:
+            # post updated progress to our unified status table and unlock the row.
+            metric = float(val_result[0]['validation_mIOU'])
+            print(f"Storing {metric_key}={metric} for model {model_id}")
+            e = store.get_status(model_id)
+            e[metric_key] = metric
+            e['status'] = 'complete'
+            e['training_time'] = end_time - start_time
+            store.unlock_entity(e)
+
+        trainer.save_checkpoint(args.output_dir / 'model.ckpt')
+
+        # Save onnx model.
+        input_shape = (1, 3, 256, 256)
+        rand_range = (0.0, 1.0)
+        export_kwargs = {'opset_version': 11}
+        rand_min, rand_max = rand_range
+        sample_input = ((rand_max - rand_min) * torch.rand(*input_shape) + rand_min).type("torch.FloatTensor")
+        onnx_file = str(args.output_dir / 'model.onnx')
+        torch.onnx.export(model, (sample_input,), onnx_file, input_names=["input_0"], **export_kwargs, )
+
+    except Exception as ex:
+        # record failed state.
+        if storing:
+            e['status'] = 'failed'
+            e['error'] = str(ex)
+            store.unlock_entity(e)


 if __name__ == '__main__':
-    args = parser.parse_args()
-
-    arch_config = ArchConfig.from_file(args.arch)
-    model = StackedHourglass(arch_config, num_classes=18)
-
-    pl_model = SegmentationTrainingLoop(model, lr=args.lr)
-    dataset_prov = FaceSyntheticsDatasetProvider(args.dataset_dir)
-
-    tr_dl = torch.utils.data.DataLoader(
-        dataset_prov.get_train_dataset(), batch_size=args.batch_size, num_workers=8,
-        shuffle=True
-    )
-
-    val_dl = torch.utils.data.DataLoader(
-        dataset_prov.get_val_dataset(), batch_size=args.batch_size, num_workers=8
-    )
-
-    callbacks = [
-        ModelCheckpoint(
-            dirpath=str(args.output_dir / 'checkpoints'),
-            monitor='validation_loss', mode='min',
-            save_last=True, save_top_k=1, verbose=True,
-            filename='{epoch}-{step}-{validation_loss:.2f}'
-        )
-    ]
-
-    trainer = Trainer(
-        default_root_dir=str(args.output_dir), accelerator='gpu',
-        val_check_interval=args.val_check_interval,
-        max_epochs=args.epochs,
-        callbacks=callbacks
-    )
-
-    trainer.fit(pl_model, tr_dl, val_dl)
-
-    val_result = trainer.validate(trainer.model, val_dl)
-    print(val_result)
-
-    trainer.save_checkpoint(args.output_dir / 'final_model.ckpt')
-
-    # Save onnx model.
-    input_shape = (1, 3, 256, 256)
-    rand_range = (0.0, 1.0)
-    export_kwargs = {'opset_version': 11}
-    rand_min, rand_max = rand_range
-    sample_input = ((rand_max - rand_min) * torch.rand(*input_shape) + rand_min).type("torch.FloatTensor")
-    onnx_file = str(args.output_dir / 'final_model.onnx')
-    torch.onnx.export(model, (sample_input,), onnx_file, input_names=["input_0"], **export_kwargs, )
+    main()
--- a/tasks/face_segmentation/training/partial_training_evaluator.py
+++ b/tasks/face_segmentation/training/partial_training_evaluator.py
@ -14,7 +14,7 @@ class PartialTrainingValIOU(ModelEvaluator):
                 output_dir: str, tr_epochs: float = 1.0,
                 batch_size: int = 16, lr: float = 2e-4,
                 tr_dl_workers: int = 8, val_dl_workers: int = 8,
-                 val_check_interval: float = 1):
+                 val_check_interval: float = 1.0):
        self.dataset_provider = dataset_provider
        self.output_dir = Path(output_dir)
        self.tr_epochs = tr_epochs