* remove old notebook

* store onnx latency
allow aml partial training with no snapdragon mode

* fix docker file now that aml branch is merged.

* fix bug in reset
add notebook

* add link to notebook.
This commit is contained in:
Chris Lovett 2023-04-22 03:23:38 -07:00 коммит произвёл GitHub
Родитель aae38db1a5
Коммит fc031eb447
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
11 изменённых файлов: 278 добавлений и 681 удалений

Просмотреть файл

@ -390,7 +390,8 @@ class ArchaiStore:
self._reset(e, except_list)
def _reset(self, e, except_list=[]):
if self.is_locked_by_other(e):
name = e['name']
if self.is_locked_by_other(name):
node = self.get_lock(e)
print(f"Skipping {e['RowKey']} as it is locked by {node}")
elif self._reset_metrics(e, except_list):

Просмотреть файл

@ -72,6 +72,15 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
_ = self.store.get_existing_status(unknown_id)
_ = self.store.list_blobs(unknown_id)
def _reset(self, entity):
    """Strip benchmark result fields from a status entity and persist the change.

    The keys 'mean', 'macs', 'params', 'stdev', 'total_inference_avg' and
    'error' are deleted from ``entity`` in place when present. The store is
    only asked to update the entity if at least one key was actually removed.

    Args:
        entity: Mutable mapping holding the status row; modified in place.
    """
    benchmark_keys = ('mean', 'macs', 'params', 'stdev', 'total_inference_avg', 'error')
    stale_keys = [key for key in benchmark_keys if key in entity]
    for key in stale_keys:
        del entity[key]
    # Skip the store round-trip when nothing was removed.
    if stale_keys:
        self.store.update_status_entity(entity)
@overrides
def send(self, arch: ArchaiModel, budget: Optional[float] = None) -> None:
# bug in azure ml sdk requires blob store folder names not begin with digits, so we prefix with 'id_'
@ -88,8 +97,8 @@ class RemoteAzureBenchmarkEvaluator(AsyncModelEvaluator):
print(f"Entry for {archid} already exists with {self.metric_key} = {value}")
return
else:
# complete but missing the mean, so reset everything so we can try again below.
self.store.reset(archid, ['benchmark_only', 'model_date'])
# complete but missing the mean, so reset the benchmark metrics so we can try again.
self._reset(entity)
else:
# job is still running, let it continue
if self.verbose:

Просмотреть файл

@ -129,7 +129,7 @@ def main(output_dir: Path, experiment_name: str, seed: int):
ml_client,
image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:latest",
conda_file="conda.yaml",
version='1.0.20')
version='1.0.21')
environment_name = f"{archai_job_env.name}:{archai_job_env.version}"
# Register the datastore with AML

Просмотреть файл

@ -63,7 +63,7 @@ RUN wget -O azcopy_v10.tar.gz https://aka.ms/downloadazcopy-v10-linux && tar -xf
# this echo is a trick to bypass docker build cache.
# simply change the echo string every time you want docker build to pull down new bits.
RUN echo '04/18/2023 12:22 PM' >/dev/null && git clone https://github.com/microsoft/archai.git
RUN cd archai && git checkout clovett/aml && pip install -e .[dev]
RUN cd archai && pip install -e .[dev]
RUN echo "using this pip version: " && which pip
RUN echo "using this python version: " && which python

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,103 +0,0 @@
import numpy as np
from scipy import interpolate
from scipy import spatial
from functools import reduce
def spline_fit(xs, ys):
    """Fit a smoothing spline through the points and sample it on a regular grid.

    The points are reordered by ascending ``xs`` before fitting. The spline is
    a ``scipy.interpolate.UnivariateSpline`` whose smoothing factor is set to
    0.1; it is then evaluated at 100 evenly spaced positions between the
    smallest and the largest x value.

    Args:
        xs: x coordinates of the data points.
        ys: y coordinates of the data points, in the same order as ``xs``.

    Returns:
        A two-element list ``[xn, yn]`` holding the 100 sample positions and
        the spline values at those positions.
    """
    order = np.argsort(np.array(xs), axis=0)
    sorted_xs = [xs[i] for i in order]
    sorted_ys = [ys[i] for i in order]

    spline = interpolate.UnivariateSpline(sorted_xs, sorted_ys)
    spline.set_smoothing_factor(0.1)

    grid = np.linspace(sorted_xs[0], sorted_xs[-1], 100)
    return [grid, spline(grid)]
def filter_(pts, pt):
    """
    Keep the rows of pts that are not Pareto dominated by the point pt.

    A row is dropped only when it is less than or equal to pt in every
    coordinate and strictly less than pt in at least one coordinate, so a
    row equal to pt is kept.
    """
    never_better = (pts <= pt).all(axis=-1)
    somewhere_worse = (pts < pt).any(axis=-1)
    # De Morgan form of "not (never_better and somewhere_worse)"
    keep = ~never_better | ~somewhere_worse
    return pts[keep]
def get_pareto_undominated_by(pts1, pts2=None):
    """
    Return the points of pts1 that no point of pts2 Pareto dominates.

    When pts2 is omitted, pts1 is filtered against itself. Each point of
    pts2 is applied in turn through filter_, narrowing the surviving set.
    """
    dominators = pts1 if pts2 is None else pts2
    survivors = pts1
    for candidate in dominators:
        survivors = filter_(survivors, candidate)
    return survivors
def get_pareto_frontier(pts):
    """
    Collect Pareto-undominated points by repeatedly peeling the convex hull.

    Each round takes the convex hull vertices of the remaining points, keeps
    the hull vertices that are undominated among themselves, and discards
    every remaining non-hull point those kept vertices dominate. Once fewer
    than 10 points remain, they are filtered against each other directly.
    The groups kept in each round are stacked into one array.
    """
    frontier_parts = []

    while pts.shape[0]:
        # few points left: compare them all against each other and stop
        if pts.shape[0] < 10:
            frontier_parts.append(get_pareto_undominated_by(pts))
            break

        hull_idx = spatial.ConvexHull(pts).vertices

        # hull vertices that are undominated among the hull vertices
        hull_front = get_pareto_undominated_by(pts[hull_idx])
        frontier_parts.append(hull_front)

        # continue with the non-hull points that hull_front does not dominate
        on_hull = np.zeros(pts.shape[0], dtype=bool)
        on_hull[hull_idx] = True
        pts = get_pareto_undominated_by(pts[~on_hull], hull_front)

    return np.vstack(frontier_parts)
def pareto_curve(Xs, Ys):
    """Return the indices of the input points that lie on the Pareto frontier.

    The x axis is negated before the frontier is computed, so smaller ``Xs``
    values and larger ``Ys`` values are both treated as better.

    Args:
        Xs: x coordinates of the points (lower is better).
        Ys: y coordinates of the points, in the same order as ``Xs``.

    Returns:
        A list with one index into ``Xs``/``Ys`` per frontier point, in the
        order the frontier points are produced by ``get_pareto_frontier``.
        If identical points occur more than once, each of them maps to the
        index of the first occurrence.
    """
    # in our case faster is better so invert the X axis
    pts = np.array(list(zip(Xs, Ys))) * np.array((-1, 1))
    frontier = get_pareto_frontier(pts)
    indices = []
    for pt in frontier:
        # Match on ALL coordinates. A plain `pts == pt` is an element-wise
        # mask, so np.where on it would also report rows that share only
        # one coordinate with the frontier point and could pick the wrong one.
        matching_rows = np.where((pts == pt).all(axis=1))[0]
        indices.append(matching_rows[0])
    return indices
def get_pareto_edges(xs, ys):
    """Compute the bounding box of the points given by ``xs`` and ``ys``.

    Args:
        xs: indexable sequence of x coordinates.
        ys: indexable sequence of y coordinates, at least as long as ``xs``.

    Returns:
        A tuple ``(left, bottom, right, top)`` with the smallest x, smallest
        y, largest x and largest y. All four are ``None`` when ``xs`` is empty.
    """
    left = bottom = top = right = None
    for idx in range(len(xs)):
        x, y = xs[idx], ys[idx]
        left = x if left is None or x < left else left
        bottom = y if bottom is None or y < bottom else bottom
        top = y if top is None or y > top else top
        right = x if right is None or x > right else right
    return (left, bottom, right, top)

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -24,9 +24,8 @@ creating a docker image for running in an Azure Kubernetes cluster to do model q
the Qualcomm Neural Processing SDK. Quantization is time consuming so having an elastic scale speeds
things up a lot.
1. [Notebooks](notebook/gallery_performance.md) contains a Jupyter Notebook that can visualize the
results from your Azure "status" table.
1. [Notebook](notebooks/results.ipynb) is a simple Jupyter notebook for visualizing the
results in your Azure table.
## Workflow

Просмотреть файл

@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Any, Dict, List, Optional, Tuple, Union
from overrides import overrides
from archai.discrete_search.evaluators import AvgOnnxLatency
from archai.discrete_search.api.archai_model import ArchaiModel
from archai.common.store import ArchaiStore
class AvgOnnxLatencyEvaluator(AvgOnnxLatency):
    """Measure the average ONNX latency (in seconds) of an architecture and record it.

    The measurement itself is delegated to `AvgOnnxLatency`, which runs the model on
    random inputs and averages the latency over `num_trials` trials. When a store is
    configured, each result is also written to that store under `metric_key`.

    """

    def __init__(
        self,
        input_shape: Union[Tuple[int, ...], List[Tuple[int, ...]]],
        num_trials: Optional[int] = 1,
        input_dtype: Optional[str] = "torch.FloatTensor",
        rand_range: Optional[Tuple[float, float]] = (0.0, 1.0),
        export_kwargs: Optional[Dict[str, Any]] = None,
        device: Optional[str] = 'cpu',
        inf_session_kwargs: Optional[Dict[str, Any]] = None,
        store: ArchaiStore = None,
        metric_key: str = 'onnx_latency'
    ) -> None:
        """Initialize the evaluator.

        Args:
            input_shape: Forwarded unchanged to `AvgOnnxLatency`.
            num_trials: Forwarded unchanged to `AvgOnnxLatency`.
            input_dtype: Forwarded unchanged to `AvgOnnxLatency`.
            rand_range: Forwarded unchanged to `AvgOnnxLatency`.
            export_kwargs: Forwarded unchanged to `AvgOnnxLatency`.
            device: Forwarded unchanged to `AvgOnnxLatency`.
            inf_session_kwargs: Forwarded unchanged to `AvgOnnxLatency`.
            store: Store that receives each result; nothing is recorded when `None`.
            metric_key: Name of the status field the latency is written to.

        """
        super().__init__(
            input_shape,
            num_trials,
            input_dtype,
            rand_range,
            export_kwargs,
            device,
            inf_session_kwargs,
        )
        self.metric_key = metric_key
        self.store = store

    @overrides
    def evaluate(self, model: ArchaiModel, budget: Optional[float] = None) -> float:
        latency = super().evaluate(model, budget)
        if self.store is None:
            return latency

        # status rows are keyed by the architecture id prefixed with 'id_'
        entity = self.store.get_status(f'id_{model.archid}')
        entity['status'] = 'complete'
        entity[self.metric_key] = latency
        self.store.merge_status_entity(entity)
        return latency

Просмотреть файл

@ -92,9 +92,6 @@ aml:
- matplotlib
- mldesigner
- mlflow
- onnx>=1.10.2
- onnxruntime>=1.10.0
- psutil
- torch
- torchvision
- torchaudio

Просмотреть файл

@ -16,13 +16,14 @@ from archai.discrete_search.algos import (
MoBananasSearch, EvolutionParetoSearch, LocalSearch,
RandomSearch, RegularizedEvolutionSearch
)
from archai.discrete_search.evaluators import TorchNumParameters, AvgOnnxLatency, RayParallelEvaluator
from archai.discrete_search.evaluators import TorchNumParameters, RayParallelEvaluator
from archai.discrete_search.evaluators.remote_azure_benchmark import RemoteAzureBenchmarkEvaluator
from search_space.hgnet import HgnetSegmentationSearchSpace
from training.partial_training_evaluator import PartialTrainingValIOU
from aml.training.aml_training_evaluator import AmlPartialTrainingEvaluator
from aml.util.setup import configure_store
from aml.training.onnx_latency import AvgOnnxLatencyEvaluator
AVAILABLE_ALGOS = {
'mo_bananas': MoBananasSearch,
@ -100,24 +101,27 @@ def main():
constraint=(1e6, max_parameters)
)
aml_training = False
store = None
if 'aml' in config:
aml_config = config['aml']
experiment_name = aml_config.get('experiment_name', 'facesynthetics')
store: ArchaiStore = configure_store(aml_config)
aml_training = 'training_cluster' in aml_config
# Adds a constrained objective on model latency so we don't pick models that are too slow.
so.add_objective(
'CPU ONNX Latency (s)',
AvgOnnxLatency(
input_shape=input_shape, export_kwargs={'opset_version': 11}
AvgOnnxLatencyEvaluator(
input_shape=input_shape, export_kwargs={'opset_version': 11}, store=store
),
higher_is_better=False,
compute_intensive=False,
constraint=[0, max_latency]
)
aml_training = False
if target_name == 'snp':
# Gets connection string from env variable
aml_config = config['aml']
experiment_name = aml_config.get('experiment_name', 'facesynthetics')
store: ArchaiStore = configure_store(aml_config)
evaluator = RemoteAzureBenchmarkEvaluator(
input_shape=input_shape,
store=store,
@ -133,8 +137,6 @@ def main():
compute_intensive=True
)
aml_training = 'training_cluster' in aml_config
if aml_training:
# do the partial training on an AML gpu cluster
partial_tr_obj = AmlPartialTrainingEvaluator(