зеркало из https://github.com/microsoft/nni.git
Migrate pipeline to 1ES (#4986)
This commit is contained in:
Родитель
570448eab8
Коммит
4e71ed6211
|
@ -171,7 +171,8 @@ def load_training_service_config(config) -> TrainingServiceConfig:
|
|||
cls = _get_ts_config_class(config['platform'])
|
||||
if cls is not None:
|
||||
return cls(**config)
|
||||
return config # not valid json, don't touch
|
||||
# not valid json, don't touch
|
||||
return config # type: ignore
|
||||
|
||||
def _get_ts_config_class(platform: str) -> type[TrainingServiceConfig] | None:
|
||||
from ..training_service import TrainingServiceConfig # avoid circular import
|
||||
|
|
|
@ -10,6 +10,7 @@ import string
|
|||
from typing import Any, Dict, Iterable, List
|
||||
|
||||
from nni.experiment import rest
|
||||
from nni.retiarii.integration import RetiariiAdvisor
|
||||
|
||||
from .interface import AbstractExecutionEngine, AbstractGraphListener
|
||||
from .utils import get_mutation_summary
|
||||
|
@ -75,20 +76,21 @@ class BaseExecutionEngine(AbstractExecutionEngine):
|
|||
self.url_prefix = rest_url_prefix
|
||||
|
||||
self._listeners: List[AbstractGraphListener] = []
|
||||
|
||||
# register advisor callbacks
|
||||
advisor = get_advisor()
|
||||
advisor.send_trial_callback = self._send_trial_callback
|
||||
advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
|
||||
advisor.trial_end_callback = self._trial_end_callback
|
||||
advisor.intermediate_metric_callback = self._intermediate_metric_callback
|
||||
advisor.final_metric_callback = self._final_metric_callback
|
||||
|
||||
self._running_models: Dict[int, Model] = dict()
|
||||
self._history: List[Model] = []
|
||||
|
||||
self.resources = 0
|
||||
|
||||
# register advisor callbacks
|
||||
advisor: RetiariiAdvisor = get_advisor()
|
||||
advisor.register_callbacks({
|
||||
'send_trial': self._send_trial_callback,
|
||||
'request_trial_jobs': self._request_trial_jobs_callback,
|
||||
'trial_end': self._trial_end_callback,
|
||||
'intermediate_metric': self._intermediate_metric_callback,
|
||||
'final_metric': self._final_metric_callback
|
||||
})
|
||||
|
||||
def submit_models(self, *models: Model) -> None:
|
||||
for model in models:
|
||||
data = self.pack_model_data(model)
|
||||
|
|
|
@ -14,6 +14,7 @@ from dataclasses import dataclass
|
|||
|
||||
from nni.common.device import GPUDevice, Device
|
||||
from nni.experiment.config.training_services import RemoteConfig
|
||||
from nni.retiarii.integration import RetiariiAdvisor
|
||||
from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo
|
||||
from .. import codegen, utils
|
||||
from ..graph import Model, ModelStatus, MetricData, Node
|
||||
|
@ -28,6 +29,10 @@ from .base import BaseGraphData
|
|||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _noop(*args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrialSubmission:
|
||||
model: Model
|
||||
|
@ -90,12 +95,14 @@ class CGOExecutionEngine(AbstractExecutionEngine):
|
|||
self._queue_lock = threading.Lock()
|
||||
|
||||
# register advisor callbacks
|
||||
advisor = get_advisor()
|
||||
# advisor.send_trial_callback = self._send_trial_callback
|
||||
# advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
|
||||
advisor.trial_end_callback = self._trial_end_callback
|
||||
advisor.intermediate_metric_callback = self._intermediate_metric_callback
|
||||
advisor.final_metric_callback = self._final_metric_callback
|
||||
advisor: RetiariiAdvisor = get_advisor()
|
||||
advisor.register_callbacks({
|
||||
'send_trial': _noop,
|
||||
'request_trial_jobs': _noop,
|
||||
'trial_end': self._trial_end_callback,
|
||||
'intermediate_metric': self._intermediate_metric_callback,
|
||||
'final_metric': self._final_metric_callback
|
||||
})
|
||||
|
||||
self._stopped = False
|
||||
self._consumer_thread = threading.Thread(target=self._consume_models)
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Any, Callable, Optional, Dict, List, Tuple
|
||||
|
||||
import nni
|
||||
from nni.common.serializer import PayloadTooLarge
|
||||
|
@ -21,6 +21,7 @@ _logger = logging.getLogger(__name__)
|
|||
class RetiariiAdvisor(MsgDispatcherBase):
|
||||
"""
|
||||
The class is to connect Retiarii components to NNI backend.
|
||||
It can be considered as a Python wrapper of NNI manager.
|
||||
|
||||
It will function as the main thread when running a Retiarii experiment through NNI.
|
||||
Strategy will be launched as its thread, who will call APIs in execution engine. Execution
|
||||
|
@ -32,9 +33,6 @@ class RetiariiAdvisor(MsgDispatcherBase):
|
|||
The conversion advisor provides are minimum. It is only a send/receive module, and execution engine
|
||||
needs to handle all the rest.
|
||||
|
||||
FIXME
|
||||
How does advisor exit when strategy exists?
|
||||
|
||||
Attributes
|
||||
----------
|
||||
send_trial_callback
|
||||
|
@ -61,6 +59,63 @@ class RetiariiAdvisor(MsgDispatcherBase):
|
|||
|
||||
self.parameters_count = 0
|
||||
|
||||
# Sometimes messages arrive first before the callbacks get registered.
|
||||
# Or in case that we allow engine to be absent during the experiment.
|
||||
# Here we need to store the messages and invoke them later.
|
||||
self.call_queue: List[Tuple[str, list]] = []
|
||||
|
||||
def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
|
||||
"""
|
||||
Register callbacks for NNI backend.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
callbacks
|
||||
A dictionary of callbacks.
|
||||
The key is the name of the callback. The value is the callback function.
|
||||
"""
|
||||
self.send_trial_callback = callbacks.get('send_trial')
|
||||
self.request_trial_jobs_callback = callbacks.get('request_trial_jobs')
|
||||
self.trial_end_callback = callbacks.get('trial_end')
|
||||
self.intermediate_metric_callback = callbacks.get('intermediate_metric')
|
||||
self.final_metric_callback = callbacks.get('final_metric')
|
||||
|
||||
self.process_queued_callbacks()
|
||||
|
||||
def process_queued_callbacks(self) -> None:
|
||||
"""
|
||||
Process callbacks in queue.
|
||||
Consume the messages that haven't been handled previously.
|
||||
"""
|
||||
processed_idx = []
|
||||
for queue_idx, (call_name, call_args) in enumerate(self.call_queue):
|
||||
if call_name == 'send_trial' and self.send_trial_callback is not None:
|
||||
self.send_trial_callback(*call_args) # pylint: disable=not-callable
|
||||
processed_idx.append(queue_idx)
|
||||
if call_name == 'request_trial_jobs' and self.request_trial_jobs_callback is not None:
|
||||
self.request_trial_jobs_callback(*call_args) # pylint: disable=not-callable
|
||||
processed_idx.append(queue_idx)
|
||||
if call_name == 'trial_end' and self.trial_end_callback is not None:
|
||||
self.trial_end_callback(*call_args) # pylint: disable=not-callable
|
||||
processed_idx.append(queue_idx)
|
||||
if call_name == 'intermediate_metric' and self.intermediate_metric_callback is not None:
|
||||
self.intermediate_metric_callback(*call_args) # pylint: disable=not-callable
|
||||
processed_idx.append(queue_idx)
|
||||
if call_name == 'final_metric' and self.final_metric_callback is not None:
|
||||
self.final_metric_callback(*call_args) # pylint: disable=not-callable
|
||||
processed_idx.append(queue_idx)
|
||||
|
||||
# Remove processed messages
|
||||
for idx in reversed(processed_idx):
|
||||
self.call_queue.pop(idx)
|
||||
|
||||
def invoke_callback(self, name: str, *args: Any) -> None:
|
||||
"""
|
||||
Invoke callback.
|
||||
"""
|
||||
self.call_queue.append((name, list(args)))
|
||||
self.process_queued_callbacks()
|
||||
|
||||
def handle_initialize(self, data):
|
||||
"""callback for initializing the advisor
|
||||
Parameters
|
||||
|
@ -140,8 +195,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
|
|||
# nevertheless, there could still be blocked by pipe / nni-manager
|
||||
self.send(CommandType.NewTrialJob, send_payload)
|
||||
|
||||
if self.send_trial_callback is not None:
|
||||
self.send_trial_callback(parameters) # pylint: disable=not-callable
|
||||
self.invoke_callback('send_trial', parameters)
|
||||
return self.parameters_count
|
||||
|
||||
def mark_experiment_as_ending(self):
|
||||
|
@ -149,8 +203,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
|
|||
|
||||
def handle_request_trial_jobs(self, num_trials):
|
||||
_logger.debug('Request trial jobs: %s', num_trials)
|
||||
if self.request_trial_jobs_callback is not None:
|
||||
self.request_trial_jobs_callback(num_trials) # pylint: disable=not-callable
|
||||
self.invoke_callback('request_trial_jobs', num_trials)
|
||||
|
||||
def handle_update_search_space(self, data):
|
||||
_logger.debug('Received search space: %s', data)
|
||||
|
@ -158,22 +211,16 @@ class RetiariiAdvisor(MsgDispatcherBase):
|
|||
|
||||
def handle_trial_end(self, data):
|
||||
_logger.debug('Trial end: %s', data)
|
||||
if self.trial_end_callback is not None:
|
||||
self.trial_end_callback(nni.load(data['hyper_params'])['parameter_id'], # pylint: disable=not-callable
|
||||
data['event'] == 'SUCCEEDED')
|
||||
self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
|
||||
|
||||
def handle_report_metric_data(self, data):
|
||||
_logger.debug('Metric reported: %s', data)
|
||||
if data['type'] == MetricType.REQUEST_PARAMETER:
|
||||
raise ValueError('Request parameter not supported')
|
||||
elif data['type'] == MetricType.PERIODICAL:
|
||||
if self.intermediate_metric_callback is not None:
|
||||
self.intermediate_metric_callback(data['parameter_id'], # pylint: disable=not-callable
|
||||
self._process_value(data['value']))
|
||||
self.invoke_callback('intermediate_metric', data['parameter_id'], self._process_value(data['value']))
|
||||
elif data['type'] == MetricType.FINAL:
|
||||
if self.final_metric_callback is not None:
|
||||
self.final_metric_callback(data['parameter_id'], # pylint: disable=not-callable
|
||||
self._process_value(data['value']))
|
||||
self.invoke_callback('final_metric', data['parameter_id'], self._process_value(data['value']))
|
||||
|
||||
@staticmethod
|
||||
def _process_value(value) -> Any: # hopefully a float
|
||||
|
|
|
@ -127,9 +127,11 @@ class Random(BaseStrategy):
|
|||
if budget_exhausted():
|
||||
return
|
||||
time.sleep(self._polling_interval)
|
||||
_logger.debug('Still waiting for resource.')
|
||||
try:
|
||||
model = get_targeted_model(base_model, applied_mutators, sample)
|
||||
if filter_model(self.filter, model):
|
||||
_logger.debug('Submitting model: %s', model)
|
||||
submit_models(model)
|
||||
except InvalidMutation as e:
|
||||
_logger.warning(f'Invalid mutation: {e}. Skip.')
|
||||
|
|
|
@ -15,14 +15,19 @@ def main(argv):
|
|||
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
|
||||
|
||||
cmd = 'nvidia-smi -q -x'.split()
|
||||
while(True):
|
||||
try:
|
||||
smi_output = subprocess.check_output(cmd)
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
retry = 0
|
||||
while True:
|
||||
smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if smi.returncode != 0:
|
||||
retry += 1
|
||||
print(f'gpu_metrics_collector error: nvidia-smi return code is {smi.returncode}', file=sys.stderr)
|
||||
print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr)
|
||||
print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr)
|
||||
gen_empty_gpu_metric(metrics_output_dir)
|
||||
break
|
||||
parse_nvidia_smi_result(smi_output, metrics_output_dir)
|
||||
if retry >= 5:
|
||||
break
|
||||
else:
|
||||
parse_nvidia_smi_result(smi.stdout, metrics_output_dir)
|
||||
# TODO: change to sleep time configurable via arguments
|
||||
time.sleep(5)
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# FIXME: This pipeline is broken due to resource group location limitation.
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
|
||||
|
@ -11,6 +13,7 @@ variables:
|
|||
|
||||
jobs:
|
||||
- job: linux
|
||||
pool: nni-it
|
||||
pool:
|
||||
vmImage: ubuntu-latest
|
||||
steps:
|
||||
- template: templates/build-vm-image-template.yml
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# FIXME: This pipeline is broken due to resource group location limitation.
|
||||
|
||||
trigger: none
|
||||
pr: none
|
||||
|
||||
|
@ -11,7 +13,7 @@ variables:
|
|||
|
||||
jobs:
|
||||
- job: windows
|
||||
pool: nni-it
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 90
|
||||
steps:
|
||||
- template: templates/build-vm-image-template.yml
|
||||
|
|
|
@ -31,15 +31,18 @@ stages:
|
|||
condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
|
||||
jobs:
|
||||
- job: linux
|
||||
# move back after we complete the 1ES pool...
|
||||
pool:
|
||||
vmImage: ubuntu-latest
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
- template: templates/fix-apt-1es.yml
|
||||
parameters:
|
||||
check_gpu: true
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
platform: ubuntu-latest
|
||||
platform: ubuntu-latest-gpu
|
||||
python_env: venv
|
||||
|
||||
- template: templates/install-nni.yml
|
||||
|
||||
|
@ -48,10 +51,9 @@ stages:
|
|||
- script: |
|
||||
cd test/algo
|
||||
python -m pytest compression
|
||||
displayName: compression unit test
|
||||
displayName: Compression unit test
|
||||
|
||||
# add back after we complete the 1ES pool...
|
||||
# - script: |
|
||||
# cd test
|
||||
# source scripts/model_compression.sh
|
||||
# displayName: Model compression test
|
||||
- script: |
|
||||
cd test
|
||||
source scripts/model_compression.sh
|
||||
displayName: Model compression test
|
||||
|
|
|
@ -31,15 +31,18 @@ stages:
|
|||
condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
|
||||
jobs:
|
||||
- job: linux
|
||||
# move back after we complete the 1ES pool...
|
||||
pool:
|
||||
vmImage: ubuntu-latest
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
- template: templates/fix-apt-1es.yml
|
||||
parameters:
|
||||
check_gpu: true
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
platform: ubuntu-latest
|
||||
platform: ubuntu-latest-gpu
|
||||
python_env: venv
|
||||
|
||||
- template: templates/install-nni.yml
|
||||
|
||||
|
@ -57,10 +60,7 @@ stages:
|
|||
|
||||
- script: |
|
||||
cd test
|
||||
python training_service/nnitest/run_tests.py \
|
||||
--config training_service/config/integration_tests.yml \
|
||||
--ts local \
|
||||
--exclude mnist-pytorch-local-gpu
|
||||
python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local
|
||||
displayName: Integration test
|
||||
|
||||
# TODO: should add a test on platforms other than linux
|
||||
|
|
|
@ -31,15 +31,18 @@ stages:
|
|||
condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
|
||||
jobs:
|
||||
- job: linux
|
||||
# move back after we complete the 1ES pool...
|
||||
pool:
|
||||
vmImage: ubuntu-latest
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
- template: templates/fix-apt-1es.yml
|
||||
parameters:
|
||||
check_gpu: true
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
platform: ubuntu-latest
|
||||
platform: ubuntu-latest-gpu
|
||||
python_env: venv
|
||||
|
||||
- template: templates/install-nni.yml
|
||||
|
||||
|
@ -51,15 +54,17 @@ stages:
|
|||
displayName: NAS test
|
||||
|
||||
- job: windows
|
||||
# move back after we complete the 1ES pool...
|
||||
pool:
|
||||
vmImage: windows-latest
|
||||
pool: nni-it-1es-windows
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
# FIXME: Windows should use GPU,
|
||||
# but it's not used now since driver is not installed in the image.
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
platform: windows
|
||||
python_env: noop
|
||||
|
||||
- template: templates/install-nni.yml
|
||||
parameters:
|
||||
|
|
|
@ -7,11 +7,12 @@ schedules:
|
|||
|
||||
jobs:
|
||||
- job: hybrid
|
||||
pool: nni-it
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 90
|
||||
|
||||
steps:
|
||||
# FIXME: should use GPU here
|
||||
- template: templates/fix-apt-1es.yml
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
|
|
|
@ -7,10 +7,14 @@ schedules:
|
|||
|
||||
jobs:
|
||||
- job: linux
|
||||
pool: nni-it
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
- template: templates/fix-apt-1es.yml
|
||||
parameters:
|
||||
check_gpu: true
|
||||
|
||||
- template: templates/install-dependencies.yml
|
||||
parameters:
|
||||
platform: ubuntu-latest-gpu
|
||||
|
|
|
@ -7,7 +7,7 @@ schedules:
|
|||
|
||||
jobs:
|
||||
- job: windows
|
||||
pool: nni-it-windows
|
||||
pool: nni-it-1es-windows
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
|
@ -43,3 +43,5 @@ jobs:
|
|||
displayName: Integration test
|
||||
|
||||
- template: templates/save-crashed-info.yml
|
||||
parameters:
|
||||
training_service: local
|
||||
|
|
|
@ -12,10 +12,11 @@ schedules:
|
|||
|
||||
jobs:
|
||||
- job: remote_linux2linux
|
||||
pool: nni-it
|
||||
pool: nni-it-1es-11
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
- template: templates/fix-apt-1es.yml
|
||||
# FIXME: GPU is not supported yet.
|
||||
# Change to ubuntu-latest-gpu when it's done.
|
||||
|
||||
|
@ -97,4 +98,4 @@ jobs:
|
|||
|
||||
- template: templates/save-crashed-info.yml
|
||||
parameters:
|
||||
remote: true
|
||||
training_service: remote
|
||||
|
|
|
@ -11,7 +11,7 @@ variables:
|
|||
|
||||
jobs:
|
||||
- job: remote_windows2windows
|
||||
pool: nni-it-windows
|
||||
pool: nni-it-1es-windows
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
|
@ -49,4 +49,4 @@ jobs:
|
|||
|
||||
- template: templates/save-crashed-info.yml
|
||||
parameters:
|
||||
remote: true
|
||||
training_service: remote
|
||||
|
|
|
@ -8,8 +8,11 @@ steps:
|
|||
# 1. Assign the role following the instruction.
|
||||
# 2. Assign contributor role of the resource group to the identity.
|
||||
# 3. Add the identity to VMSS.
|
||||
#
|
||||
# Update 2022/7 (running on Microsoft-hosted agents).
|
||||
# Use a service principal. This service principal must be assigned contributor access to the resource group.
|
||||
- script: |
|
||||
az login --identity --allow-no-subscriptions --username $(identity_id)
|
||||
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
|
||||
displayName: Login to Azure
|
||||
|
||||
# Make sure all these are registered.
|
||||
|
@ -65,7 +68,8 @@ steps:
|
|||
export IP_ADDRESS=$(curl -s ifconfig.me)
|
||||
export VERSION=$(date "+%Y").$(date "+%m%d").$(date "+%H%M%S")
|
||||
export CONFIG_PATH=$(packer_config).json
|
||||
sed -i -e "s/<client_id>/$(identity_id)/g" $CONFIG_PATH
|
||||
sed -i -e "s/<client_id>/$(client_id)/g" $CONFIG_PATH
|
||||
sed -i -e "s/<client_secret>/$(client_secret)/g" $CONFIG_PATH
|
||||
sed -i -e "s/<subscription_id>/$(subscription_id)/g" $CONFIG_PATH
|
||||
sed -i -e "s/<managed_image_name>/$(managed_image_name)/g" $CONFIG_PATH
|
||||
sed -i -e "s/<resource_group>/$(resource_group)/g" $CONFIG_PATH
|
||||
|
@ -113,3 +117,6 @@ steps:
|
|||
# az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
|
||||
#
|
||||
# No need to update the image every time, because it's already set to latest.
|
||||
#
|
||||
# NOTE: After using 1ES pool, the pool image has to be updated manually to the latest version.
|
||||
# However, no successful build has been performed yet, because of resource shortage in Southeast Asia.
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
# Fix apt-related issues on 1ES linux pipeline.
|
||||
|
||||
# 1ES has an auto-upgraded with apt-get running in the background, periodically.
|
||||
# This leads to bad consequences:
|
||||
# 1) apt is locked when install is actually needed
|
||||
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
|
||||
#
|
||||
# The ultimate solution should be to upgrade the VM image correctly,
|
||||
# but it's currently infeasible because of a resource group limitation.
|
||||
# We introduce a workaround here by force disabling the auto-upgrade and,
|
||||
# fix the broken dependencies if upgrade has already been accidentally run.
|
||||
#
|
||||
# This file can be removed after image is updated to latest.
|
||||
|
||||
parameters:
|
||||
- name: check_gpu
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
steps:
|
||||
|
||||
# Don't set -e
|
||||
# Always make sure the lock is released.
|
||||
- script: |
|
||||
set -x
|
||||
sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
|
||||
sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install
|
||||
displayName: (1ES) Disable apt upgrade
|
||||
|
||||
# Make sure GPU isn't broken.
|
||||
# Sometimes we can't save the GPU because upgrade runs too early.
|
||||
# We have to rerun the pipeline if unlucky. But it doesn't matter if we don't intend to use GPU at all.
|
||||
- script: |
|
||||
echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
|
||||
nvidia-smi
|
||||
displayName: (1ES) Check GPU status
|
||||
condition: and(succeeded(), ${{ parameters.check_gpu }})
|
|
@ -2,9 +2,9 @@
|
|||
# so that further offline investigations are possible.
|
||||
|
||||
parameters:
|
||||
- name: remote
|
||||
type: boolean
|
||||
default: false
|
||||
- name: training_service
|
||||
type: string
|
||||
default: unknown
|
||||
|
||||
steps:
|
||||
|
||||
|
@ -16,11 +16,16 @@ steps:
|
|||
condition: and(failed(), not(contains(variables['Agent.OS'], 'Windows')))
|
||||
displayName: (failed) (POSIX) Latest experiment directory
|
||||
|
||||
- script: |
|
||||
cp -r /tmp/$USER/nni ${EXPERIMENT_DIR}/local && echo "Copy successful" || echo "Copy failed"
|
||||
condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), not(contains(variables['Agent.OS'], 'Windows')))
|
||||
displayName: (failed) (POSIX) Harvest GPU scheduler logs
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
export EXPERIMENT_ID=$(echo ${EXPERIMENT_DIR} | sed -e 's/\/.*\///g')
|
||||
sudo docker cp $(Build.BuildId):/tmp/nni-experiments/${EXPERIMENT_ID} ${EXPERIMENT_DIR}/remote && echo "Copy successful" || echo "Copy failed"
|
||||
condition: and(variables['experiment_dir'], ${{ parameters.remote }}, not(contains(variables['Agent.OS'], 'Windows')))
|
||||
condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), not(contains(variables['Agent.OS'], 'Windows')))
|
||||
displayName: (failed) (POSIX) Harvest remote trial logs
|
||||
|
||||
- powershell: |
|
||||
|
@ -30,6 +35,21 @@ steps:
|
|||
condition: and(failed(), contains(variables['Agent.OS'], 'Windows'))
|
||||
displayName: (failed) (Windows) Latest experiment directory
|
||||
|
||||
- powershell: |
|
||||
$latestDir = Get-Item $(experiment_dir)
|
||||
$tmpPath = "${env:Temp}\${env:UserName}\nni"
|
||||
$destPath = "${latestDir}\local"
|
||||
|
||||
if (Test-Path $tmpPath) {
|
||||
Write-Host "Copying $tmpPath to $destPath"
|
||||
Copy-Item $tmpPath -Destination $destPath -Recurse
|
||||
}
|
||||
else {
|
||||
Write-host "$tmpPath doesn't exist"
|
||||
}
|
||||
condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), contains(variables['Agent.OS'], 'Windows'))
|
||||
displayName: (failed) (Windows) Harvest GPU scheduler logs
|
||||
|
||||
- powershell: |
|
||||
$latestDir = Get-Item $(experiment_dir)
|
||||
$experimentId = $latestDir.name
|
||||
|
@ -43,7 +63,7 @@ steps:
|
|||
else {
|
||||
Write-host "$remotePath doesn't exist"
|
||||
}
|
||||
condition: and(variables['experiment_dir'], ${{ parameters.remote }}, contains(variables['Agent.OS'], 'Windows'))
|
||||
condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), contains(variables['Agent.OS'], 'Windows'))
|
||||
displayName: (failed) (Windows) Harvest remote trial logs
|
||||
|
||||
- publish: $(experiment_dir)
|
||||
|
|
|
@ -53,8 +53,9 @@ def test_kill_process_slow_no_patience():
|
|||
start_time = time.time()
|
||||
kill_command(process.pid, timeout=1) # didn't wait long enough
|
||||
end_time = time.time()
|
||||
if sys.platform == 'linux': # FIXME: on non-linux, seems that the time of termination can't be controlled
|
||||
assert 0.5 < end_time - start_time < 2
|
||||
if sys.platform == 'linux':
|
||||
# There was assert 0.5 < end_time - start_time. It's not stable.
|
||||
assert end_time - start_time < 2
|
||||
assert process.poll() is None
|
||||
assert _check_pid_running(process.pid)
|
||||
else:
|
||||
|
@ -73,8 +74,7 @@ def test_kill_process_slow_patiently():
|
|||
kill_command(process.pid, timeout=3) # wait long enough
|
||||
end_time = time.time()
|
||||
assert end_time - start_time < 5
|
||||
if sys.platform == 'linux':
|
||||
assert end_time - start_time > 1 # I don't know why windows is super fast
|
||||
# assert end_time - start_time > 1 # This check is disabled because it's not stable
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform != 'linux', reason='Signal issues on non-linux.')
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
"type": "azure-arm",
|
||||
|
||||
"client_id": "<client_id>",
|
||||
"client_secret": "<client_secret>",
|
||||
"subscription_id": "<subscription_id>",
|
||||
|
||||
"managed_image_name": "<managed_image_name>",
|
||||
"managed_image_resource_group_name": "<resource_group>",
|
||||
|
@ -20,7 +22,7 @@
|
|||
"gallery_name": "<gallery_name>",
|
||||
"image_name": "<image_name>",
|
||||
"image_version": "<image_version>",
|
||||
"replication_regions": ["southeastasia", "westus2"],
|
||||
"replication_regions": ["southeastasia", "westus2", "eastus"],
|
||||
"storage_account_type": "Standard_LRS"
|
||||
},
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
"type": "azure-arm",
|
||||
|
||||
"client_id": "<client_id>",
|
||||
"client_secret": "<client_secret>",
|
||||
"subscription_id": "<subscription_id>",
|
||||
|
||||
"managed_image_name": "<managed_image_name>",
|
||||
"managed_image_resource_group_name": "<resource_group>",
|
||||
|
@ -18,7 +20,7 @@
|
|||
"gallery_name": "<gallery_name>",
|
||||
"image_name": "<image_name>",
|
||||
"image_version": "<image_version>",
|
||||
"replication_regions": ["southeastasia", "westus2"],
|
||||
"replication_regions": ["southeastasia", "westus2", "eastus"],
|
||||
"storage_account_type": "Standard_LRS"
|
||||
},
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Disable the periodical apt-get upgrade, as it will break the GPU driver.
|
||||
|
||||
sed -i -e "s/Update-Package-Lists \"1\"/Update-Package-Lists \"0\"/g" /etc/apt/apt.conf.d/10periodic
|
||||
sed -i -e "s/Update-Package-Lists \"1\"/Update-Package-Lists \"0\"/g" /etc/apt/apt.conf.d/20auto-upgrades
|
||||
sed -i -e "s/Unattended-Upgrade \"1\"/Unattended-Upgrade \"0\"/g" /etc/apt/apt.conf.d/20auto-upgrades
|
||||
systemctl disable apt-daily.timer
|
||||
systemctl disable apt-daily.service
|
||||
systemctl disable apt-daily-upgrade.timer
|
||||
systemctl disable apt-daily-upgrade.service
|
||||
|
||||
# In case the trick above doesn't work, try to uncomment the following lines.
|
||||
# References: https://gist.github.com/posilva/1cefb5bf1eeccf9382920e5d57a4b3fe
|
||||
|
||||
# apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades
|
||||
|
||||
# systemctl kill --kill-who=all apt-daily.service
|
||||
# systemctl kill --kill-who=all apt-daily-upgrade.service
|
||||
|
||||
# systemctl stop apt-daily.timer
|
||||
# systemctl disable apt-daily.timer
|
||||
# systemctl stop apt-daily.service
|
||||
# systemctl disable apt-daily.service
|
||||
|
||||
# systemctl stop apt-daily-upgrade.timer
|
||||
# systemctl disable apt-daily-upgrade.timer
|
||||
# systemctl stop apt-daily-upgrade.service
|
||||
# systemctl disable apt-daily-upgrade.service
|
||||
# systemctl daemon-reload
|
||||
# systemctl reset-failed
|
||||
|
||||
# rm /etc/systemd/system/timers.target.wants/apt-daily.timer
|
||||
# rm /etc/systemd/system/timers.target.wants/apt-daily-upgrade.timer
|
||||
|
||||
# mv /usr/lib/apt/apt.systemd.daily /usr/lib/apt/apt.systemd.daily.DISABLED
|
||||
# mv /lib/systemd/system/apt-daily.service /lib/systemd/system/apt-daily.service.DISABLED
|
||||
# mv /lib/systemd/system/apt-daily.timer /lib/systemd/system/apt-daily.timer.DISABLED
|
||||
# mv /lib/systemd/system/apt-daily-upgrade.service /lib/systemd/system/apt-daily-upgrade.service.DISABLED
|
||||
# mv /lib/systemd/system/apt-daily-upgrade.timer /lib/systemd/system/apt-daily-upgrade.timer.DISABLED
|
Загрузка…
Ссылка в новой задаче