зеркало из https://github.com/microsoft/nni.git
Support adl pipeline (#3233)
This commit is contained in:
Родитель
ae50ed1466
Коммит
fb26187de0
|
@ -0,0 +1,63 @@
|
|||
trigger: none
|
||||
pr: none
|
||||
schedules:
|
||||
- cron: 0 16 * * *
|
||||
branches:
|
||||
include: [ master ]
|
||||
|
||||
jobs:
|
||||
- job: adl
|
||||
pool: NNI CI KUBE CLI
|
||||
timeoutInMinutes: 120
|
||||
|
||||
steps:
|
||||
- script: |
|
||||
export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
|
||||
echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
|
||||
echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
|
||||
|
||||
echo "Working directory: ${PWD}"
|
||||
echo "NNI version: ${NNI_RELEASE}"
|
||||
echo "Build docker image: $(build_docker_image)"
|
||||
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
displayName: Prepare
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
python3 setup.py build_ts
|
||||
python3 setup.py bdist_wheel -p manylinux1_x86_64
|
||||
python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl[SMAC,BOHB]
|
||||
displayName: Build and install NNI
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
cd examples/tuners/customized_tuner
|
||||
python3 setup.py develop --user
|
||||
nnictl algo register --meta meta_file.yml
|
||||
displayName: Install customized tuner
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
docker login -u nnidev -p $(docker_hub_password)
|
||||
sed -i '$a RUN python3 -m pip install adaptdl tensorboard' Dockerfile
|
||||
sed -i '$a COPY examples /examples' Dockerfile
|
||||
sed -i '$a COPY test /test' Dockerfile
|
||||
echo '## Build docker image ##'
|
||||
docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly .
|
||||
echo '## Upload docker image ##'
|
||||
docker push nnidev/nni-nightly
|
||||
condition: eq(variables['build_docker_image'], 'true')
|
||||
displayName: Build and upload docker image
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
cd test
|
||||
python3 nni_test/nnitest/generate_ts_config.py \
|
||||
--ts adl \
|
||||
--nni_docker_image nnidev/nni-nightly \
|
||||
--checkpoint_storage_class $(checkpoint_storage_class) \
|
||||
--checkpoint_storage_size $(checkpoint_storage_size) \
|
||||
--nni_manager_ip $(nni_manager_ip)
|
||||
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts adl
|
||||
displayName: Integration test
|
|
@ -0,0 +1,23 @@
|
|||
authorName: nni
|
||||
experimentName: default_test
|
||||
maxExecDuration: 15m
|
||||
maxTrialNum: 1
|
||||
trialConcurrency: 1
|
||||
searchSpacePath: ./cifar10_adl_search_space.json
|
||||
|
||||
tuner:
|
||||
builtinTunerName: Random
|
||||
assessor:
|
||||
builtinAssessorName: Medianstop
|
||||
classArgs:
|
||||
optimize_mode: maximize
|
||||
trial:
|
||||
codeDir: /examples/trials/cifar10_pytorch
|
||||
command: python3 main_adl.py --epochs 1
|
||||
gpuNum: 1
|
||||
|
||||
useAnnotation: false
|
||||
multiPhase: false
|
||||
multiThread: false
|
||||
|
||||
trainingServicePlatform: adl
|
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"lr":{"_type":"choice", "_value":[0.1, 0.01, 0.001]},
|
||||
"bs":{"_type":"choice","_value":[64, 96, 128]},
|
||||
"model":{"_type":"choice", "_value":["ResNet18", "SENet18", "MobileNet"]}
|
||||
}
|
|
@ -75,6 +75,10 @@ testCases:
|
|||
command: python3 main.py --epochs 1 --batches 1
|
||||
gpuNum: 0
|
||||
|
||||
- name: cifar10-pytorch-adl
|
||||
configFile: test/config/examples/cifar10-pytorch-adl.yml
|
||||
trainingService: adl
|
||||
|
||||
#- name: nested-ss
|
||||
# configFile: test/config/examples/mnist-nested-search-space.yml
|
||||
|
||||
|
|
|
@ -52,6 +52,10 @@ testCases:
|
|||
command: python3 main.py --epochs 1 --batches 1
|
||||
gpuNum: 0
|
||||
|
||||
- name: cifar10-pytorch-adl
|
||||
configFile: test/config/examples/cifar10-pytorch-adl.yml
|
||||
trainingService: adl
|
||||
|
||||
- name: classic-nas-gen-ss
|
||||
configFile: test/config/examples/classic-nas-tf2.yml
|
||||
launchCommand: nnictl ss_gen --trial_command="python3 train.py --epochs 1" --trial_dir=../examples/nas/classic_nas-tf --file=config/examples/nni-nas-search-space-tf2.json
|
||||
|
|
|
@ -103,3 +103,22 @@ remote:
|
|||
port:
|
||||
username:
|
||||
trainingServicePlatform: remote
|
||||
adl:
|
||||
maxExecDuration: 15m
|
||||
nniManagerIp:
|
||||
# use a small trial number to make IT faster
|
||||
maxTrialNum: 2
|
||||
trialConcurrency: 2
|
||||
trial:
|
||||
namespace: default
|
||||
command:
|
||||
codeDir:
|
||||
gpuNum: 1
|
||||
cpuNum: 1
|
||||
image:
|
||||
memorySize: 1Gi
|
||||
checkpoint:
|
||||
storageClass:
|
||||
storageSize:
|
||||
trainingServicePlatform: adl
|
||||
|
||||
|
|
|
@ -88,13 +88,28 @@ def update_training_service_config(args):
|
|||
config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
|
||||
if args.remote_reuse is not None:
|
||||
config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
|
||||
elif args.ts == 'adl':
|
||||
if args.nni_docker_image is not None:
|
||||
config[args.ts]['trial']['image'] = args.nni_docker_image
|
||||
if args.checkpoint_storage_class is not None:
|
||||
config[args.ts]['trial']['checkpoint']['storageClass'] = args.checkpoint_storage_class
|
||||
if args.checkpoint_storage_size is not None:
|
||||
config[args.ts]['trial']['checkpoint']['storageSize'] = args.checkpoint_storage_size
|
||||
if args.adaptive is not None:
|
||||
config[args.ts]['trial']['adaptive'] = args.adaptive
|
||||
if args.adl_nfs_server is not None and args.adl_nfs_path is not None and args.adl_nfs_container_mount_path is not None:
|
||||
# the nfs section has no default keys, so it must be initialized first
|
||||
config[args.ts]['trial']['nfs'] = {}
|
||||
config[args.ts]['trial']['nfs']['server'] = args.adl_nfs_server
|
||||
config[args.ts]['trial']['nfs']['path'] = args.adl_nfs_path
|
||||
# Read the attribute argparse creates for --adl_nfs_container_mount_path (the same one the guard above checks).
config[args.ts]['trial']['nfs']['container_mount_path'] = args.adl_nfs_container_mount_path
|
||||
|
||||
dump_yml_content(TRAINING_SERVICE_FILE, config)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
|
||||
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller', 'adl'], default='pai')
|
||||
parser.add_argument("--nni_docker_image", type=str)
|
||||
parser.add_argument("--nni_manager_ip", type=str)
|
||||
# args for PAI
|
||||
|
@ -122,6 +137,13 @@ if __name__ == '__main__':
|
|||
parser.add_argument("--remote_host", type=str)
|
||||
parser.add_argument("--remote_port", type=int)
|
||||
parser.add_argument("--remote_reuse", type=str)
|
||||
# args for adl
|
||||
parser.add_argument("--checkpoint_storage_class", type=str)
|
||||
parser.add_argument("--checkpoint_storage_size", type=str)
|
||||
parser.add_argument("--adaptive", type=str)
|
||||
parser.add_argument("--adl_nfs_server", type=str)
|
||||
parser.add_argument("--adl_nfs_path", type=str)
|
||||
parser.add_argument("--adl_nfs_container_mount_path", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
update_training_service_config(args)
|
||||
|
|
|
@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
|
|||
it_variables = {}
|
||||
|
||||
|
||||
def update_training_service_config(config, training_service):
|
||||
def update_training_service_config(config, training_service, config_file_path):
|
||||
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
|
||||
|
||||
# hack for kubeflow trial config
|
||||
|
@ -38,6 +38,20 @@ def update_training_service_config(config, training_service):
|
|||
config['trial'].pop('command')
|
||||
if 'gpuNum' in config['trial']:
|
||||
config['trial'].pop('gpuNum')
|
||||
|
||||
if training_service == 'adl':
|
||||
# hack for adl trial config, codeDir in adl mode refers to path in container
|
||||
containerCodeDir = config['trial']['codeDir']
|
||||
# replace metric test folders to container folder
|
||||
if config['trial']['codeDir'] == '.':
|
||||
containerCodeDir = '/' + config_file_path[:config_file_path.rfind('/')]
|
||||
elif config['trial']['codeDir'] == '../naive_trial':
|
||||
containerCodeDir = '/test/config/naive_trial'
|
||||
elif '../../../' in config['trial']['codeDir']:
|
||||
# replace example folders to container folder
|
||||
containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
|
||||
it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
|
||||
it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
|
||||
|
||||
deep_update(config, it_ts_config['all'])
|
||||
deep_update(config, it_ts_config[training_service])
|
||||
|
@ -58,7 +72,7 @@ def prepare_config_file(test_case_config, it_config, args):
|
|||
# apply training service config
|
||||
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
|
||||
# the hack for kubeflow should be applied at last step
|
||||
update_training_service_config(test_yml_config, args.ts)
|
||||
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
|
||||
|
||||
# generate temporary config yml file to launch experiment
|
||||
new_config_file = config_path + '.tmp'
|
||||
|
@ -249,6 +263,10 @@ def run(args):
|
|||
wait_for_port_available(8080, 180)
|
||||
else:
|
||||
wait_for_port_available(8080, 30)
|
||||
|
||||
# adl mode needs more time to clean up the PVC
|
||||
if args.ts == 'adl' and name == 'nnictl-resume-2':
|
||||
time.sleep(30)
|
||||
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
|
||||
begin_time = time.time()
|
||||
|
||||
|
@ -263,7 +281,7 @@ if __name__ == '__main__':
|
|||
parser.add_argument("--cases", type=str, default=None)
|
||||
parser.add_argument("--exclude", type=str, default=None)
|
||||
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai',
|
||||
'kubeflow', 'frameworkcontroller'], default='local')
|
||||
'kubeflow', 'frameworkcontroller', 'adl'], default='local')
|
||||
args = parser.parse_args()
|
||||
|
||||
run(args)
|
||||
|
|
Загрузка…
Ссылка в новой задаче