This commit is contained in:
SparkSnail 2021-01-04 10:38:44 +08:00 коммит произвёл GitHub
Родитель ae50ed1466
Коммит fb26187de0
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 162 добавлений и 4 удалений

Просмотреть файл

@ -0,0 +1,63 @@
# Nightly integration test for the NNI "adl" training service.
# CI and PR triggers are disabled; the pipeline only runs on the schedule below
# (Azure Pipelines evaluates cron schedules in UTC).
trigger: none
pr: none

schedules:
- cron: '0 16 * * *'
  branches:
    include: [ master ]

jobs:
- job: adl
  pool: NNI CI KUBE CLI
  timeoutInMinutes: 120

  steps:
  # Derive a timestamped NNI_RELEASE, publish it (and an extended PATH) to the
  # following steps via ##vso logging commands, then upgrade pip/setuptools.
  - script: |
      export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
      echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
      echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
      echo "Working directory: ${PWD}"
      echo "NNI version: ${NNI_RELEASE}"
      echo "Build docker image: $(build_docker_image)"
      python3 -m pip install --upgrade pip setuptools
    displayName: Prepare

  # Build the wheel for the release number set above and install it with the
  # SMAC and BOHB extras.
  - script: |
      set -e
      python3 setup.py build_ts
      python3 setup.py bdist_wheel -p manylinux1_x86_64
      python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl[SMAC,BOHB]
    displayName: Build and install NNI

  - script: |
      set -e
      cd examples/tuners/customized_tuner
      python3 setup.py develop --user
      nnictl algo register --meta meta_file.yml
    displayName: Install customized tuner

  # Only runs when the pipeline variable build_docker_image is 'true'.
  # The Docker Hub password is mapped into the step environment and piped to
  # `docker login --password-stdin`, so it is not exposed on the command line.
  # The sed commands append extra layers (adaptdl/tensorboard, examples, test)
  # to the Dockerfile before the nightly image is built and pushed.
  - script: |
      set -e
      echo "${DOCKER_HUB_PASSWORD}" | docker login -u nnidev --password-stdin
      sed -i '$a RUN python3 -m pip install adaptdl tensorboard' Dockerfile
      sed -i '$a COPY examples /examples' Dockerfile
      sed -i '$a COPY test /test' Dockerfile
      echo '## Build docker image ##'
      docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly .
      echo '## Upload docker image ##'
      docker push nnidev/nni-nightly
    condition: eq(variables['build_docker_image'], 'true')
    env:
      DOCKER_HUB_PASSWORD: $(docker_hub_password)
    displayName: Build and upload docker image

  # Generate the adl training-service config from pipeline variables, then run
  # the integration test suite against the adl training service.
  - script: |
      set -e
      cd test
      python3 nni_test/nnitest/generate_ts_config.py \
          --ts adl \
          --nni_docker_image nnidev/nni-nightly \
          --checkpoint_storage_class $(checkpoint_storage_class) \
          --checkpoint_storage_size $(checkpoint_storage_size) \
          --nni_manager_ip $(nni_manager_ip)
      python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts adl
    displayName: Integration test

Просмотреть файл

@ -0,0 +1,23 @@
# Experiment config for the CIFAR-10 PyTorch example on the adl training service.
authorName: nni
experimentName: default_test
maxExecDuration: 15m
maxTrialNum: 1
trialConcurrency: 1
searchSpacePath: ./cifar10_adl_search_space.json

tuner:
  builtinTunerName: Random
assessor:
  builtinAssessorName: Medianstop
  classArgs:
    optimize_mode: maximize
trial:
  # Absolute path: the CI pipeline copies examples/ to /examples in the image.
  codeDir: /examples/trials/cifar10_pytorch
  command: python3 main_adl.py --epochs 1
  gpuNum: 1

useAnnotation: false
multiPhase: false
multiThread: false

trainingServicePlatform: adl

Просмотреть файл

@ -0,0 +1,5 @@
{
"lr":{"_type":"choice", "_value":[0.1, 0.01, 0.001]},
"bs":{"_type":"choice","_value":[64, 96, 128]},
"model":{"_type":"choice", "_value":["ResNet18", "SENet18", "MobileNet"]}
}

Просмотреть файл

@ -75,6 +75,10 @@ testCases:
command: python3 main.py --epochs 1 --batches 1
gpuNum: 0
- name: cifar10-pytorch-adl
configFile: test/config/examples/cifar10-pytorch-adl.yml
trainingService: adl
#- name: nested-ss
# configFile: test/config/examples/mnist-nested-search-space.yml

Просмотреть файл

@ -52,6 +52,10 @@ testCases:
command: python3 main.py --epochs 1 --batches 1
gpuNum: 0
- name: cifar10-pytorch-adl
configFile: test/config/examples/cifar10-pytorch-adl.yml
trainingService: adl
- name: classic-nas-gen-ss
configFile: test/config/examples/classic-nas-tf2.yml
launchCommand: nnictl ss_gen --trial_command="python3 train.py --epochs 1" --trial_dir=../examples/nas/classic_nas-tf --file=config/examples/nni-nas-search-space-tf2.json

Просмотреть файл

@ -103,3 +103,22 @@ remote:
port:
username:
trainingServicePlatform: remote
# Template for the adl training service. Values left empty here are
# placeholders; generate_ts_config.py fills trial.image and the
# trial.checkpoint.* fields from its command-line arguments.
adl:
  maxExecDuration: 15m
  nniManagerIp:
  # use a small trial number to make IT faster
  maxTrialNum: 2
  trialConcurrency: 2
  trial:
    namespace: default
    command:
    codeDir:
    gpuNum: 1
    cpuNum: 1
    image:
    memorySize: 1Gi
    checkpoint:
      storageClass:
      storageSize:
  trainingServicePlatform: adl

Просмотреть файл

@ -88,13 +88,28 @@ def update_training_service_config(args):
config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
if args.remote_reuse is not None:
config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
elif args.ts == 'adl':
if args.nni_docker_image is not None:
config[args.ts]['trial']['image'] = args.nni_docker_image
if args.checkpoint_storage_class is not None:
config[args.ts]['trial']['checkpoint']['storageClass'] = args.checkpoint_storage_class
if args.checkpoint_storage_size is not None:
config[args.ts]['trial']['checkpoint']['storageSize'] = args.checkpoint_storage_size
if args.adaptive is not None:
config[args.ts]['trial']['adaptive'] = args.adaptive
if args.adl_nfs_server is not None and args.adl_nfs_path is not None and args.adl_nfs_container_mount_path is not None:
# default keys in nfs is empty, need to initialize
config[args.ts]['trial']['nfs'] = {}
config[args.ts]['trial']['nfs']['server'] = args.adl_nfs_server
config[args.ts]['trial']['nfs']['path'] = args.adl_nfs_path
config[args.ts]['trial']['nfs']['container_mount_path'] = args.nadl_fs_container_mount_path
dump_yml_content(TRAINING_SERVICE_FILE, config)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller', 'adl'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
@ -122,6 +137,13 @@ if __name__ == '__main__':
parser.add_argument("--remote_host", type=str)
parser.add_argument("--remote_port", type=int)
parser.add_argument("--remote_reuse", type=str)
# args for adl
parser.add_argument("--checkpoint_storage_class", type=str)
parser.add_argument("--checkpoint_storage_size", type=str)
parser.add_argument("--adaptive", type=str)
parser.add_argument("--adl_nfs_server", type=str)
parser.add_argument("--adl_nfs_path", type=str)
parser.add_argument("--adl_nfs_container_mount_path", type=str)
args = parser.parse_args()
update_training_service_config(args)

Просмотреть файл

@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables = {}
def update_training_service_config(config, training_service):
def update_training_service_config(config, training_service, config_file_path):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
# hack for kubeflow trial config
@ -38,6 +38,20 @@ def update_training_service_config(config, training_service):
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'adl':
    # hack for adl trial config, codeDir in adl mode refers to path in container
    code_dir = config['trial']['codeDir']
    # Default: keep the configured codeDir unchanged.
    container_code_dir = code_dir
    if code_dir == '.':
        # '.' maps to the directory of the test case's config file.
        # os.path.dirname also copes with a path that has no '/', where the
        # previous rfind-based slice dropped the last character.
        container_code_dir = '/' + os.path.dirname(config_file_path)
    elif code_dir == '../naive_trial':
        # replace metric test folders to container folder
        container_code_dir = '/test/config/naive_trial'
    elif '../../../' in code_dir:
        # replace example folders to container folder
        container_code_dir = code_dir.replace('../../../', '/')
    adl_trial = it_ts_config[training_service]['trial']
    adl_trial['codeDir'] = container_code_dir
    # The trial command is prefixed with a cd into the container code dir.
    adl_trial['command'] = 'cd {0} && {1}'.format(container_code_dir, config['trial']['command'])
deep_update(config, it_ts_config['all'])
deep_update(config, it_ts_config[training_service])
@ -58,7 +72,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config(test_yml_config, args.ts)
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
# generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp'
@ -249,6 +263,10 @@ def run(args):
wait_for_port_available(8080, 180)
else:
wait_for_port_available(8080, 30)
# adl mode need more time to cleanup PVC
if args.ts == 'adl' and name == 'nnictl-resume-2':
time.sleep(30)
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
begin_time = time.time()
@ -263,7 +281,7 @@ if __name__ == '__main__':
parser.add_argument("--cases", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai',
'kubeflow', 'frameworkcontroller'], default='local')
'kubeflow', 'frameworkcontroller', 'adl'], default='local')
args = parser.parse_args()
run(args)