Add shared storage integration test (#3455)

This commit is contained in:
Ni Hao 2021-06-30 16:19:08 +08:00 коммит произвёл GitHub
Родитель 32fdd32bfe
Коммит 8f01c779ba
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 215 добавлений и 21 удалений

Просмотреть файл

@ -46,6 +46,7 @@ class CustomAlgorithmConfig(_AlgorithmConfig):
class TrainingServiceConfig(ConfigBase):
platform: str
@dataclass(init=False)
class SharedStorageConfig(ConfigBase):
storage_type: str
local_mount_point: str

Просмотреть файл

@ -83,6 +83,21 @@ jobs:
commands: python3 /tmp/nnitest/$(Build.BuildId)/test/vso_tools/start_docker.py $(NNI_RELEASE) $(Build.BuildId) $(password_in_docker)
displayName: Install NNI and run docker on Linux worker
- script: |
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse true \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip) \
--azurestoragetoken $(azureblob_token_test) \
--nfs_server $(NFS_IP)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: Integration test (reuse mode)
- script: |
cd test
python3 nni_test/nnitest/generate_ts_config.py \
@ -96,18 +111,6 @@ jobs:
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: Integration test
- script: |
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse true \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: Integration test (reuse mode)
- task: SSH@0
inputs:

Просмотреть файл

@ -34,6 +34,34 @@ testCases:
# check status of experiment before calling validator
experimentStatusCheck: True
- name: shared-storage-remote-azureblob
configFile: test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
config:
sharedStorage:
localMountPoint: /tmp/nnimount/testlocalrootpath
remoteMountPoint: /tmp/nnimount/testremoterootpath
storageAccountName: nennistorage
storageAccountKey: $(azureblob_token_test)
containerName: sharedstorage
validator:
class: FileExistValidator
kwargs:
rootpath: /tmp/nnimount/testlocalrootpath
# TODO: Enable this case after nfs server is ready
#- name: shared-storage-remote-nfs
# configFile: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
# config:
# sharedStorage:
# localMountPoint: /tmp/nnimount/testlocalrootpath
# remoteMountPoint: /tmp/nnimount/testremoterootpath
# nfsServer: $(NFS_IP)
# exportedDirectory: /home/nni/mnt/
# validator:
# class: FileExistValidator
# kwargs:
# rootpath: /tmp/nnimount/testlocalrootpath
- name: sklearn-regression
configFile: test/config/examples/sklearn-regression.yml
@ -227,4 +255,3 @@ testCases:
#########################################################################
- name: customized-tuners-demotuner
configFile: test/config/customized_tuners/demotuner-sklearn-classification.yml

Просмотреть файл

@ -0,0 +1,43 @@
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
trainingServicePlatform: remote
searchSpacePath: config_sharedstorage_search_space.json
#choice: true, false
useAnnotation: false
nniManagerIp: 127.0.0.1
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName: TPE
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
trial:
command: python3 config_sharedstorage_trial.py
codeDir: .
gpuNum: 0
sharedStorage:
storageType: AzureBlob
localMountPoint: ${your/local/mount/point}
remoteMountPoint: ${your/remote/mount/point}
storageAccountName: ${replace_to_your_storageAccountName}
storageAccountKey: ${replace_to_your_storageAccountKey}
# If you did not set storageAccountKey, you need to use `az login` with the Azure CLI first and set resourceGroupName.
# resourceGroupName: ${replace_to_your_resourceGroupName}
containerName: ${replace_to_your_containerName}
# usermount means you have already mounted this storage on localMountPoint
# nnimount means nni will try to mount this storage on localMountPoint
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
localMounted: nnimount
#machineList can be empty if the platform is local
machineList:
- ip: 10.1.1.1
username: bob
passwd: bob123
#port can be skipped if using default ssh port 22
#port: 22
remoteConfig:
reuse: true

Просмотреть файл

@ -0,0 +1,40 @@
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
trainingServicePlatform: remote
searchSpacePath: config_sharedstorage_search_space.json
#choice: true, false
useAnnotation: false
nniManagerIp: 127.0.0.1
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName: TPE
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
trial:
command: python3 config_sharedstorage_trial.py
codeDir: .
gpuNum: 0
sharedStorage:
storageType: NFS
localMountPoint: ${your/local/mount/point}
remoteMountPoint: ${your/remote/mount/point}
nfsServer: ${nfs-server-ip}
exportedDirectory: ${nfs/exported/directory}
# usermount means you have already mounted this storage on localMountPoint
# nnimount means nni will try to mount this storage on localMountPoint
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
localMounted: nnimount
#machineList can be empty if the platform is local
machineList:
- ip: 10.1.1.1
username: bob
passwd: bob123
#port can be skipped if using default ssh port 22
#port: 22
remoteConfig:
reuse: true

Просмотреть файл

@ -0,0 +1,7 @@
{
"dropout_rate":{"_type":"uniform","_value":[0.5, 0.9]},
"conv_size":{"_type":"choice","_value":[2,3,5,7]},
"hidden_size":{"_type":"choice","_value":[124, 512, 1024]},
"batch_size": {"_type":"choice", "_value": [16, 32]},
"learning_rate":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}
}

Просмотреть файл

@ -0,0 +1,24 @@
"""
A deep MNIST classifier using convolutional layers.
This file is a modification of the official pytorch mnist example:
https://github.com/pytorch/examples/blob/master/mnist/main.py
"""
import os
import logging
import nni
logger = logging.getLogger('mnist_AutoML')
if __name__ == '__main__':
try:
logger.debug(os.environ.get('NNI_OUTPUT_DIR'))
filename = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'checkingfile.txt')
f = open(filename, "a")
tuner_params = nni.get_next_parameter()
f.write(str(tuner_params))
nni.report_final_result(1)
f.close()
except Exception as exception:
logger.exception(exception)
raise

Просмотреть файл

@ -87,6 +87,9 @@ remote:
port:
username:
trainingServicePlatform: remote
sharedStorage:
storageAccountKey:
nfsServer:
hybrid:
maxExecDuration: 15m
nniManagerIp:

Просмотреть файл

@ -74,6 +74,10 @@ def update_training_service_config(args):
config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
if args.remote_reuse is not None:
config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
if args.azurestoragetoken is not None:
config[args.ts]['sharedStorage']['storageAccountKey'] = args.azurestoragetoken
if args.nfs_server is not None:
config[args.ts]['sharedStorage']['nfsServer'] = args.nfs_server
elif args.ts == 'adl':
if args.nni_docker_image is not None:
config[args.ts]['trial']['image'] = args.nni_docker_image
@ -118,6 +122,8 @@ if __name__ == '__main__':
parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
parser.add_argument("--azurestoragetoken", type=str)
parser.add_argument("--nfs_server", type=str)
# args for PAI
parser.add_argument("--pai_user", type=str)
parser.add_argument("--pai_pwd", type=str)
@ -131,7 +137,6 @@ if __name__ == '__main__':
parser.add_argument("--nni_manager_nfs_mount_path", type=str)
parser.add_argument("--container_nfs_mount_path", type=str)
# args for kubeflow and frameworkController
parser.add_argument("--nfs_server", type=str)
parser.add_argument("--nfs_path", type=str)
parser.add_argument("--keyvault_vaultname", type=str)
parser.add_argument("--keyvault_name", type=str)

Просмотреть файл

@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables = {}
def update_training_service_config(config, training_service, config_file_path):
def update_training_service_config(config, training_service, config_file_path, nni_source_dir):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
# hack for kubeflow trial config
@ -38,7 +38,7 @@ def update_training_service_config(config, training_service, config_file_path):
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'adl':
# hack for adl trial config, codeDir in adl mode refers to path in container
containerCodeDir = config['trial']['codeDir']
@ -52,6 +52,18 @@ def update_training_service_config(config, training_service, config_file_path):
containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
if training_service == 'remote':
testcase_config = get_yml_content(nni_source_dir + config_file_path)
sharedStorage = testcase_config.get('sharedStorage')
if sharedStorage is None:
it_ts_config[training_service].pop('sharedStorage')
elif str(sharedStorage.get('storageType')).lower() == 'nfs':
it_ts_config[training_service].get('sharedStorage').pop('storageAccountKey')
elif str(sharedStorage.get('storageType')).lower() == 'azureblob':
it_ts_config[training_service].get('sharedStorage').pop('nfsServer')
else:
it_ts_config[training_service].pop('sharedStorage')
if training_service == 'hybrid':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
@ -75,7 +87,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir)
# generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp'
@ -238,6 +250,15 @@ def match_training_service(test_case_config, cur_training_service):
return True
return False
def match_remoteConfig(test_case_config, nni_source_dir):
    """Return True when the test case's remote ``reuse`` flag (if any) agrees
    with the reuse mode configured in ``config/training_service.yml``.

    A test case that does not pin ``remoteConfig.reuse`` matches either mode.
    """
    ts_yaml = get_yml_content(os.path.join('config', 'training_service.yml'))
    expected_reuse = str(ts_yaml['remote']['remoteConfig']['reuse']).lower()

    case_yaml = get_yml_content(nni_source_dir + test_case_config['configFile'])
    remote_section = case_yaml.get('remoteConfig')
    requested_reuse = remote_section.get('reuse') if remote_section is not None else None
    if requested_reuse is None:
        # no preference declared by the test case -> run under any reuse mode
        return True
    return str(requested_reuse).lower() == expected_reuse
def run(args):
it_config = get_yml_content(args.config)
@ -264,8 +285,13 @@ def run(args):
print('skipped {}, training service {} not match [{}]'.format(
name, args.ts, test_case_config['trainingService']))
continue
# remote mode need more time to cleanup
if args.ts == 'remote' or args.ts == 'hybrid':
if args.ts == 'remote':
if not match_remoteConfig(test_case_config, args.nni_source_dir):
print('skipped {}, remoteConfig not match.'.format(name))
continue
wait_for_port_available(8080, 240)
else:
wait_for_port_available(8080, 60)

Просмотреть файл

@ -97,3 +97,17 @@ class NnicliValidator(ITValidator):
print(exp.get_job_statistics())
print(exp.get_experiment_status())
print(exp.list_trial_jobs())
class FileExistValidator(ITValidator):
    """Asserts that every trial wrote its shared-storage marker file.

    For each trial reported by the metrics REST endpoint, checks that
    ``<rootpath>/nni/<exp_id>/trials/<trial_id>/nnioutput/checkingfile.txt``
    exists on the locally mounted shared storage.
    """

    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        print(rest_endpoint)
        # the experiment id is the last path component of the experiment dir
        exp_id = osp.split(experiment_dir)[-1]
        rootpath = kwargs.get('rootpath')
        for record in requests.get(METRICS_URL).json():
            trial = record['trialJobId']
            marker = osp.join(rootpath, 'nni', exp_id, 'trials', trial,
                              'nnioutput', 'checkingfile.txt')
            print('Checking shared storage log exists on trial ', trial)
            assert osp.exists(marker)

Просмотреть файл

@ -25,8 +25,9 @@ container = sys.argv[2]
password = sys.argv[3]
run_command(f'docker build --build-arg NNI_RELEASE={version} -t nnidev/nni-nightly .')
run_command(f'docker run -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
run_command(f'docker run --privileged -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
run_command(f'docker exec {container} useradd --create-home --password {password} nni')
run_command(['docker', 'exec', container, 'bash', '-c', f'echo "nni:{password}" | chpasswd'])
run_command(['docker', 'exec', container, 'bash', '-c', 'echo "nni ALL=(ALL:ALL) NOPASSWD:ALL" >> /etc/sudoers'])
run_command(f'docker exec {container} service ssh start')
set_variable('docker_port', port)

Просмотреть файл

@ -34,13 +34,13 @@ fi
id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g)
version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g)
if [ $id = "Ubuntu" ]
if [ "$id" = "Ubuntu" ]
then
wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb
sudo dpkg -i packages-microsoft-prod.deb
sudo DEBIAN_FRONTEND=noninteractive dpkg -i packages-microsoft-prod.deb
sudo apt-get update
sudo apt-get install -y blobfuse fuse
elif [ $id = "CentOS" ] || [ $id = "RHEL" ]
elif [ "$id" = "CentOS" ] || [ "$id" = "RHEL" ]
then
sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm
sudo yum install -y blobfuse fuse