From 8f01c779baea536f7dcd86370ef4b2265c5e660b Mon Sep 17 00:00:00 2001
From: Ni Hao
Date: Wed, 30 Jun 2021 16:19:08 +0800
Subject: [PATCH] Add shared storage integration test (#3455)

---
 nni/experiment/config/common.py               |  1 +
 pipelines/integration-test-remote-l2l.yml     | 27 ++++++------
 test/config/integration_tests.yml             | 29 ++++++++++++-
 .../config_sharedstorage_remote_azureblob.yml | 43 +++++++++++++++++++
 .../config_sharedstorage_remote_nfs.yml       | 40 +++++++++++++++++
 .../config_sharedstorage_search_space.json    |  7 +++
 .../config_sharedstorage_trial.py             | 24 +++++++++++
 test/config/training_service.yml              |  3 ++
 test/nni_test/nnitest/generate_ts_config.py   |  7 ++-
 test/nni_test/nnitest/run_tests.py            | 32 ++++++++++++--
 test/nni_test/nnitest/validators.py           | 14 ++++++
 test/vso_tools/start_docker.py                |  3 +-
 .../azureblobStorageService.ts                |  6 +--
 13 files changed, 215 insertions(+), 21 deletions(-)
 create mode 100644 test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
 create mode 100644 test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
 create mode 100644 test/config/sharedstorage_test/config_sharedstorage_search_space.json
 create mode 100644 test/config/sharedstorage_test/config_sharedstorage_trial.py

diff --git a/nni/experiment/config/common.py b/nni/experiment/config/common.py
index c7cd64a7d..7bcad43c5 100644
--- a/nni/experiment/config/common.py
+++ b/nni/experiment/config/common.py
@@ -46,6 +46,7 @@ class CustomAlgorithmConfig(_AlgorithmConfig):
 class TrainingServiceConfig(ConfigBase):
     platform: str
 
+@dataclass(init=False)
 class SharedStorageConfig(ConfigBase):
     storage_type: str
     local_mount_point: str
diff --git a/pipelines/integration-test-remote-l2l.yml b/pipelines/integration-test-remote-l2l.yml
index baf9f479d..fb417bc1b 100644
--- a/pipelines/integration-test-remote-l2l.yml
+++ b/pipelines/integration-test-remote-l2l.yml
@@ -83,6 +83,21 @@ jobs:
       commands: python3 /tmp/nnitest/$(Build.BuildId)/test/vso_tools/start_docker.py $(NNI_RELEASE) $(Build.BuildId) $(password_in_docker)
     displayName: Install NNI and run docker on Linux worker
 
+  - script: |
+      cd test
+      python3 nni_test/nnitest/generate_ts_config.py \
+        --ts remote \
+        --remote_reuse true \
+        --remote_user nni \
+        --remote_host $(worker_ip) \
+        --remote_port $(docker_port) \
+        --remote_pwd $(password_in_docker) \
+        --nni_manager_ip $(manager_ip) \
+        --azurestoragetoken $(azureblob_token_test) \
+        --nfs_server $(NFS_IP)
+      python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
+    displayName: Integration test (reuse mode)
+
   - script: |
       cd test
       python3 nni_test/nnitest/generate_ts_config.py \
@@ -96,18 +111,6 @@ jobs:
       python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
     displayName: Integration test
 
-  - script: |
-      cd test
-      python3 nni_test/nnitest/generate_ts_config.py \
-        --ts remote \
-        --remote_reuse true \
-        --remote_user nni \
-        --remote_host $(worker_ip) \
-        --remote_port $(docker_port) \
-        --remote_pwd $(password_in_docker) \
-        --nni_manager_ip $(manager_ip)
-      python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
-    displayName: Integration test (reuse mode)
 
   - task: SSH@0
     inputs:
diff --git a/test/config/integration_tests.yml b/test/config/integration_tests.yml
index 0476f6d92..61d2abe27 100644
--- a/test/config/integration_tests.yml
+++ b/test/config/integration_tests.yml
@@ -34,6 +34,34 @@ testCases:
   # check status of experiment before calling validator
   experimentStatusCheck: True
 
+- name: shared-storage-remote-azureblob
+  configFile: test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
+  config:
+    sharedStorage:
+      localMountPoint: /tmp/nnimount/testlocalrootpath
+      remoteMountPoint: /tmp/nnimount/testremoterootpath
+      storageAccountName: nennistorage
+      storageAccountKey: $(azureblob_token_test)
+      containerName: sharedstorage
+  validator:
+    class: FileExistValidator
+    kwargs:
+      rootpath: /tmp/nnimount/testlocalrootpath
+
+# TODO: Enable this case after the NFS server is ready
+#- name: shared-storage-remote-nfs
+#  configFile: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
+#  config:
+#    sharedStorage:
+#      localMountPoint: /tmp/nnimount/testlocalrootpath
+#      remoteMountPoint: /tmp/nnimount/testremoterootpath
+#      nfsServer: $(NFS_IP)
+#      exportedDirectory: /home/nni/mnt/
+#  validator:
+#    class: FileExistValidator
+#    kwargs:
+#      rootpath: /tmp/nnimount/testlocalrootpath
 
 - name: sklearn-regression
   configFile: test/config/examples/sklearn-regression.yml
 
@@ -227,4 +255,3 @@
 #########################################################################
 - name: customized-tuners-demotuner
   configFile: test/config/customized_tuners/demotuner-sklearn-classification.yml
-
diff --git a/test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml b/test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
new file mode 100644
index 000000000..20043172f
--- /dev/null
+++ b/test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
@@ -0,0 +1,43 @@
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 1
+trainingServicePlatform: remote
+searchSpacePath: config_sharedstorage_search_space.json
+#choice: true, false
+useAnnotation: false
+nniManagerIp: 127.0.0.1
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 config_sharedstorage_trial.py
+  codeDir: .
+  gpuNum: 0
+sharedStorage:
+  storageType: AzureBlob
+  localMountPoint: ${your/local/mount/point}
+  remoteMountPoint: ${your/remote/mount/point}
+  storageAccountName: ${replace_to_your_storageAccountName}
+  storageAccountKey: ${replace_to_your_storageAccountKey}
+  # If you did not set storageAccountKey, you need to use `az login` with the Azure CLI first and set resourceGroupName.
+  # resourceGroupName: ${replace_to_your_resourceGroupName}
+  containerName: ${replace_to_your_containerName}
+  # usermount means you have already mounted this storage on localMountPoint
+  # nnimount means nni will try to mount this storage on localMountPoint
+  # nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
+  localMounted: nnimount
+#machineList can be empty if the platform is local
+machineList:
+  - ip: 10.1.1.1
+    username: bob
+    passwd: bob123
+    #port can be skipped if using default ssh port 22
+    #port: 22
+remoteConfig:
+  reuse: true
\ No newline at end of file
diff --git a/test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml b/test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
new file mode 100644
index 000000000..7bf458b43
--- /dev/null
+++ b/test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
@@ -0,0 +1,40 @@
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 1
+trainingServicePlatform: remote
+searchSpacePath: config_sharedstorage_search_space.json
+#choice: true, false
+useAnnotation: false
+nniManagerIp: 127.0.0.1
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 config_sharedstorage_trial.py
+  codeDir: .
+  gpuNum: 0
+sharedStorage:
+  storageType: NFS
+  localMountPoint: ${your/local/mount/point}
+  remoteMountPoint: ${your/remote/mount/point}
+  nfsServer: ${nfs-server-ip}
+  exportedDirectory: ${nfs/exported/directory}
+  # usermount means you have already mounted this storage on localMountPoint
+  # nnimount means nni will try to mount this storage on localMountPoint
+  # nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
+  localMounted: nnimount
+#machineList can be empty if the platform is local
+machineList:
+  - ip: 10.1.1.1
+    username: bob
+    passwd: bob123
+    #port can be skipped if using default ssh port 22
+    #port: 22
+remoteConfig:
+  reuse: true
\ No newline at end of file
diff --git a/test/config/sharedstorage_test/config_sharedstorage_search_space.json b/test/config/sharedstorage_test/config_sharedstorage_search_space.json
new file mode 100644
index 000000000..dd05405e2
--- /dev/null
+++ b/test/config/sharedstorage_test/config_sharedstorage_search_space.json
@@ -0,0 +1,7 @@
+{
+    "dropout_rate":{"_type":"uniform","_value":[0.5, 0.9]},
+    "conv_size":{"_type":"choice","_value":[2,3,5,7]},
+    "hidden_size":{"_type":"choice","_value":[124, 512, 1024]},
+    "batch_size": {"_type":"choice", "_value": [16, 32]},
+    "learning_rate":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}
+}
diff --git a/test/config/sharedstorage_test/config_sharedstorage_trial.py b/test/config/sharedstorage_test/config_sharedstorage_trial.py
new file mode 100644
index 000000000..adfc46cba
--- /dev/null
+++ b/test/config/sharedstorage_test/config_sharedstorage_trial.py
@@ -0,0 +1,24 @@
+"""
+A deep MNIST classifier using convolutional layers.
+
+This file is a modification of the official pytorch mnist example:
+https://github.com/pytorch/examples/blob/master/mnist/main.py
+"""
+import os
+import logging
+import nni
+logger = logging.getLogger('mnist_AutoML')
+if __name__ == '__main__':
+    try:
+        logger.debug(os.environ.get('NNI_OUTPUT_DIR'))
+        filename = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'checkingfile.txt')
+        f = open(filename, "a")
+
+        tuner_params = nni.get_next_parameter()
+        f.write(str(tuner_params))
+        nni.report_final_result(1)
+
+        f.close()
+    except Exception as exception:
+        logger.exception(exception)
+        raise
diff --git a/test/config/training_service.yml b/test/config/training_service.yml
index 94d228836..07e809414 100644
--- a/test/config/training_service.yml
+++ b/test/config/training_service.yml
@@ -87,6 +87,9 @@ remote:
       port:
       username:
   trainingServicePlatform: remote
+  sharedStorage:
+    storageAccountKey:
+    nfsServer:
 hybrid:
   maxExecDuration: 15m
   nniManagerIp:
diff --git a/test/nni_test/nnitest/generate_ts_config.py b/test/nni_test/nnitest/generate_ts_config.py
index d40613166..b1208f385 100644
--- a/test/nni_test/nnitest/generate_ts_config.py
+++ b/test/nni_test/nnitest/generate_ts_config.py
@@ -74,6 +74,10 @@ def update_training_service_config(args):
             config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
         if args.remote_reuse is not None:
             config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
+        if args.azurestoragetoken is not None:
+            config[args.ts]['sharedStorage']['storageAccountKey'] = args.azurestoragetoken
+        if args.nfs_server is not None:
+            config[args.ts]['sharedStorage']['nfsServer'] = args.nfs_server
     elif args.ts == 'adl':
         if args.nni_docker_image is not None:
             config[args.ts]['trial']['image'] = args.nni_docker_image
@@ -118,6 +122,8 @@ if __name__ == '__main__':
     parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
     parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
+    parser.add_argument("--azurestoragetoken", type=str)
+    parser.add_argument("--nfs_server", type=str)
     # args for PAI
     parser.add_argument("--pai_user", type=str)
     parser.add_argument("--pai_pwd", type=str)
@@ -131,7 +137,6 @@ if __name__ == '__main__':
     parser.add_argument("--nni_manager_nfs_mount_path", type=str)
     parser.add_argument("--container_nfs_mount_path", type=str)
     # args for kubeflow and frameworkController
-    parser.add_argument("--nfs_server", type=str)
     parser.add_argument("--nfs_path", type=str)
     parser.add_argument("--keyvault_vaultname", type=str)
     parser.add_argument("--keyvault_name", type=str)
diff --git a/test/nni_test/nnitest/run_tests.py b/test/nni_test/nnitest/run_tests.py
index f87416a5f..ac422207d 100644
--- a/test/nni_test/nnitest/run_tests.py
+++ b/test/nni_test/nnitest/run_tests.py
@@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
 
 it_variables = {}
 
-def update_training_service_config(config, training_service, config_file_path):
+def update_training_service_config(config, training_service, config_file_path, nni_source_dir):
     it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
 
     # hack for kubeflow trial config
@@ -38,7 +38,7 @@
             config['trial'].pop('command')
         if 'gpuNum' in config['trial']:
             config['trial'].pop('gpuNum')
-
+
     if training_service == 'adl':
         # hack for adl trial config, codeDir in adl mode refers to path in container
         containerCodeDir = config['trial']['codeDir']
@@ -52,6 +52,18 @@
             containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
         it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
         it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
+
+    if training_service == 'remote':
+        testcase_config = get_yml_content(nni_source_dir + config_file_path)
+        sharedStorage = testcase_config.get('sharedStorage')
+        if sharedStorage is None:
+            it_ts_config[training_service].pop('sharedStorage')
+        elif str(sharedStorage.get('storageType')).lower() == 'nfs':
+            it_ts_config[training_service].get('sharedStorage').pop('storageAccountKey')
+        elif str(sharedStorage.get('storageType')).lower() == 'azureblob':
+            it_ts_config[training_service].get('sharedStorage').pop('nfsServer')
+        else:
+            it_ts_config[training_service].pop('sharedStorage')
 
     if training_service == 'hybrid':
         it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
@@ -75,7 +87,7 @@ def prepare_config_file(test_case_config, it_config, args):
     # apply training service config
     # user's gpuNum, logCollection config is overwritten by the config in training_service.yml
     # the hack for kubeflow should be applied at last step
-    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
+    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir)
 
     # generate temporary config yml file to launch experiment
     new_config_file = config_path + '.tmp'
@@ -238,6 +250,15 @@ def match_training_service(test_case_config, cur_training_service):
             return True
     return False
 
+def match_remoteConfig(test_case_config, nni_source_dir):
+    trainingservice_config = get_yml_content(os.path.join('config', 'training_service.yml'))
+    trainingservice_config_reuse_value = str(trainingservice_config['remote']['remoteConfig']['reuse']).lower()
+    testcase_config = get_yml_content(nni_source_dir + test_case_config['configFile'])
+    if testcase_config.get('remoteConfig') is not None:
+        if testcase_config['remoteConfig'].get('reuse') is not None:
+            return str(testcase_config['remoteConfig']['reuse']).lower() == trainingservice_config_reuse_value
+    return True
+
 def run(args):
     it_config = get_yml_content(args.config)
@@ -264,8 +285,13 @@ def run(args):
             print('skipped {}, training service {} not match [{}]'.format(
                 name, args.ts, test_case_config['trainingService']))
             continue
+
         # remote mode need more time to cleanup
         if args.ts == 'remote' or args.ts == 'hybrid':
+            if args.ts == 'remote':
+                if not match_remoteConfig(test_case_config, args.nni_source_dir):
+                    print('skipped {}, remoteConfig not match.'.format(name))
+                    continue
             wait_for_port_available(8080, 240)
         else:
             wait_for_port_available(8080, 60)
diff --git a/test/nni_test/nnitest/validators.py b/test/nni_test/nnitest/validators.py
index 3352899d0..3fdbf94ea 100644
--- a/test/nni_test/nnitest/validators.py
+++ b/test/nni_test/nnitest/validators.py
@@ -97,3 +97,17 @@ class NnicliValidator(ITValidator):
         print(exp.get_job_statistics())
         print(exp.get_experiment_status())
         print(exp.list_trial_jobs())
+
+class FileExistValidator(ITValidator):
+    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
+        print(rest_endpoint)
+        exp_id = osp.split(experiment_dir)[-1]
+        rootpath = kwargs.get('rootpath')
+
+        metrics = requests.get(METRICS_URL).json()
+        for metric in metrics:
+            trial_id = metric['trialJobId']
+            checkpath = osp.join(rootpath, 'nni', exp_id, 'trials', trial_id, 'nnioutput', 'checkingfile.txt')
+            print('Checking shared storage log exists on trial ', trial_id)
+            assert osp.exists(checkpath)
+
diff --git a/test/vso_tools/start_docker.py b/test/vso_tools/start_docker.py
index 3057162f7..817e22928 100644
--- a/test/vso_tools/start_docker.py
+++ b/test/vso_tools/start_docker.py
@@ -25,8 +25,9 @@ container = sys.argv[2]
 password = sys.argv[3]
 
 run_command(f'docker build --build-arg NNI_RELEASE={version} -t nnidev/nni-nightly .')
-run_command(f'docker run -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
+run_command(f'docker run --privileged -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
 run_command(f'docker exec {container} useradd --create-home --password {password} nni')
 run_command(['docker', 'exec', container, 'bash', '-c', f'echo "nni:{password}" | chpasswd'])
+run_command(['docker', 'exec', container, 'bash', '-c', 'echo "nni ALL=(ALL:ALL) NOPASSWD:ALL" >> /etc/sudoers'])
 run_command(f'docker exec {container} service ssh start')
 set_variable('docker_port', port)
diff --git a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
index 7a0185f86..1335cddeb 100644
--- a/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
+++ b/ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
@@ -34,13 +34,13 @@ fi
 id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g)
 version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g)
 
-if [ $id = "Ubuntu" ]
+if [ "$id" = "Ubuntu" ]
 then
     wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb
-    sudo dpkg -i packages-microsoft-prod.deb
+    sudo DEBIAN_FRONTEND=noninteractive dpkg -i packages-microsoft-prod.deb
     sudo apt-get update
     sudo apt-get install -y blobfuse fuse
-elif [ $id = "CentOS" ] || [ $id = "RHEL" ]
+elif [ "$id" = "CentOS" ] || [ "$id" = "RHEL" ]
 then
     sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm
     sudo yum install -y blobfuse fuse
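
For reference, the shared-storage test case added above is driven by the same pair of commands as the other remote integration tests. A minimal local invocation of the new pipeline step might look like the sketch below; the bracketed values are placeholders standing in for the pipeline variables $(worker_ip), $(docker_port), $(password_in_docker), $(manager_ip), $(azureblob_token_test) and $(NFS_IP), and must be supplied by the user:

    cd test
    python3 nni_test/nnitest/generate_ts_config.py \
        --ts remote \
        --remote_reuse true \
        --remote_user nni \
        --remote_host <worker-ip> \
        --remote_port <ssh-port> \
        --remote_pwd <password> \
        --nni_manager_ip <manager-ip> \
        --azurestoragetoken <storage-account-key> \
        --nfs_server <nfs-server-ip>
    python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote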