зеркало из https://github.com/microsoft/nni.git
Add shared storage integration test (#3455)
This commit is contained in:
Родитель
32fdd32bfe
Коммит
8f01c779ba
|
@ -46,6 +46,7 @@ class CustomAlgorithmConfig(_AlgorithmConfig):
|
|||
class TrainingServiceConfig(ConfigBase):
|
||||
platform: str
|
||||
|
||||
@dataclass(init=False)
|
||||
class SharedStorageConfig(ConfigBase):
|
||||
storage_type: str
|
||||
local_mount_point: str
|
||||
|
|
|
@ -83,6 +83,21 @@ jobs:
|
|||
commands: python3 /tmp/nnitest/$(Build.BuildId)/test/vso_tools/start_docker.py $(NNI_RELEASE) $(Build.BuildId) $(password_in_docker)
|
||||
displayName: Install NNI and run docker on Linux worker
|
||||
|
||||
- script: |
|
||||
cd test
|
||||
python3 nni_test/nnitest/generate_ts_config.py \
|
||||
--ts remote \
|
||||
--remote_reuse true \
|
||||
--remote_user nni \
|
||||
--remote_host $(worker_ip) \
|
||||
--remote_port $(docker_port) \
|
||||
--remote_pwd $(password_in_docker) \
|
||||
--nni_manager_ip $(manager_ip) \
|
||||
--azurestoragetoken $(azureblob_token_test) \
|
||||
--nfs_server $(NFS_IP)
|
||||
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
|
||||
displayName: Integration test (reuse mode)
|
||||
|
||||
- script: |
|
||||
cd test
|
||||
python3 nni_test/nnitest/generate_ts_config.py \
|
||||
|
@ -96,18 +111,6 @@ jobs:
|
|||
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
|
||||
displayName: Integration test
|
||||
|
||||
- script: |
|
||||
cd test
|
||||
python3 nni_test/nnitest/generate_ts_config.py \
|
||||
--ts remote \
|
||||
--remote_reuse true \
|
||||
--remote_user nni \
|
||||
--remote_host $(worker_ip) \
|
||||
--remote_port $(docker_port) \
|
||||
--remote_pwd $(password_in_docker) \
|
||||
--nni_manager_ip $(manager_ip)
|
||||
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
|
||||
displayName: Integration test (reuse mode)
|
||||
|
||||
- task: SSH@0
|
||||
inputs:
|
||||
|
|
|
@ -34,6 +34,34 @@ testCases:
|
|||
# check status of experiment before calling validator
|
||||
experimentStatusCheck: True
|
||||
|
||||
- name: shared-storage-remote-azureblob
|
||||
configFile: test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
|
||||
config:
|
||||
sharedStorage:
|
||||
localMountPoint: /tmp/nnimount/testlocalrootpath
|
||||
remoteMountPoint: /tmp/nnimount/testremoterootpath
|
||||
storageAccountName: nennistorage
|
||||
storageAccountKey: $(azureblob_token_test)
|
||||
containerName: sharedstorage
|
||||
validator:
|
||||
class: FileExistValidator
|
||||
kwargs:
|
||||
rootpath: /tmp/nnimount/testlocalrootpath
|
||||
|
||||
# TODO: Enable this case after nfs server is ready
|
||||
#- name: shared-storage-remote-nfs
|
||||
# configFile: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
|
||||
# config:
|
||||
# sharedStorage:
|
||||
# localMountPoint: /tmp/nnimount/testlocalrootpath
|
||||
# remoteMountPoint: /tmp/nnimount/testremoterootpath
|
||||
# nfsServer: $(NFS_IP)
|
||||
# exportedDirectory: /home/nni/mnt/
|
||||
# validator:
|
||||
# class: FileExistValidator
|
||||
# kwargs:
|
||||
# rootpath: /tmp/nnimount/testlocalrootpath
|
||||
|
||||
- name: sklearn-regression
|
||||
configFile: test/config/examples/sklearn-regression.yml
|
||||
|
||||
|
@ -227,4 +255,3 @@ testCases:
|
|||
#########################################################################
|
||||
- name: customized-tuners-demotuner
|
||||
configFile: test/config/customized_tuners/demotuner-sklearn-classification.yml
|
||||
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
authorName: default
|
||||
experimentName: example_mnist
|
||||
trialConcurrency: 1
|
||||
maxExecDuration: 1h
|
||||
maxTrialNum: 1
|
||||
trainingServicePlatform: remote
|
||||
searchSpacePath: config_sharedstorage_search_space.json
|
||||
#choice: true, false
|
||||
useAnnotation: false
|
||||
nniManagerIp: 127.0.0.1
|
||||
tuner:
|
||||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
|
||||
#SMAC (SMAC should be installed through nnictl)
|
||||
builtinTunerName: TPE
|
||||
classArgs:
|
||||
#choice: maximize, minimize
|
||||
optimize_mode: maximize
|
||||
trial:
|
||||
command: python3 config_sharedstorage_trial.py
|
||||
codeDir: .
|
||||
gpuNum: 0
|
||||
sharedStorage:
|
||||
storageType: AzureBlob
|
||||
localMountPoint: ${your/local/mount/point}
|
||||
remoteMountPoint: ${your/remote/mount/point}
|
||||
storageAccountName: ${replace_to_your_storageAccountName}
|
||||
storageAccountKey: ${replace_to_your_storageAccountKey}
|
||||
# If storageAccountKey is not set, you must first run `az login` with the Azure CLI and set resourceGroupName.
|
||||
# resourceGroupName: ${replace_to_your_resourceGroupName}
|
||||
containerName: ${replace_to_your_containerName}
|
||||
# usermount means you have already mounted this storage on localMountPoint
|
||||
# nnimount means nni will try to mount this storage on localMountPoint
|
||||
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
|
||||
localMounted: nnimount
|
||||
#machineList can be empty if the platform is local
|
||||
machineList:
|
||||
- ip: 10.1.1.1
|
||||
username: bob
|
||||
passwd: bob123
|
||||
#port can be skipped if using the default ssh port 22
|
||||
#port: 22
|
||||
remoteConfig:
|
||||
reuse: true
|
|
@ -0,0 +1,40 @@
|
|||
authorName: default
|
||||
experimentName: example_mnist
|
||||
trialConcurrency: 1
|
||||
maxExecDuration: 1h
|
||||
maxTrialNum: 1
|
||||
trainingServicePlatform: remote
|
||||
searchSpacePath: config_sharedstorage_search_space.json
|
||||
#choice: true, false
|
||||
useAnnotation: false
|
||||
nniManagerIp: 127.0.0.1
|
||||
tuner:
|
||||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
|
||||
#SMAC (SMAC should be installed through nnictl)
|
||||
builtinTunerName: TPE
|
||||
classArgs:
|
||||
#choice: maximize, minimize
|
||||
optimize_mode: maximize
|
||||
trial:
|
||||
command: python3 config_sharedstorage_trial.py
|
||||
codeDir: .
|
||||
gpuNum: 0
|
||||
sharedStorage:
|
||||
storageType: NFS
|
||||
localMountPoint: ${your/local/mount/point}
|
||||
remoteMountPoint: ${your/remote/mount/point}
|
||||
nfsServer: ${nfs-server-ip}
|
||||
exportedDirectory: ${nfs/exported/directory}
|
||||
# usermount means you have already mounted this storage on localMountPoint
|
||||
# nnimount means nni will try to mount this storage on localMountPoint
|
||||
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
|
||||
localMounted: nnimount
|
||||
#machineList can be empty if the platform is local
|
||||
machineList:
|
||||
- ip: 10.1.1.1
|
||||
username: bob
|
||||
passwd: bob123
|
||||
#port can be skipped if using the default ssh port 22
|
||||
#port: 22
|
||||
remoteConfig:
|
||||
reuse: true
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"dropout_rate":{"_type":"uniform","_value":[0.5, 0.9]},
|
||||
"conv_size":{"_type":"choice","_value":[2,3,5,7]},
|
||||
"hidden_size":{"_type":"choice","_value":[124, 512, 1024]},
|
||||
"batch_size": {"_type":"choice", "_value": [16, 32]},
|
||||
"learning_rate":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
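Minimal trial for the shared storage integration test: it appends the tuner parameters it receives to checkingfile.txt under NNI_OUTPUT_DIR and reports a constant final result.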
|
||||
|
||||
This file is a modification of the official pytorch mnist example:
|
||||
https://github.com/pytorch/examples/blob/master/mnist/main.py
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import nni
|
||||
logger = logging.getLogger('mnist_AutoML')
|
||||
if __name__ == '__main__':
    try:
        logger.debug(os.environ.get('NNI_OUTPUT_DIR'))
        # The marker file is created inside the trial's NNI output directory.
        filename = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'checkingfile.txt')

        # Use a context manager so the file handle is closed even when
        # fetching the parameters or reporting the result raises.
        with open(filename, "a") as f:
            tuner_params = nni.get_next_parameter()
            f.write(str(tuner_params))
            nni.report_final_result(1)
    except Exception as exception:
        # Log the failure with its traceback, then let it propagate so the
        # trial is reported as failed.
        logger.exception(exception)
        raise
|
|
@ -87,6 +87,9 @@ remote:
|
|||
port:
|
||||
username:
|
||||
trainingServicePlatform: remote
|
||||
sharedStorage:
|
||||
storageAccountKey:
|
||||
nfsServer:
|
||||
hybrid:
|
||||
maxExecDuration: 15m
|
||||
nniManagerIp:
|
||||
|
|
|
@ -74,6 +74,10 @@ def update_training_service_config(args):
|
|||
config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
|
||||
if args.remote_reuse is not None:
|
||||
config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
|
||||
if args.azurestoragetoken is not None:
|
||||
config[args.ts]['sharedStorage']['storageAccountKey'] = args.azurestoragetoken
|
||||
if args.nfs_server is not None:
|
||||
config[args.ts]['sharedStorage']['nfsServer'] = args.nfs_server
|
||||
elif args.ts == 'adl':
|
||||
if args.nni_docker_image is not None:
|
||||
config[args.ts]['trial']['image'] = args.nni_docker_image
|
||||
|
@ -118,6 +122,8 @@ if __name__ == '__main__':
|
|||
parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
|
||||
parser.add_argument("--nni_docker_image", type=str)
|
||||
parser.add_argument("--nni_manager_ip", type=str)
|
||||
parser.add_argument("--azurestoragetoken", type=str)
|
||||
parser.add_argument("--nfs_server", type=str)
|
||||
# args for PAI
|
||||
parser.add_argument("--pai_user", type=str)
|
||||
parser.add_argument("--pai_pwd", type=str)
|
||||
|
@ -131,7 +137,6 @@ if __name__ == '__main__':
|
|||
parser.add_argument("--nni_manager_nfs_mount_path", type=str)
|
||||
parser.add_argument("--container_nfs_mount_path", type=str)
|
||||
# args for kubeflow and frameworkController
|
||||
parser.add_argument("--nfs_server", type=str)
|
||||
parser.add_argument("--nfs_path", type=str)
|
||||
parser.add_argument("--keyvault_vaultname", type=str)
|
||||
parser.add_argument("--keyvault_name", type=str)
|
||||
|
|
|
@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
|
|||
it_variables = {}
|
||||
|
||||
|
||||
def update_training_service_config(config, training_service, config_file_path):
|
||||
def update_training_service_config(config, training_service, config_file_path, nni_source_dir):
|
||||
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
|
||||
|
||||
# hack for kubeflow trial config
|
||||
|
@ -38,7 +38,7 @@ def update_training_service_config(config, training_service, config_file_path):
|
|||
config['trial'].pop('command')
|
||||
if 'gpuNum' in config['trial']:
|
||||
config['trial'].pop('gpuNum')
|
||||
|
||||
|
||||
if training_service == 'adl':
|
||||
# hack for adl trial config, codeDir in adl mode refers to path in container
|
||||
containerCodeDir = config['trial']['codeDir']
|
||||
|
@ -52,6 +52,18 @@ def update_training_service_config(config, training_service, config_file_path):
|
|||
containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
|
||||
it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
|
||||
it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
|
||||
|
||||
if training_service == 'remote':
|
||||
testcase_config = get_yml_content(nni_source_dir + config_file_path)
|
||||
sharedStorage = testcase_config.get('sharedStorage')
|
||||
if sharedStorage is None:
|
||||
it_ts_config[training_service].pop('sharedStorage')
|
||||
elif str(sharedStorage.get('storageType')).lower() == 'nfs':
|
||||
it_ts_config[training_service].get('sharedStorage').pop('storageAccountKey')
|
||||
elif str(sharedStorage.get('storageType')).lower() == 'azureblob':
|
||||
it_ts_config[training_service].get('sharedStorage').pop('nfsServer')
|
||||
else:
|
||||
it_ts_config[training_service].pop('sharedStorage')
|
||||
|
||||
if training_service == 'hybrid':
|
||||
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
|
||||
|
@ -75,7 +87,7 @@ def prepare_config_file(test_case_config, it_config, args):
|
|||
# apply training service config
|
||||
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
|
||||
# the hack for kubeflow should be applied at last step
|
||||
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
|
||||
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir)
|
||||
|
||||
# generate temporary config yml file to launch experiment
|
||||
new_config_file = config_path + '.tmp'
|
||||
|
@ -238,6 +250,15 @@ def match_training_service(test_case_config, cur_training_service):
|
|||
return True
|
||||
return False
|
||||
|
||||
def match_remoteConfig(test_case_config, nni_source_dir):
    """Tell whether a test case's ``remoteConfig.reuse`` agrees with the training service config.

    The reference value is read from ``remote.remoteConfig.reuse`` in
    ``config/training_service.yml``. The test case's own config file is
    located by concatenating ``nni_source_dir`` with
    ``test_case_config['configFile']``.

    Returns True when the test case does not set ``remoteConfig`` or
    ``remoteConfig.reuse``; otherwise returns whether the two values are
    equal when compared as lower-cased strings.
    """
    ts_yml = get_yml_content(os.path.join('config', 'training_service.yml'))
    expected_reuse = str(ts_yml['remote']['remoteConfig']['reuse']).lower()

    case_yml = get_yml_content(nni_source_dir + test_case_config['configFile'])
    remote_section = case_yml.get('remoteConfig')
    if remote_section is None:
        # Nothing to compare against: the case does not constrain reuse mode.
        return True

    case_reuse = remote_section.get('reuse')
    if case_reuse is None:
        return True

    return str(case_reuse).lower() == expected_reuse
|
||||
|
||||
|
||||
def run(args):
|
||||
it_config = get_yml_content(args.config)
|
||||
|
@ -264,8 +285,13 @@ def run(args):
|
|||
print('skipped {}, training service {} not match [{}]'.format(
|
||||
name, args.ts, test_case_config['trainingService']))
|
||||
continue
|
||||
|
||||
# remote mode needs more time to clean up
|
||||
if args.ts == 'remote' or args.ts == 'hybrid':
|
||||
if args.ts == 'remote':
|
||||
if not match_remoteConfig(test_case_config, args.nni_source_dir):
|
||||
print('skipped {}, remoteConfig not match.'.format(name))
|
||||
continue
|
||||
wait_for_port_available(8080, 240)
|
||||
else:
|
||||
wait_for_port_available(8080, 60)
|
||||
|
|
|
@ -97,3 +97,17 @@ class NnicliValidator(ITValidator):
|
|||
print(exp.get_job_statistics())
|
||||
print(exp.get_experiment_status())
|
||||
print(exp.list_trial_jobs())
|
||||
|
||||
class FileExistValidator(ITValidator):
    """Validator asserting that ``checkingfile.txt`` exists for every trial under ``rootpath``."""

    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        """Check the per-trial marker file for each entry returned by METRICS_URL.

        ``kwargs['rootpath']`` is the directory under which the
        ``nni/<experiment id>/trials/<trial id>/nnioutput`` tree is expected.
        Raises AssertionError on the first trial whose file is missing.
        """
        print(rest_endpoint)
        # The experiment id is the last path component of the experiment directory.
        experiment_id = osp.basename(experiment_dir)
        storage_root = kwargs.get('rootpath')

        for entry in requests.get(METRICS_URL).json():
            trial_id = entry['trialJobId']
            checkpath = osp.join(
                storage_root, 'nni', experiment_id, 'trials', trial_id,
                'nnioutput', 'checkingfile.txt',
            )
            print('Checking shared storage log exists on trial ',trial_id)
            assert osp.exists(checkpath)
|
||||
|
||||
|
|
|
@ -25,8 +25,9 @@ container = sys.argv[2]
|
|||
password = sys.argv[3]
|
||||
|
||||
run_command(f'docker build --build-arg NNI_RELEASE={version} -t nnidev/nni-nightly .')
|
||||
run_command(f'docker run -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
|
||||
run_command(f'docker run --privileged -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
|
||||
run_command(f'docker exec {container} useradd --create-home --password {password} nni')
|
||||
run_command(['docker', 'exec', container, 'bash', '-c', f'echo "nni:{password}" | chpasswd'])
|
||||
run_command(['docker', 'exec', container, 'bash', '-c', 'echo "nni ALL=(ALL:ALL) NOPASSWD:ALL" >> /etc/sudoers'])
|
||||
run_command(f'docker exec {container} service ssh start')
|
||||
set_variable('docker_port', port)
|
||||
|
|
|
@ -34,13 +34,13 @@ fi
|
|||
id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g)
|
||||
version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g)
|
||||
|
||||
if [ $id = "Ubuntu" ]
|
||||
if [ "$id" = "Ubuntu" ]
|
||||
then
|
||||
wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb
|
||||
sudo dpkg -i packages-microsoft-prod.deb
|
||||
sudo DEBIAN_FRONTEND=noninteractive dpkg -i packages-microsoft-prod.deb
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y blobfuse fuse
|
||||
elif [ $id = "CentOS" ] || [ $id = "RHEL" ]
|
||||
elif [ "$id" = "CentOS" ] || [ "$id" = "RHEL" ]
|
||||
then
|
||||
sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm
|
||||
sudo yum install -y blobfuse fuse
|
||||
|
|
Загрузка…
Ссылка в новой задаче