Support adl pipeline (#3233)

2021-01-04 10:38:44 +08:00 · 2021-01-04 10:38:44 +08:00 · fb26187de0
--- a/pipelines/integration-test-adl.yml
+++ b/pipelines/integration-test-adl.yml
@ -0,0 +1,63 @@
+trigger: none
+pr: none
+schedules:
+- cron: 0 16 * * *
+  branches:
+    include: [ master ]
+
+jobs:
+- job: adl
+  pool: NNI CI KUBE CLI
+  timeoutInMinutes: 120
+
+  steps:
+  - script: |
+      export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
+      echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
+      echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
+
+      echo "Working directory: ${PWD}"
+      echo "NNI version: ${NNI_RELEASE}"
+      echo "Build docker image: $(build_docker_image)"
+
+      python3 -m pip install --upgrade pip setuptools
+    displayName: Prepare
+
+  - script: |
+      set -e
+      python3 setup.py build_ts
+      python3 setup.py bdist_wheel -p manylinux1_x86_64
+      python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl[SMAC,BOHB]
+    displayName: Build and install NNI
+
+  - script: |
+      set -e
+      cd examples/tuners/customized_tuner
+      python3 setup.py develop --user
+      nnictl algo register --meta meta_file.yml
+    displayName: Install customized tuner
+
+  - script: |
+      set -e
+      printf '%s' '$(docker_hub_password)' | docker login -u nnidev --password-stdin  # keep the secret out of docker's argv
+      sed -i '$a RUN python3 -m pip install adaptdl tensorboard' Dockerfile
+      sed -i '$a COPY examples /examples' Dockerfile
+      sed -i '$a COPY test /test' Dockerfile
+      echo '## Build docker image ##'
+      docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly .
+      echo '## Upload docker image ##'
+      docker push nnidev/nni-nightly
+    condition: eq(variables['build_docker_image'], 'true')
+    displayName: Build and upload docker image
+
+  - script: |
+      set -e
+      cd test
+      python3 nni_test/nnitest/generate_ts_config.py \
+          --ts adl \
+          --nni_docker_image nnidev/nni-nightly \
+          --checkpoint_storage_class $(checkpoint_storage_class) \
+          --checkpoint_storage_size $(checkpoint_storage_size) \
+          --nni_manager_ip $(nni_manager_ip)
+      python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts adl
+    displayName: Integration test
--- a/test/config/examples/cifar10-pytorch-adl.yml
+++ b/test/config/examples/cifar10-pytorch-adl.yml
@ -0,0 +1,23 @@
+authorName: nni
+experimentName: default_test
+maxExecDuration: 15m
+maxTrialNum: 1
+trialConcurrency: 1
+searchSpacePath: ./cifar10_adl_search_space.json
+
+tuner:
+  builtinTunerName: Random
+assessor:
+  builtinAssessorName: Medianstop
+  classArgs:
+    optimize_mode: maximize
+trial:
+  codeDir: /examples/trials/cifar10_pytorch
+  command: python3 main_adl.py --epochs 1
+  gpuNum: 1
+
+useAnnotation: false
+multiPhase: false
+multiThread: false
+
+trainingServicePlatform: adl
--- a/test/config/examples/cifar10_adl_search_space.json
+++ b/test/config/examples/cifar10_adl_search_space.json
@ -0,0 +1,5 @@
+{
+    "lr":{"_type":"choice", "_value":[0.1, 0.01, 0.001]},
+    "bs":{"_type":"choice","_value":[64, 96, 128]},
+    "model":{"_type":"choice", "_value":["ResNet18", "SENet18", "MobileNet"]}
+}
--- a/test/config/integration_tests.yml
+++ b/test/config/integration_tests.yml
@ -75,6 +75,10 @@ testCases:
      command: python3 main.py --epochs 1 --batches 1
      gpuNum: 0

+- name: cifar10-pytorch-adl
+  configFile: test/config/examples/cifar10-pytorch-adl.yml
+  trainingService: adl
+
 #- name: nested-ss
 #  configFile: test/config/examples/mnist-nested-search-space.yml

--- a/test/config/integration_tests_tf2.yml
+++ b/test/config/integration_tests_tf2.yml
@ -52,6 +52,10 @@ testCases:
      command: python3 main.py --epochs 1 --batches 1
      gpuNum: 0

+- name: cifar10-pytorch-adl
+  configFile: test/config/examples/cifar10-pytorch-adl.yml
+  trainingService: adl
+
 - name: classic-nas-gen-ss
  configFile: test/config/examples/classic-nas-tf2.yml
  launchCommand: nnictl ss_gen --trial_command="python3 train.py --epochs 1" --trial_dir=../examples/nas/classic_nas-tf --file=config/examples/nni-nas-search-space-tf2.json
--- a/test/config/training_service.yml
+++ b/test/config/training_service.yml
@ -103,3 +103,22 @@ remote:
    port:
    username:
  trainingServicePlatform: remote
+adl:
+  maxExecDuration: 15m
+  nniManagerIp:
+  # use a small trial number to make IT faster
+  maxTrialNum: 2
+  trialConcurrency: 2
+  trial:
+    namespace: default
+    command:
+    codeDir:
+    gpuNum: 1
+    cpuNum: 1
+    image:
+    memorySize: 1Gi
+    checkpoint:
+      storageClass:
+      storageSize:
+  trainingServicePlatform: adl
+
--- a/test/nni_test/nnitest/generate_ts_config.py
+++ b/test/nni_test/nnitest/generate_ts_config.py
@ -88,13 +88,28 @@ def update_training_service_config(args):
            config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
        if args.remote_reuse is not None:
            config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
+    elif args.ts == 'adl':
+        if args.nni_docker_image is not None:
+            config[args.ts]['trial']['image'] = args.nni_docker_image
+        if args.checkpoint_storage_class is not None:
+            config[args.ts]['trial']['checkpoint']['storageClass'] = args.checkpoint_storage_class
+        if args.checkpoint_storage_size is not None:
+            config[args.ts]['trial']['checkpoint']['storageSize'] = args.checkpoint_storage_size
+        if args.adaptive is not None:
+            config[args.ts]['trial']['adaptive'] = args.adaptive
+        if args.adl_nfs_server is not None and args.adl_nfs_path is not None and args.adl_nfs_container_mount_path is not None:
+            # the adl template has no 'nfs' section, so create it; it is only written when all three NFS options are given
+            config[args.ts]['trial']['nfs'] = {}
+            config[args.ts]['trial']['nfs']['server'] = args.adl_nfs_server
+            config[args.ts]['trial']['nfs']['path'] = args.adl_nfs_path
+            config[args.ts]['trial']['nfs']['container_mount_path'] = args.adl_nfs_container_mount_path

    dump_yml_content(TRAINING_SERVICE_FILE, config)


 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
+    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller', 'adl'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
@ -122,6 +137,13 @@ if __name__ == '__main__':
    parser.add_argument("--remote_host", type=str)
    parser.add_argument("--remote_port", type=int)
    parser.add_argument("--remote_reuse", type=str)
+    # args for adl
+    parser.add_argument("--checkpoint_storage_class", type=str)
+    parser.add_argument("--checkpoint_storage_size", type=str)
+    parser.add_argument("--adaptive", type=str)
+    parser.add_argument("--adl_nfs_server", type=str)
+    parser.add_argument("--adl_nfs_path", type=str)
+    parser.add_argument("--adl_nfs_container_mount_path", type=str)
    args = parser.parse_args()

    update_training_service_config(args)
--- a/test/nni_test/nnitest/run_tests.py
+++ b/test/nni_test/nnitest/run_tests.py
@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
 it_variables = {}


-def update_training_service_config(config, training_service):
+def update_training_service_config(config, training_service, config_file_path):
    it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))

    # hack for kubeflow trial config
@ -38,6 +38,20 @@ def update_training_service_config(config, training_service):
        config['trial'].pop('command')
        if 'gpuNum' in config['trial']:
            config['trial'].pop('gpuNum')
+    
+    if training_service == 'adl':
+        # hack for adl trial config: codeDir in adl mode refers to a path inside the container
+        container_code_dir = config['trial']['codeDir']
+        # map the metric test folders to their location in the container
+        if config['trial']['codeDir'] == '.':
+            container_code_dir = '/' + os.path.dirname(config_file_path)
+        elif config['trial']['codeDir'] == '../naive_trial':
+            container_code_dir = '/test/config/naive_trial'
+        elif '../../../' in config['trial']['codeDir']:
+            # map the example folders to their location in the container
+            container_code_dir = config['trial']['codeDir'].replace('../../../', '/')
+        it_ts_config[training_service]['trial']['codeDir'] = container_code_dir
+        it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(container_code_dir, config['trial']['command'])

    deep_update(config, it_ts_config['all'])
    deep_update(config, it_ts_config[training_service])
@ -58,7 +72,7 @@ def prepare_config_file(test_case_config, it_config, args):
    # apply training service config
    # user's gpuNum, logCollection config is overwritten by the config in training_service.yml
    # the hack for kubeflow should be applied at last step
-    update_training_service_config(test_yml_config, args.ts)
+    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])

    # generate temporary config yml file to launch experiment
    new_config_file = config_path + '.tmp'
@ -249,6 +263,10 @@ def run(args):
            wait_for_port_available(8080, 180)
        else:
            wait_for_port_available(8080, 30)
+
+        # adl mode needs more time to clean up the PVC
+        if args.ts == 'adl' and name == 'nnictl-resume-2':
+            time.sleep(30)
        print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
        begin_time = time.time()

@ -263,7 +281,7 @@ if __name__ == '__main__':
    parser.add_argument("--cases", type=str, default=None)
    parser.add_argument("--exclude", type=str, default=None)
    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai',
-                                                   'kubeflow', 'frameworkcontroller'], default='local')
+                                                   'kubeflow', 'frameworkcontroller', 'adl'], default='local')
    args = parser.parse_args()

    run(args)