diff --git a/docs/source/TrainingService/DLCMode.rst b/docs/source/TrainingService/DLCMode.rst index 9dc9e3443..d51125f11 100644 --- a/docs/source/TrainingService/DLCMode.rst +++ b/docs/source/TrainingService/DLCMode.rst @@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's podCount: 1 ecsSpec: ecs.c6.large region: cn-hangzhou - nasDataSourceId: ${your_nas_data_source_id} accessKeyId: ${your_ak_id} accessKeySecret: ${your_ak_key} - nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a + nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a + ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on OSS localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according your setting diff --git a/docs/source/reference/experiment_config.rst b/docs/source/reference/experiment_config.rst index f12c3138b..8bc52e71c 100644 --- a/docs/source/reference/experiment_config.rst +++ b/docs/source/reference/experiment_config.rst @@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__. - ``str`` - The NAS datasource id configurated in PAI-DLC side. + * - ossDataSourceId + - ``str`` + - The OSS datasource id configured on the PAI-DLC side; this is optional. + * - accessKeyId - ``str`` - The accessKeyId of your cloud account. 
diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index d4372acad..8437928f0 100644 --- a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -17,9 +17,9 @@ trainingService: podCount: 1 ecsSpec: ecs.c6.large region: cn-hangzhou - nasDataSourceId: ${your_nas_data_source_id} accessKeyId: ${your_ak_id} accessKeySecret: ${your_ak_key} nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a + ossDataSourceId: ${your_oss_data_source_id} # optional, OSS data source id. localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path. containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according your setting diff --git a/nni/experiment/config/training_services/dlc.py b/nni/experiment/config/training_services/dlc.py index 6c7b8b5e5..fae3f062c 100644 --- a/nni/experiment/config/training_services/dlc.py +++ b/nni/experiment/config/training_services/dlc.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. 
from dataclasses import dataclass +from typing import Optional from ..training_service import TrainingServiceConfig @@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig): ecs_spec: str # e.g.,'ecs.c6.large' region: str nas_data_source_id: str + oss_data_source_id: Optional[str] = None access_key_id: str access_key_secret: str local_storage_mount_point: str diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index c0b42ed35..a4a38dcbf 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig { ecsSpec: string; region: string; nasDataSourceId: string; + ossDataSourceId?: string; accessKeyId: string; accessKeySecret: string; localStorageMountPoint: string; diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index 333fc5e07..07e2b7dc5 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -1,10 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+ import os import sys -import time -import json +import traceback from argparse import ArgumentParser # ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x from alibabacloud_pai_dlc20201203.client import Client @@ -20,10 +20,12 @@ if __name__ == "__main__": parser.add_argument('--ecs_spec', help='ecs spec') parser.add_argument('--region', help='region') parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration') + parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration') parser.add_argument('--access_key_id', help='access_key_id') parser.add_argument('--access_key_secret', help='access_key_secret') parser.add_argument('--experiment_name', help='the experiment name') parser.add_argument('--user_command', help='user command') + parser.add_argument('--log_dir', help='exception log dir') args = parser.parse_args() # init client @@ -37,10 +39,17 @@ if __name__ == "__main__": ) nas_1 = DataSourceItem( - data_source_type = 'nas', + data_source_type='nas', data_source_id=args.nas_data_source_id, ) + oss = None + if args.oss_data_source_id: + oss = DataSourceItem( + data_source_type='oss', + data_source_id=args.oss_data_source_id, + ) + # job spec spec = JobSpec( type=args.type, @@ -49,26 +58,34 @@ if __name__ == "__main__": ecs_spec=args.ecs_spec, ) + data_sources = [nas_1] + if oss: + data_sources = [nas_1, oss] req = CreateJobRequest( display_name=args.experiment_name, job_type=args.job_type, job_specs=[spec], - data_sources=[nas_1], + data_sources=data_sources, user_command=args.user_command ) # DLC submit - response = client.create_job(req) - job_id = response.body.job_id - print('job id: ' + job_id) + try: + response = client.create_job(req) + job_id = response.body.job_id + print('job id: ' + job_id) - while True: - line = sys.stdin.readline().rstrip() - if line == 'update_status': - print('status:' + client.get_job(job_id).body.status) - 
elif line == 'tracking_url': - #TODO: 1. get this url by api? 2. change this url in private dlc mode. - print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}') - elif line == 'stop': - client.stop_job(job_id) - exit(0) + while True: + line = sys.stdin.readline().rstrip() + if line == 'update_status': + print('status:' + client.get_job(job_id).body.status) + elif line == 'tracking_url': + #TODO: 1. get this url by api? 2. change this url in private dlc mode. + print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}') + elif line == 'stop': + client.stop_job(job_id) + exit(0) + except Exception as e: + with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f: + f.write('DLC submit Exception: \n') + traceback.print_exc(file=f) diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts index 6f919edaa..97e889569 100644 --- a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -16,11 +16,14 @@ export class DlcClient { // e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC // create a NAS data and copy the 'DataSet ConfigurationID' public nasDataSourceId: string; + public ossDataSourceId: string; public accessKeyId: string; public accessKeySecret: string; public experimentId: string; public environmentId: string; public userCommand: string; + // dlcUtil exception log dir + public logDir: string; public pythonShellClient: undefined | PythonShell; constructor( @@ -36,6 +39,8 @@ export class DlcClient { accessKeyId: string, accessKeySecret: string, userCommand: string, + logDir: string, + ossDataSourceId?: string, ) { this.log = getLogger('DlcClient'); this.type = type; @@ -46,11 +51,17 @@ export class DlcClient { this.image = image; this.region = region; this.nasDataSourceId = 
nasDataSourceId; + if (ossDataSourceId !== undefined) { + this.ossDataSourceId = ossDataSourceId; + } else { + this.ossDataSourceId = ''; + } this.accessKeyId = accessKeyId; this.accessKeySecret = accessKeySecret this.experimentId = experimentId; this.environmentId = environmentId; this.userCommand = userCommand; + this.logDir = logDir; } public submit(): Promise { @@ -67,10 +78,12 @@ export class DlcClient { '--ecs_spec', this.ecsSpec, '--region', this.region, '--nas_data_source_id', this.nasDataSourceId, + '--oss_data_source_id', this.ossDataSourceId, '--access_key_id', this.accessKeyId, '--access_key_secret', this.accessKeySecret, '--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`, '--user_command', this.userCommand, + '--log_dir', this.logDir, ] }); this.log.debug(this.pythonShellClient.command); diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index ac9293846..b47c46639 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService { this.config.accessKeyId, this.config.accessKeySecret, environment.command, + dlcEnvironment.workingFolder, + this.config.ossDataSourceId, ); dlcEnvironment.id = await dlcClient.submit();