HPO: DLC mode support nas&oss at same time (#4506)

This commit is contained in:
Weidan Kong 2022-02-15 18:50:27 -08:00 коммит произвёл GitHub
Родитель f8d2ab312c
Коммит 31fbcf4160
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 59 добавлений и 20 удалений

Просмотреть файл

@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource IDe.g., datat56by9n1xt0a
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on oss
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting

Просмотреть файл

@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
- ``str``
- The NAS datasource id configured on the PAI-DLC side.
* - ossDataSourceId
- ``str``
- The OSS datasource id configured on the PAI-DLC side; this is optional.
* - accessKeyId
- ``str``
- The accessKeyId of your cloud account.

Просмотреть файл

@ -17,9 +17,9 @@ trainingService:
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # optional, OSS data source id.
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path.
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting

Просмотреть файл

@ -2,6 +2,7 @@
# Licensed under the MIT license.
from dataclasses import dataclass
from typing import Optional
from ..training_service import TrainingServiceConfig
@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
ecs_spec: str # e.g.,'ecs.c6.large'
region: str
nas_data_source_id: str
oss_data_source_id: Optional[str] = None
access_key_id: str
access_key_secret: str
local_storage_mount_point: str

Просмотреть файл

@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
ecsSpec: string;
region: string;
nasDataSourceId: string;
ossDataSourceId?: string;
accessKeyId: string;
accessKeySecret: string;
localStorageMountPoint: string;

Просмотреть файл

@ -1,10 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import sys
import time
import json
import traceback
from argparse import ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from alibabacloud_pai_dlc20201203.client import Client
@ -20,10 +20,12 @@ if __name__ == "__main__":
parser.add_argument('--ecs_spec', help='ecs spec')
parser.add_argument('--region', help='region')
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
parser.add_argument('--access_key_id', help='access_key_id')
parser.add_argument('--access_key_secret', help='access_key_secret')
parser.add_argument('--experiment_name', help='the experiment name')
parser.add_argument('--user_command', help='user command')
parser.add_argument('--log_dir', help='exception log dir')
args = parser.parse_args()
# init client
@ -37,10 +39,17 @@ if __name__ == "__main__":
)
nas_1 = DataSourceItem(
data_source_type = 'nas',
data_source_type='nas',
data_source_id=args.nas_data_source_id,
)
oss = None
if args.oss_data_source_id:
oss = DataSourceItem(
data_source_type='oss',
data_source_id=args.oss_data_source_id,
)
# job spec
spec = JobSpec(
type=args.type,
@ -49,26 +58,34 @@ if __name__ == "__main__":
ecs_spec=args.ecs_spec,
)
data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
req = CreateJobRequest(
display_name=args.experiment_name,
job_type=args.job_type,
job_specs=[spec],
data_sources=[nas_1],
data_sources=data_sources,
user_command=args.user_command
)
# DLC submit
response = client.create_job(req)
job_id = response.body.job_id
print('job id: ' + job_id)
try:
response = client.create_job(req)
job_id = response.body.job_id
print('job id: ' + job_id)
while True:
line = sys.stdin.readline().rstrip()
if line == 'update_status':
print('status:' + client.get_job(job_id).body.status)
elif line == 'tracking_url':
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}')
elif line == 'stop':
client.stop_job(job_id)
exit(0)
while True:
line = sys.stdin.readline().rstrip()
if line == 'update_status':
print('status:' + client.get_job(job_id).body.status)
elif line == 'tracking_url':
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}')
elif line == 'stop':
client.stop_job(job_id)
exit(0)
except Exception as e:
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f:
f.write('DLC submit Exception: \n')
traceback.print_exc(file=f)

Просмотреть файл

@ -16,11 +16,14 @@ export class DlcClient {
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID'
public nasDataSourceId: string;
public ossDataSourceId: string;
public accessKeyId: string;
public accessKeySecret: string;
public experimentId: string;
public environmentId: string;
public userCommand: string;
// dlcUtil exception log dir
public logDir: string;
public pythonShellClient: undefined | PythonShell;
constructor(
@ -36,6 +39,8 @@ export class DlcClient {
accessKeyId: string,
accessKeySecret: string,
userCommand: string,
logDir: string,
ossDataSourceId?: string,
) {
this.log = getLogger('DlcClient');
this.type = type;
@ -46,11 +51,17 @@ export class DlcClient {
this.image = image;
this.region = region;
this.nasDataSourceId = nasDataSourceId;
if (ossDataSourceId !== undefined) {
this.ossDataSourceId = ossDataSourceId;
} else {
this.ossDataSourceId = '';
}
this.accessKeyId = accessKeyId;
this.accessKeySecret = accessKeySecret
this.experimentId = experimentId;
this.environmentId = environmentId;
this.userCommand = userCommand;
this.logDir = logDir;
}
public submit(): Promise<string> {
@ -67,10 +78,12 @@ export class DlcClient {
'--ecs_spec', this.ecsSpec,
'--region', this.region,
'--nas_data_source_id', this.nasDataSourceId,
'--oss_data_source_id', this.ossDataSourceId,
'--access_key_id', this.accessKeyId,
'--access_key_secret', this.accessKeySecret,
'--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`,
'--user_command', this.userCommand,
'--log_dir', this.logDir,
]
});
this.log.debug(this.pythonShellClient.command);

Просмотреть файл

@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
this.config.accessKeyId,
this.config.accessKeySecret,
environment.command,
dlcEnvironment.workingFolder,
this.config.ossDataSourceId,
);
dlcEnvironment.id = await dlcClient.submit();