зеркало из https://github.com/microsoft/nni.git
HPO: DLC mode support nas&oss at same time (#4506)
This commit is contained in:
Родитель
f8d2ab312c
Коммит
31fbcf4160
|
@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
|
|||
podCount: 1
|
||||
ecsSpec: ecs.c6.large
|
||||
region: cn-hangzhou
|
||||
nasDataSourceId: ${your_nas_data_source_id}
|
||||
accessKeyId: ${your_ak_id}
|
||||
accessKeySecret: ${your_ak_key}
|
||||
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a
|
||||
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
|
||||
ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on oss
|
||||
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
|
||||
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
|
||||
|
||||
|
|
|
@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
|
|||
- ``str``
|
||||
- The NAS datasource ID configured on the PAI-DLC side.
|
||||
|
||||
* - ossDataSourceId
|
||||
- ``str``
|
||||
- The OSS datasource ID configured on the PAI-DLC side. This field is optional.
|
||||
|
||||
* - accessKeyId
|
||||
- ``str``
|
||||
- The accessKeyId of your cloud account.
|
||||
|
|
|
@ -17,9 +17,9 @@ trainingService:
|
|||
podCount: 1
|
||||
ecsSpec: ecs.c6.large
|
||||
region: cn-hangzhou
|
||||
nasDataSourceId: ${your_nas_data_source_id}
|
||||
accessKeyId: ${your_ak_id}
|
||||
accessKeySecret: ${your_ak_key}
|
||||
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
|
||||
ossDataSourceId: ${your_oss_data_source_id} # optional, OSS data source id.
|
||||
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path.
|
||||
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# Licensed under the MIT license.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from ..training_service import TrainingServiceConfig
|
||||
|
||||
|
@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
|
|||
ecs_spec: str # e.g.,'ecs.c6.large'
|
||||
region: str
|
||||
nas_data_source_id: str
|
||||
oss_data_source_id: Optional[str] = None
|
||||
access_key_id: str
|
||||
access_key_secret: str
|
||||
local_storage_mount_point: str
|
||||
|
|
|
@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
|
|||
ecsSpec: string;
|
||||
region: string;
|
||||
nasDataSourceId: string;
|
||||
ossDataSourceId?: string;
|
||||
accessKeyId: string;
|
||||
accessKeySecret: string;
|
||||
localStorageMountPoint: string;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import traceback
|
||||
from argparse import ArgumentParser
|
||||
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
|
||||
from alibabacloud_pai_dlc20201203.client import Client
|
||||
|
@ -20,10 +20,12 @@ if __name__ == "__main__":
|
|||
parser.add_argument('--ecs_spec', help='ecs spec')
|
||||
parser.add_argument('--region', help='region')
|
||||
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
|
||||
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
|
||||
parser.add_argument('--access_key_id', help='access_key_id')
|
||||
parser.add_argument('--access_key_secret', help='access_key_secret')
|
||||
parser.add_argument('--experiment_name', help='the experiment name')
|
||||
parser.add_argument('--user_command', help='user command')
|
||||
parser.add_argument('--log_dir', help='exception log dir')
|
||||
args = parser.parse_args()
|
||||
|
||||
# init client
|
||||
|
@ -37,10 +39,17 @@ if __name__ == "__main__":
|
|||
)
|
||||
|
||||
nas_1 = DataSourceItem(
|
||||
data_source_type = 'nas',
|
||||
data_source_type='nas',
|
||||
data_source_id=args.nas_data_source_id,
|
||||
)
|
||||
|
||||
oss = None
|
||||
if args.oss_data_source_id:
|
||||
oss = DataSourceItem(
|
||||
data_source_type='oss',
|
||||
data_source_id=args.oss_data_source_id,
|
||||
)
|
||||
|
||||
# job spec
|
||||
spec = JobSpec(
|
||||
type=args.type,
|
||||
|
@ -49,26 +58,34 @@ if __name__ == "__main__":
|
|||
ecs_spec=args.ecs_spec,
|
||||
)
|
||||
|
||||
data_sources = [nas_1]
|
||||
if oss:
|
||||
data_sources = [nas_1, oss]
|
||||
req = CreateJobRequest(
|
||||
display_name=args.experiment_name,
|
||||
job_type=args.job_type,
|
||||
job_specs=[spec],
|
||||
data_sources=[nas_1],
|
||||
data_sources=data_sources,
|
||||
user_command=args.user_command
|
||||
)
|
||||
|
||||
# DLC submit
|
||||
response = client.create_job(req)
|
||||
job_id = response.body.job_id
|
||||
print('job id: ' + job_id)
|
||||
try:
|
||||
response = client.create_job(req)
|
||||
job_id = response.body.job_id
|
||||
print('job id: ' + job_id)
|
||||
|
||||
while True:
|
||||
line = sys.stdin.readline().rstrip()
|
||||
if line == 'update_status':
|
||||
print('status:' + client.get_job(job_id).body.status)
|
||||
elif line == 'tracking_url':
|
||||
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
|
||||
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}')
|
||||
elif line == 'stop':
|
||||
client.stop_job(job_id)
|
||||
exit(0)
|
||||
while True:
|
||||
line = sys.stdin.readline().rstrip()
|
||||
if line == 'update_status':
|
||||
print('status:' + client.get_job(job_id).body.status)
|
||||
elif line == 'tracking_url':
|
||||
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
|
||||
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}')
|
||||
elif line == 'stop':
|
||||
client.stop_job(job_id)
|
||||
exit(0)
|
||||
except Exception as e:
|
||||
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f:
|
||||
f.write('DLC submit Exception: \n')
|
||||
traceback.print_exc(file=f)
|
||||
|
|
|
@ -16,11 +16,14 @@ export class DlcClient {
|
|||
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
|
||||
// create a NAS data and copy the 'DataSet ConfigurationID'
|
||||
public nasDataSourceId: string;
|
||||
public ossDataSourceId: string;
|
||||
public accessKeyId: string;
|
||||
public accessKeySecret: string;
|
||||
public experimentId: string;
|
||||
public environmentId: string;
|
||||
public userCommand: string;
|
||||
// dlcUtil exception log dir
|
||||
public logDir: string;
|
||||
public pythonShellClient: undefined | PythonShell;
|
||||
|
||||
constructor(
|
||||
|
@ -36,6 +39,8 @@ export class DlcClient {
|
|||
accessKeyId: string,
|
||||
accessKeySecret: string,
|
||||
userCommand: string,
|
||||
logDir: string,
|
||||
ossDataSourceId?: string,
|
||||
) {
|
||||
this.log = getLogger('DlcClient');
|
||||
this.type = type;
|
||||
|
@ -46,11 +51,17 @@ export class DlcClient {
|
|||
this.image = image;
|
||||
this.region = region;
|
||||
this.nasDataSourceId = nasDataSourceId;
|
||||
if (ossDataSourceId !== undefined) {
|
||||
this.ossDataSourceId = ossDataSourceId;
|
||||
} else {
|
||||
this.ossDataSourceId = '';
|
||||
}
|
||||
this.accessKeyId = accessKeyId;
|
||||
this.accessKeySecret = accessKeySecret
|
||||
this.experimentId = experimentId;
|
||||
this.environmentId = environmentId;
|
||||
this.userCommand = userCommand;
|
||||
this.logDir = logDir;
|
||||
}
|
||||
|
||||
public submit(): Promise<string> {
|
||||
|
@ -67,10 +78,12 @@ export class DlcClient {
|
|||
'--ecs_spec', this.ecsSpec,
|
||||
'--region', this.region,
|
||||
'--nas_data_source_id', this.nasDataSourceId,
|
||||
'--oss_data_source_id', this.ossDataSourceId,
|
||||
'--access_key_id', this.accessKeyId,
|
||||
'--access_key_secret', this.accessKeySecret,
|
||||
'--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`,
|
||||
'--user_command', this.userCommand,
|
||||
'--log_dir', this.logDir,
|
||||
]
|
||||
});
|
||||
this.log.debug(this.pythonShellClient.command);
|
||||
|
|
|
@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
|
|||
this.config.accessKeyId,
|
||||
this.config.accessKeySecret,
|
||||
environment.command,
|
||||
dlcEnvironment.workingFolder,
|
||||
this.config.ossDataSourceId,
|
||||
);
|
||||
|
||||
dlcEnvironment.id = await dlcClient.submit();
|
||||
|
|
Загрузка…
Ссылка в новой задаче