зеркало из https://github.com/microsoft/nni.git
Fix 3rd-party training service bug (#3726)
Co-authored-by: liuzhe <zhe.liu@microsoft.com>
This commit is contained in:
Родитель
d9dd29f322
Коммит
6b52fb1200
|
@ -39,8 +39,8 @@ def register(args):
|
|||
|
||||
try:
|
||||
service_config = {
|
||||
'node_module_path': info.node_module_path,
|
||||
'node_class_name': info.node_class_name,
|
||||
'nodeModulePath': str(info.node_module_path),
|
||||
'nodeClassName': info.node_class_name,
|
||||
}
|
||||
json.dumps(service_config)
|
||||
except Exception:
|
||||
|
|
|
@ -6,36 +6,43 @@
|
|||
import * as assert from 'assert';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import * as component from '../common/component';
|
||||
|
||||
@component.Singleton
|
||||
class ExperimentStartupInfo {
|
||||
private readonly API_ROOT_URL: string = '/api/v1/nni';
|
||||
const API_ROOT_URL: string = '/api/v1/nni';
|
||||
|
||||
private experimentId: string = '';
|
||||
private newExperiment: boolean = true;
|
||||
private basePort: number = -1;
|
||||
private initialized: boolean = false;
|
||||
private logDir: string = '';
|
||||
private logLevel: string = '';
|
||||
private readonly: boolean = false;
|
||||
private dispatcherPipe: string | null = null;
|
||||
private platform: string = '';
|
||||
private urlprefix: string = '';
|
||||
let singleton: ExperimentStartupInfo | null = null;
|
||||
|
||||
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, platform: string, logDir?: string, logLevel?: string, readonly?: boolean, dispatcherPipe?: string, urlprefix?: string): void {
|
||||
assert(!this.initialized);
|
||||
assert(experimentId.trim().length > 0);
|
||||
export class ExperimentStartupInfo {
|
||||
|
||||
public experimentId: string = '';
|
||||
public newExperiment: boolean = true;
|
||||
public basePort: number = -1;
|
||||
public initialized: boolean = false;
|
||||
public logDir: string = '';
|
||||
public logLevel: string = '';
|
||||
public readonly: boolean = false;
|
||||
public dispatcherPipe: string | null = null;
|
||||
public platform: string = '';
|
||||
public urlprefix: string = '';
|
||||
|
||||
constructor(
|
||||
newExperiment: boolean,
|
||||
experimentId: string,
|
||||
basePort: number,
|
||||
platform: string,
|
||||
logDir?: string,
|
||||
logLevel?: string,
|
||||
readonly?: boolean,
|
||||
dispatcherPipe?: string,
|
||||
urlprefix?: string) {
|
||||
this.newExperiment = newExperiment;
|
||||
this.experimentId = experimentId;
|
||||
this.basePort = basePort;
|
||||
this.initialized = true;
|
||||
this.platform = platform;
|
||||
|
||||
if (logDir !== undefined && logDir.length > 0) {
|
||||
this.logDir = path.join(path.normalize(logDir), this.getExperimentId());
|
||||
this.logDir = path.join(path.normalize(logDir), experimentId);
|
||||
} else {
|
||||
this.logDir = path.join(os.homedir(), 'nni-experiments', this.getExperimentId());
|
||||
this.logDir = path.join(os.homedir(), 'nni-experiments', experimentId);
|
||||
}
|
||||
|
||||
if (logLevel !== undefined && logLevel.length > 1) {
|
||||
|
@ -55,98 +62,67 @@ class ExperimentStartupInfo {
|
|||
}
|
||||
}
|
||||
|
||||
public getExperimentId(): string {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.experimentId;
|
||||
public get apiRootUrl(): string {
|
||||
return this.urlprefix === '' ? API_ROOT_URL : `/${this.urlprefix}${API_ROOT_URL}`;
|
||||
}
|
||||
|
||||
public getBasePort(): number {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.basePort;
|
||||
}
|
||||
|
||||
public isNewExperiment(): boolean {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.newExperiment;
|
||||
}
|
||||
|
||||
public getPlatform(): string {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.platform;
|
||||
}
|
||||
|
||||
public getLogDir(): string {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.logDir;
|
||||
}
|
||||
|
||||
public getLogLevel(): string {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.logLevel;
|
||||
}
|
||||
|
||||
public isReadonly(): boolean {
|
||||
assert(this.initialized);
|
||||
|
||||
return this.readonly;
|
||||
}
|
||||
|
||||
public getDispatcherPipe(): string | null {
|
||||
assert(this.initialized);
|
||||
return this.dispatcherPipe;
|
||||
}
|
||||
|
||||
public getAPIRootUrl(): string {
|
||||
assert(this.initialized);
|
||||
return this.urlprefix==''?this.API_ROOT_URL:`/${this.urlprefix}${this.API_ROOT_URL}`;
|
||||
public static getInstance(): ExperimentStartupInfo {
|
||||
assert(singleton !== null);
|
||||
return singleton!;
|
||||
}
|
||||
}
|
||||
|
||||
function getExperimentId(): string {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getExperimentId();
|
||||
export function getExperimentStartupInfo(): ExperimentStartupInfo {
|
||||
return ExperimentStartupInfo.getInstance();
|
||||
}
|
||||
|
||||
function getBasePort(): number {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getBasePort();
|
||||
export function setExperimentStartupInfo(
|
||||
newExperiment: boolean,
|
||||
experimentId: string,
|
||||
basePort: number,
|
||||
platform: string,
|
||||
logDir?: string,
|
||||
logLevel?: string,
|
||||
readonly?: boolean,
|
||||
dispatcherPipe?: string,
|
||||
urlprefix?: string): void {
|
||||
singleton = new ExperimentStartupInfo(
|
||||
newExperiment,
|
||||
experimentId,
|
||||
basePort,
|
||||
platform,
|
||||
logDir,
|
||||
logLevel,
|
||||
readonly,
|
||||
dispatcherPipe,
|
||||
urlprefix
|
||||
);
|
||||
}
|
||||
|
||||
function isNewExperiment(): boolean {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment();
|
||||
export function getExperimentId(): string {
|
||||
return getExperimentStartupInfo().experimentId;
|
||||
}
|
||||
|
||||
function getPlatform(): string {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getPlatform();
|
||||
export function getBasePort(): number {
|
||||
return getExperimentStartupInfo().basePort;
|
||||
}
|
||||
|
||||
function getExperimentStartupInfo(): ExperimentStartupInfo {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
|
||||
export function isNewExperiment(): boolean {
|
||||
return getExperimentStartupInfo().newExperiment;
|
||||
}
|
||||
|
||||
function setExperimentStartupInfo(
|
||||
newExperiment: boolean, experimentId: string, basePort: number, platform: string, logDir?: string, logLevel?: string, readonly?: boolean, dispatcherPipe?: string, urlprefix?: string): void {
|
||||
component.get<ExperimentStartupInfo>(ExperimentStartupInfo)
|
||||
.setStartupInfo(newExperiment, experimentId, basePort, platform, logDir, logLevel, readonly, dispatcherPipe, urlprefix);
|
||||
export function getPlatform(): string {
|
||||
return getExperimentStartupInfo().platform;
|
||||
}
|
||||
|
||||
function isReadonly(): boolean {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isReadonly();
|
||||
export function isReadonly(): boolean {
|
||||
return getExperimentStartupInfo().readonly;
|
||||
}
|
||||
|
||||
function getDispatcherPipe(): string | null {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getDispatcherPipe();
|
||||
export function getDispatcherPipe(): string | null {
|
||||
return getExperimentStartupInfo().dispatcherPipe;
|
||||
}
|
||||
|
||||
function getAPIRootUrl(): string {
|
||||
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getAPIRootUrl();
|
||||
export function getAPIRootUrl(): string {
|
||||
return getExperimentStartupInfo().apiRootUrl;
|
||||
}
|
||||
|
||||
export {
|
||||
ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getPlatform, getExperimentStartupInfo,
|
||||
setExperimentStartupInfo, isReadonly, getDispatcherPipe, getAPIRootUrl
|
||||
};
|
||||
|
|
|
@ -6,27 +6,32 @@
|
|||
import { spawn } from 'child_process';
|
||||
import { Logger, getLogger } from './log';
|
||||
|
||||
const python = process.platform === 'win32' ? 'python.exe' : 'python3';
|
||||
const logger: Logger = getLogger('pythonScript');
|
||||
|
||||
export async function runPythonScript(script: string, logger?: Logger): Promise<string> {
|
||||
const python: string = process.platform === 'win32' ? 'python.exe' : 'python3';
|
||||
|
||||
export async function runPythonScript(script: string, logTag?: string): Promise<string> {
|
||||
const proc = spawn(python, [ '-c', script ]);
|
||||
|
||||
let stdout: string = '';
|
||||
let stderr: string = '';
|
||||
proc.stdout.on('data', (data: string) => { stdout += data; });
|
||||
proc.stderr.on('data', (data: string) => { stderr += data; });
|
||||
|
||||
const procPromise = new Promise<void>((resolve, reject) => {
|
||||
proc.on('error', (err: Error) => { reject(err); });
|
||||
proc.on('exit', () => { resolve(); });
|
||||
});
|
||||
await procPromise;
|
||||
|
||||
const stdout = proc.stdout.read().toString();
|
||||
const stderr = proc.stderr.read().toString();
|
||||
|
||||
if (stderr) {
|
||||
if (logger === undefined) {
|
||||
logger = getLogger('pythonScript');
|
||||
if (logTag) {
|
||||
logger.warning(`Python script [${logTag}] has stderr:`, stderr);
|
||||
} else {
|
||||
logger.warning('Python script has stderr.');
|
||||
logger.warning(' script:', script);
|
||||
logger.warning(' stderr:', stderr);
|
||||
}
|
||||
logger.warning('python script has stderr.');
|
||||
logger.warning('script:', script);
|
||||
logger.warning('stderr:', stderr);
|
||||
}
|
||||
|
||||
return stdout;
|
||||
|
|
|
@ -19,13 +19,13 @@ import * as util from 'util';
|
|||
import * as glob from 'glob';
|
||||
|
||||
import { Database, DataStore } from './datastore';
|
||||
import { ExperimentStartupInfo, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
|
||||
import { getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
|
||||
import { ExperimentConfig, Manager } from './manager';
|
||||
import { ExperimentManager } from './experimentManager';
|
||||
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
|
||||
|
||||
function getExperimentRootDir(): string {
|
||||
return getExperimentStartupInfo().getLogDir();
|
||||
return getExperimentStartupInfo().logDir;
|
||||
}
|
||||
|
||||
function getLogDir(): string {
|
||||
|
@ -33,7 +33,7 @@ function getLogDir(): string {
|
|||
}
|
||||
|
||||
function getLogLevel(): string {
|
||||
return getExperimentStartupInfo().getLogLevel();
|
||||
return getExperimentStartupInfo().logLevel;
|
||||
}
|
||||
|
||||
function getDefaultDatabaseDir(): string {
|
||||
|
@ -184,7 +184,6 @@ function generateParamFileName(hyperParameters: HyperParameters): string {
|
|||
* Must be paired with `cleanupUnitTest()`.
|
||||
*/
|
||||
function prepareUnitTest(): void {
|
||||
Container.snapshot(ExperimentStartupInfo);
|
||||
Container.snapshot(Database);
|
||||
Container.snapshot(DataStore);
|
||||
Container.snapshot(TrainingService);
|
||||
|
@ -213,8 +212,9 @@ function cleanupUnitTest(): void {
|
|||
Container.restore(TrainingService);
|
||||
Container.restore(DataStore);
|
||||
Container.restore(Database);
|
||||
Container.restore(ExperimentStartupInfo);
|
||||
Container.restore(ExperimentManager);
|
||||
const logLevel: string = parseArg(['--log_level', '-ll']);
|
||||
setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel);
|
||||
}
|
||||
|
||||
let cachedipv4Address: string = '';
|
||||
|
|
|
@ -8,6 +8,7 @@ import * as path from 'path';
|
|||
import * as component from '../../../common/component';
|
||||
import { getLogger, Logger } from '../../../common/log';
|
||||
import { ExperimentConfig, AmlConfig, flattenConfig } from '../../../common/experimentConfig';
|
||||
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
|
||||
import { validateCodeDir } from '../../common/util';
|
||||
import { AMLClient } from '../aml/amlClient';
|
||||
import { AMLEnvironmentInformation } from '../aml/amlConfig';
|
||||
|
@ -29,10 +30,10 @@ export class AMLEnvironmentService extends EnvironmentService {
|
|||
private experimentRootDir: string;
|
||||
private config: FlattenAmlConfig;
|
||||
|
||||
constructor(experimentRootDir: string, experimentId: string, config: ExperimentConfig) {
|
||||
constructor(config: ExperimentConfig, info: ExperimentStartupInfo) {
|
||||
super();
|
||||
this.experimentId = experimentId;
|
||||
this.experimentRootDir = experimentRootDir;
|
||||
this.experimentId = info.experimentId;
|
||||
this.experimentRootDir = info.logDir;
|
||||
this.config = flattenConfig(config, 'aml');
|
||||
validateCodeDir(this.config.trialCodeDirectory);
|
||||
}
|
||||
|
|
|
@ -4,24 +4,22 @@ import { LocalEnvironmentService } from './localEnvironmentService';
|
|||
import { RemoteEnvironmentService } from './remoteEnvironmentService';
|
||||
import { EnvironmentService } from '../environment';
|
||||
import { ExperimentConfig } from '../../../common/experimentConfig';
|
||||
import { getExperimentId } from '../../../common/experimentStartupInfo';
|
||||
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
|
||||
import { getCustomEnvironmentServiceConfig } from '../../../common/nniConfig';
|
||||
import { getExperimentRootDir, importModule } from '../../../common/utils';
|
||||
|
||||
import { importModule } from '../../../common/utils';
|
||||
|
||||
export async function createEnvironmentService(name: string, config: ExperimentConfig): Promise<EnvironmentService> {
|
||||
const expId = getExperimentId();
|
||||
const rootDir = getExperimentRootDir();
|
||||
const info = ExperimentStartupInfo.getInstance();
|
||||
|
||||
switch(name) {
|
||||
case 'local':
|
||||
return new LocalEnvironmentService(rootDir, expId, config);
|
||||
return new LocalEnvironmentService(config, info);
|
||||
case 'remote':
|
||||
return new RemoteEnvironmentService(rootDir, expId, config);
|
||||
return new RemoteEnvironmentService(config, info);
|
||||
case 'aml':
|
||||
return new AMLEnvironmentService(rootDir, expId, config);
|
||||
return new AMLEnvironmentService(config, info);
|
||||
case 'openpai':
|
||||
return new OpenPaiEnvironmentService(rootDir, expId, config);
|
||||
return new OpenPaiEnvironmentService(config, info);
|
||||
}
|
||||
|
||||
const esConfig = await getCustomEnvironmentServiceConfig(name);
|
||||
|
@ -30,5 +28,5 @@ export async function createEnvironmentService(name: string, config: ExperimentC
|
|||
}
|
||||
const esModule = importModule(esConfig.nodeModulePath);
|
||||
const esClass = esModule[esConfig.nodeClassName] as any;
|
||||
return new esClass(rootDir, expId, config);
|
||||
return new esClass(config, info);
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ import * as tkill from 'tree-kill';
|
|||
import * as component from '../../../common/component';
|
||||
import { getLogger, Logger } from '../../../common/log';
|
||||
import { ExperimentConfig } from '../../../common/experimentConfig';
|
||||
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
|
||||
import { EnvironmentInformation, EnvironmentService } from '../environment';
|
||||
import { isAlive, getNewLine } from '../../../common/utils';
|
||||
import { execMkdir, runScript, getScriptName, execCopydir } from '../../common/util';
|
||||
|
@ -21,10 +22,10 @@ export class LocalEnvironmentService extends EnvironmentService {
|
|||
private experimentRootDir: string;
|
||||
private experimentId: string;
|
||||
|
||||
constructor(experimentRootDir: string, experimentId: string, _config: ExperimentConfig) {
|
||||
constructor(_config: ExperimentConfig, info: ExperimentStartupInfo) {
|
||||
super();
|
||||
this.experimentId = experimentId;
|
||||
this.experimentRootDir = experimentRootDir;
|
||||
this.experimentId = info.experimentId;
|
||||
this.experimentRootDir = info.logDir;
|
||||
}
|
||||
|
||||
public get environmentMaintenceLoopInterval(): number {
|
||||
|
|
|
@ -8,6 +8,7 @@ import * as request from 'request';
|
|||
import { Deferred } from 'ts-deferred';
|
||||
import * as component from '../../../common/component';
|
||||
import { ExperimentConfig, OpenpaiConfig, flattenConfig, toMegaBytes } from '../../../common/experimentConfig';
|
||||
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
|
||||
import { getLogger, Logger } from '../../../common/log';
|
||||
import { PAIClusterConfig } from '../../pai/paiConfig';
|
||||
import { NNIPAITrialConfig } from '../../pai/paiConfig';
|
||||
|
@ -31,9 +32,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
|
|||
private experimentId: string;
|
||||
private config: FlattenOpenpaiConfig;
|
||||
|
||||
constructor(_experimentRootDir: string, experimentId: string, config: ExperimentConfig) {
|
||||
constructor(config: ExperimentConfig, info: ExperimentStartupInfo) {
|
||||
super();
|
||||
this.experimentId = experimentId;
|
||||
this.experimentId = info.experimentId;
|
||||
this.config = flattenConfig(config, 'openpai');
|
||||
this.paiToken = this.config.token;
|
||||
this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http';
|
||||
|
|
|
@ -10,6 +10,7 @@ import { getLogger, Logger } from '../../../common/log';
|
|||
import { EnvironmentInformation, EnvironmentService } from '../environment';
|
||||
import { getLogLevel } from '../../../common/utils';
|
||||
import { ExperimentConfig, RemoteConfig, RemoteMachineConfig, flattenConfig } from '../../../common/experimentConfig';
|
||||
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
|
||||
import { execMkdir } from '../../common/util';
|
||||
import { ExecutorManager } from '../../remote_machine/remoteMachineData';
|
||||
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
|
||||
|
@ -32,13 +33,13 @@ export class RemoteEnvironmentService extends EnvironmentService {
|
|||
private experimentId: string;
|
||||
private config: FlattenRemoteConfig;
|
||||
|
||||
constructor(experimentRootDir: string, experimentId: string, config: ExperimentConfig) {
|
||||
constructor(config: ExperimentConfig, info: ExperimentStartupInfo) {
|
||||
super();
|
||||
this.experimentId = experimentId;
|
||||
this.experimentId = info.experimentId;
|
||||
this.environmentExecutorManagerMap = new Map<string, ExecutorManager>();
|
||||
this.machineExecutorManagerMap = new Map<RemoteMachineConfig, ExecutorManager>();
|
||||
this.remoteMachineMetaOccupiedMap = new Map<RemoteMachineConfig, boolean>();
|
||||
this.experimentRootDir = experimentRootDir;
|
||||
this.experimentRootDir = info.logDir;
|
||||
this.log = getLogger('RemoteEnvironmentService');
|
||||
this.config = flattenConfig(config, 'remote');
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче