зеркало из https://github.com/microsoft/nni.git
update main.ts (#4662)
This commit is contained in:
Родитель
3ac19db955
Коммит
5a7c6eca74
|
@ -1,71 +1,39 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
import assert from 'assert';
|
||||
import os from 'os';
|
||||
import assert from 'assert/strict';
|
||||
import path from 'path';
|
||||
|
||||
const API_ROOT_URL: string = '/api/v1/nni';
|
||||
import type { NniManagerArgs } from 'common/globals/arguments';
|
||||
|
||||
let singleton: ExperimentStartupInfo | null = null;
|
||||
|
||||
export class ExperimentStartupInfo {
|
||||
|
||||
public experimentId: string = '';
|
||||
public newExperiment: boolean = true;
|
||||
public basePort: number = -1;
|
||||
public initialized: boolean = false;
|
||||
public experimentId: string;
|
||||
public newExperiment: boolean;
|
||||
public basePort: number;
|
||||
public logDir: string = '';
|
||||
public logLevel: string = '';
|
||||
public readonly: boolean = false;
|
||||
public dispatcherPipe: string | null = null;
|
||||
public platform: string = '';
|
||||
public urlprefix: string = '';
|
||||
public logLevel: string;
|
||||
public readonly: boolean;
|
||||
public dispatcherPipe: string | null;
|
||||
public platform: string;
|
||||
public urlprefix: string;
|
||||
|
||||
constructor(
|
||||
newExperiment: boolean,
|
||||
experimentId: string,
|
||||
basePort: number,
|
||||
platform: string,
|
||||
logDir?: string,
|
||||
logLevel?: string,
|
||||
readonly?: boolean,
|
||||
dispatcherPipe?: string,
|
||||
urlprefix?: string) {
|
||||
this.newExperiment = newExperiment;
|
||||
this.experimentId = experimentId;
|
||||
this.basePort = basePort;
|
||||
this.platform = platform;
|
||||
|
||||
if (logDir !== undefined && logDir.length > 0) {
|
||||
this.logDir = path.join(path.normalize(logDir), experimentId);
|
||||
} else {
|
||||
this.logDir = path.join(os.homedir(), 'nni-experiments', experimentId);
|
||||
}
|
||||
|
||||
if (logLevel !== undefined && logLevel.length > 1) {
|
||||
this.logLevel = logLevel;
|
||||
}
|
||||
|
||||
if (readonly !== undefined) {
|
||||
this.readonly = readonly;
|
||||
}
|
||||
|
||||
if (dispatcherPipe != undefined && dispatcherPipe.length > 0) {
|
||||
this.dispatcherPipe = dispatcherPipe;
|
||||
}
|
||||
|
||||
if(urlprefix != undefined && urlprefix.length > 0){
|
||||
this.urlprefix = urlprefix;
|
||||
}
|
||||
}
|
||||
|
||||
public get apiRootUrl(): string {
|
||||
return this.urlprefix === '' ? API_ROOT_URL : `/${this.urlprefix}${API_ROOT_URL}`;
|
||||
constructor(args: NniManagerArgs) {
|
||||
this.experimentId = args.experimentId;
|
||||
this.newExperiment = (args.action === 'create');
|
||||
this.basePort = args.port;
|
||||
this.logDir = path.join(args.experimentsDirectory, args.experimentId); // TODO: handle in globals
|
||||
this.logLevel = args.logLevel;
|
||||
this.readonly = (args.action === 'view');
|
||||
this.dispatcherPipe = args.dispatcherPipe ?? null;
|
||||
this.platform = args.mode as string;
|
||||
this.urlprefix = args.urlPrefix;
|
||||
}
|
||||
|
||||
public static getInstance(): ExperimentStartupInfo {
|
||||
assert(singleton !== null);
|
||||
assert.notEqual(singleton, null);
|
||||
return singleton!;
|
||||
}
|
||||
}
|
||||
|
@ -74,27 +42,8 @@ export function getExperimentStartupInfo(): ExperimentStartupInfo {
|
|||
return ExperimentStartupInfo.getInstance();
|
||||
}
|
||||
|
||||
export function setExperimentStartupInfo(
|
||||
newExperiment: boolean,
|
||||
experimentId: string,
|
||||
basePort: number,
|
||||
platform: string,
|
||||
logDir?: string,
|
||||
logLevel?: string,
|
||||
readonly?: boolean,
|
||||
dispatcherPipe?: string,
|
||||
urlprefix?: string): void {
|
||||
singleton = new ExperimentStartupInfo(
|
||||
newExperiment,
|
||||
experimentId,
|
||||
basePort,
|
||||
platform,
|
||||
logDir,
|
||||
logLevel,
|
||||
readonly,
|
||||
dispatcherPipe,
|
||||
urlprefix
|
||||
);
|
||||
export function setExperimentStartupInfo(args: NniManagerArgs): void {
|
||||
singleton = new ExperimentStartupInfo(args);
|
||||
}
|
||||
|
||||
export function getExperimentId(): string {
|
||||
|
@ -120,12 +69,3 @@ export function isReadonly(): boolean {
|
|||
export function getDispatcherPipe(): string | null {
|
||||
return getExperimentStartupInfo().dispatcherPipe;
|
||||
}
|
||||
|
||||
export function getAPIRootUrl(): string {
|
||||
return getExperimentStartupInfo().apiRootUrl;
|
||||
}
|
||||
|
||||
export function getPrefixUrl(): string {
|
||||
const prefix = getExperimentStartupInfo().urlprefix === '' ? '' : `/${getExperimentStartupInfo().urlprefix}`;
|
||||
return prefix;
|
||||
}
|
||||
|
|
|
@ -104,18 +104,6 @@ function randomSelect<T>(a: T[]): T {
|
|||
return a[Math.floor(Math.random() * a.length)];
|
||||
}
|
||||
|
||||
function parseArg(names: string[]): string {
|
||||
if (process.argv.length >= 4) {
|
||||
for (let i: number = 2; i < process.argv.length - 1; i++) {
|
||||
if (names.includes(process.argv[i])) {
|
||||
return process.argv[i + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
function getCmdPy(): string {
|
||||
let cmd = 'python3';
|
||||
if (process.platform === 'win32') {
|
||||
|
@ -165,9 +153,17 @@ function prepareUnitTest(): void {
|
|||
Container.snapshot(Manager);
|
||||
Container.snapshot(ExperimentManager);
|
||||
|
||||
const logLevel: string = parseArg(['--log_level', '-ll']);
|
||||
|
||||
setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel);
|
||||
setExperimentStartupInfo({
|
||||
port: 8080,
|
||||
experimentId: 'unittest',
|
||||
action: 'create',
|
||||
experimentsDirectory: path.join(os.homedir(), 'nni-experiments'),
|
||||
logLevel: 'info',
|
||||
foreground: false,
|
||||
urlPrefix: '',
|
||||
mode: 'unittest',
|
||||
dispatcherPipe: undefined,
|
||||
});
|
||||
mkDirPSync(getLogDir());
|
||||
|
||||
const sqliteFile: string = path.join(getDefaultDatabaseDir(), 'nni.sqlite');
|
||||
|
@ -188,8 +184,6 @@ function cleanupUnitTest(): void {
|
|||
Container.restore(DataStore);
|
||||
Container.restore(Database);
|
||||
Container.restore(ExperimentManager);
|
||||
const logLevel: string = parseArg(['--log_level', '-ll']);
|
||||
setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel);
|
||||
}
|
||||
|
||||
let cachedIpv4Address: string | null = null;
|
||||
|
@ -434,5 +428,5 @@ export function importModule(modulePath: string): any {
|
|||
export {
|
||||
countFilesRecursively, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, getExperimentsInfoPath,
|
||||
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin, withLockSync, getFreePort, isPortOpen,
|
||||
mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine
|
||||
mkDirP, mkDirPSync, delay, prepareUnitTest, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine
|
||||
};
|
||||
|
|
|
@ -1,87 +1,98 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
import 'app-module-path/register';
|
||||
/**
|
||||
* Entry point of NNI manager.
|
||||
*
|
||||
* NNI manager is normally started by "nni/experiment/launcher.py".
|
||||
* It requires command line arguments defined as NniManagerArgs in "common/globals/arguments.ts".
|
||||
*
|
||||
* Example usage:
|
||||
*
|
||||
* node main.js \
|
||||
* --port 8080 \
|
||||
* --experiment-id ID \
|
||||
* --action create \
|
||||
* --experiments-directory /home/USER/nni-experiments \
|
||||
* --log-level info \
|
||||
* --foreground false \ (optional)
|
||||
* --mode local (required for now, will be removed later)
|
||||
**/
|
||||
|
||||
import 'app-module-path/register'; // so we can use absolute path to import
|
||||
|
||||
import fs from 'fs';
|
||||
|
||||
import { Container, Scope } from 'typescript-ioc';
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as component from './common/component';
|
||||
import { Database, DataStore } from './common/datastore';
|
||||
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
|
||||
import { getLogger, setLogLevel, startLogging } from './common/log';
|
||||
import { Manager, ExperimentStartUpMode } from './common/manager';
|
||||
import { ExperimentManager } from './common/experimentManager';
|
||||
import { TensorboardManager } from './common/tensorboardManager';
|
||||
import { getLogDir, mkDirP, parseArg } from './common/utils';
|
||||
import { NNIDataStore } from './core/nniDataStore';
|
||||
import { NNIManager } from './core/nnimanager';
|
||||
import { SqlDB } from './core/sqlDatabase';
|
||||
import { NNIExperimentsManager } from './core/nniExperimentsManager';
|
||||
import { NNITensorboardManager } from './core/nniTensorboardManager';
|
||||
import { RestServer } from './rest_server';
|
||||
import { parseArgs } from 'common/globals/arguments';
|
||||
import * as component from 'common/component';
|
||||
import { Database, DataStore } from 'common/datastore';
|
||||
import { ExperimentManager } from 'common/experimentManager';
|
||||
import { NniManagerArgs, parseArgs } from 'common/globals/arguments';
|
||||
import { getLogger, setLogLevel, startLogging } from 'common/log';
|
||||
import { Manager } from 'common/manager';
|
||||
import { TensorboardManager } from 'common/tensorboardManager';
|
||||
import { NNIDataStore } from 'core/nniDataStore';
|
||||
import { NNIExperimentsManager } from 'core/nniExperimentsManager';
|
||||
import { NNITensorboardManager } from 'core/nniTensorboardManager';
|
||||
import { NNIManager } from 'core/nnimanager';
|
||||
import { SqlDB } from 'core/sqlDatabase';
|
||||
import { RestServer } from 'rest_server';
|
||||
|
||||
const args = parseArgs(process.argv.slice(2));
|
||||
import path from 'path';
|
||||
import { setExperimentStartupInfo } from 'common/experimentStartupInfo';
|
||||
|
||||
// TODO: this line should be inside initGlobals()
|
||||
const args: NniManagerArgs = parseArgs(process.argv.slice(2));
|
||||
|
||||
async function start(): Promise<void> {
|
||||
getLogger('main').info('Start NNI manager');
|
||||
|
||||
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
|
||||
Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
|
||||
Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
|
||||
Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
|
||||
Container.bind(TensorboardManager).to(NNITensorboardManager).scope(Scope.Singleton);
|
||||
|
||||
async function initContainer(): Promise<void> {
|
||||
Container.bind(Manager)
|
||||
.to(NNIManager)
|
||||
.scope(Scope.Singleton);
|
||||
Container.bind(Database)
|
||||
.to(SqlDB)
|
||||
.scope(Scope.Singleton);
|
||||
Container.bind(DataStore)
|
||||
.to(NNIDataStore)
|
||||
.scope(Scope.Singleton);
|
||||
Container.bind(ExperimentManager)
|
||||
.to(NNIExperimentsManager)
|
||||
.scope(Scope.Singleton);
|
||||
Container.bind(TensorboardManager)
|
||||
.to(NNITensorboardManager)
|
||||
.scope(Scope.Singleton);
|
||||
const DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
|
||||
if (!args.foreground) {
|
||||
startLogging(DEFAULT_LOGFILE);
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
||||
setLogLevel(args.logLevel);
|
||||
const ds: DataStore = component.get(DataStore);
|
||||
|
||||
await ds.init();
|
||||
|
||||
const restServer = new RestServer(args.port, args.urlPrefix);
|
||||
await restServer.start();
|
||||
}
|
||||
|
||||
setExperimentStartupInfo(
|
||||
args.action === 'create',
|
||||
args.experimentId,
|
||||
args.port,
|
||||
args.mode,
|
||||
args.experimentsDirectory,
|
||||
args.logLevel,
|
||||
args.action === 'view',
|
||||
args.dispatcherPipe ?? '',
|
||||
args.urlPrefix
|
||||
);
|
||||
|
||||
mkDirP(getLogDir())
|
||||
.then(async () => {
|
||||
try {
|
||||
await initContainer();
|
||||
const restServer: RestServer = component.get(RestServer);
|
||||
await restServer.start();
|
||||
} catch (err) {
|
||||
getLogger('main').error(`${err.stack}`);
|
||||
throw err;
|
||||
}
|
||||
})
|
||||
.catch((err: Error) => {
|
||||
console.error(`Failed to create log dir: ${err.stack}`);
|
||||
});
|
||||
|
||||
function cleanUp(): void {
|
||||
function shutdown(): void {
|
||||
(component.get(Manager) as Manager).stopExperiment();
|
||||
}
|
||||
|
||||
// Register callbacks to free training service resources on unexpected shutdown.
|
||||
// A graceful stop should use REST API,
|
||||
// because interrupts can cause strange behaviors in children processes.
|
||||
process.on('SIGTERM', shutdown);
|
||||
process.on('SIGBREAK', shutdown);
|
||||
process.on('SIGINT', shutdown);
|
||||
|
||||
process.on('SIGTERM', cleanUp);
|
||||
process.on('SIGBREAK', cleanUp);
|
||||
process.on('SIGINT', cleanUp);
|
||||
/* main */
|
||||
|
||||
// TODO: these should be handled inside globals module
|
||||
setExperimentStartupInfo(args);
|
||||
const logDirectory = path.join(args.experimentsDirectory, args.experimentId, 'log');
|
||||
fs.mkdirSync(logDirectory, { recursive: true });
|
||||
startLogging(path.join(logDirectory, 'nnimanager.log'));
|
||||
setLogLevel(args.logLevel);
|
||||
|
||||
start().then(() => {
|
||||
getLogger('main').debug('start() returned.');
|
||||
}).catch((error) => {
|
||||
try {
|
||||
getLogger('main').error('Failed to start:', error);
|
||||
} catch (loggerError) {
|
||||
console.log('Failed to start:', error);
|
||||
console.log('Seems logger is faulty:', loggerError);
|
||||
}
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
// Node.js exits when there is no active handler,
|
||||
// and we have registered a lot of handlers which are never cleaned up.
|
||||
// So it runs forever until NNIManager calls `process.exit()`.
|
||||
|
|
|
@ -18,11 +18,10 @@
|
|||
* 2. Refactor ClusterJobRestServer to an express-ws application so it doesn't require extra port.
|
||||
* 3. Provide public API to register express app, so this can be decoupled with other modules' implementation.
|
||||
* 4. Refactor NNIRestHandler. It's a mess.
|
||||
* 5. Get rid of IOC.
|
||||
* 6. Deal with log path mismatch between REST API and file system.
|
||||
* 7. Strip slashes of URL prefix inside ExperimentStartupInfo.
|
||||
* 5. Deal with log path mismatch between REST API and file system.
|
||||
**/
|
||||
|
||||
import assert from 'assert/strict';
|
||||
import type { Server } from 'http';
|
||||
import type { AddressInfo } from 'net';
|
||||
import path from 'path';
|
||||
|
@ -32,7 +31,6 @@ import httpProxy from 'http-proxy';
|
|||
import { Deferred } from 'ts-deferred';
|
||||
|
||||
import { Singleton } from 'common/component';
|
||||
import { getBasePort, getPrefixUrl } from 'common/experimentStartupInfo';
|
||||
import { Logger, getLogger } from 'common/log';
|
||||
import { getLogDir } from 'common/utils';
|
||||
import { createRestHandler } from './restHandler';
|
||||
|
@ -50,25 +48,23 @@ export class RestServer {
|
|||
private server: Server | null = null;
|
||||
private logger: Logger = getLogger('RestServer');
|
||||
|
||||
// I would prefer to get port and urlPrefix by constructor parameters,
|
||||
// but this is impossible due to limitation of IOC.
|
||||
constructor() {
|
||||
this.port = getBasePort();
|
||||
// Stripping slashes should be done inside ExperimentInfo, but I don't want to touch it for now.
|
||||
this.urlPrefix = '/' + stripSlashes(getPrefixUrl());
|
||||
constructor(port: number, urlPrefix: string) {
|
||||
assert(!urlPrefix.startsWith('/') && !urlPrefix.endsWith('/'));
|
||||
this.port = port;
|
||||
this.urlPrefix = urlPrefix;
|
||||
}
|
||||
|
||||
// The promise is resolved when it's ready to serve requests.
|
||||
// This worth nothing for now,
|
||||
// but for example if we connect to tuner using WebSocket then it must be launched after promise resolved.
|
||||
public start(): Promise<void> {
|
||||
this.logger.info(`Starting REST server at port ${this.port}, URL prefix: "${this.urlPrefix}"`);
|
||||
this.logger.info(`Starting REST server at port ${this.port}, URL prefix: "/${this.urlPrefix}"`);
|
||||
|
||||
const app = express();
|
||||
// FIXME: We should have a global handler for critical errors.
|
||||
// `shutdown()` is not a callback and should not be passed to NNIRestHandler.
|
||||
app.use(this.urlPrefix, rootRouter(this.shutdown.bind(this)));
|
||||
app.all('*', (_req: Request, res: Response) => { res.status(404).send(`Outside prefix "${this.urlPrefix}"`); });
|
||||
app.use('/' + this.urlPrefix, rootRouter(this.shutdown.bind(this)));
|
||||
app.all('*', (_req: Request, res: Response) => { res.status(404).send(`Outside prefix "/${this.urlPrefix}"`); });
|
||||
this.server = app.listen(this.port);
|
||||
|
||||
const deferred = new Deferred<void>();
|
||||
|
@ -126,7 +122,7 @@ function rootRouter(stopCallback: () => Promise<void>): Router {
|
|||
// The REST API path "/logs" does not match file system path "/log".
|
||||
// Here we use an additional router to workaround this problem.
|
||||
const logRouter = Router();
|
||||
logRouter.get('*', express.static(getLogDir()));
|
||||
logRouter.get('*', express.static(logDirectory ?? getLogDir()));
|
||||
router.use('/logs', logRouter);
|
||||
|
||||
/* NAS model visualization */
|
||||
|
@ -153,12 +149,9 @@ function netronProxy(): Router {
|
|||
return router;
|
||||
}
|
||||
|
||||
function stripSlashes(str: string): string {
|
||||
return str.replace(/^\/+/, '').replace(/\/+$/, '');
|
||||
}
|
||||
|
||||
let webuiPath: string = path.resolve('static');
|
||||
let netronUrl: string = 'https://netron.app';
|
||||
let logDirectory: string | undefined = undefined;
|
||||
|
||||
export namespace UnitTestHelpers {
|
||||
export function getPort(server: RestServer): number {
|
||||
|
@ -172,4 +165,8 @@ export namespace UnitTestHelpers {
|
|||
export function setNetronUrl(mockUrl: string): void {
|
||||
netronUrl = mockUrl;
|
||||
}
|
||||
|
||||
export function setLogDirectory(path: string): void {
|
||||
logDirectory = path;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,8 @@ import { testExperimentManagerProvider } from '../mock/experimentManager';
|
|||
import { TensorboardManager } from '../../common/tensorboardManager';
|
||||
import { NNITensorboardManager } from '../../core/nniTensorboardManager';
|
||||
|
||||
let restServer: RestServer;
|
||||
|
||||
describe('Unit test for rest server', () => {
|
||||
|
||||
let ROOT_URL: string;
|
||||
|
@ -32,7 +34,7 @@ describe('Unit test for rest server', () => {
|
|||
Container.bind(TrainingService).to(MockedTrainingService);
|
||||
Container.bind(ExperimentManager).provider(testExperimentManagerProvider);
|
||||
Container.bind(TensorboardManager).to(NNITensorboardManager);
|
||||
const restServer: RestServer = component.get(RestServer);
|
||||
restServer = new RestServer(8080, '');
|
||||
restServer.start().then(() => {
|
||||
ROOT_URL = `http://localhost:8080/api/v1/nni`;
|
||||
done();
|
||||
|
@ -42,7 +44,7 @@ describe('Unit test for rest server', () => {
|
|||
});
|
||||
|
||||
after(() => {
|
||||
component.get<RestServer>(RestServer).shutdown();
|
||||
restServer.shutdown();
|
||||
cleanupUnitTest();
|
||||
});
|
||||
|
||||
|
|
|
@ -128,24 +128,10 @@ async function configRestServer(urlPrefix?: string) {
|
|||
await restServer.shutdown();
|
||||
}
|
||||
|
||||
// Set port, URL prefix, and log path.
|
||||
// There should be a better way to do this.
|
||||
// Maybe rewire? I can't get it work with TypeScript.
|
||||
setExperimentStartupInfo(
|
||||
true,
|
||||
path.basename(__dirname), // hacking getLogDir()
|
||||
0, // ask for a random idle port
|
||||
'local',
|
||||
path.dirname(__dirname),
|
||||
undefined,
|
||||
undefined,
|
||||
undefined,
|
||||
urlPrefix
|
||||
);
|
||||
|
||||
UnitTestHelpers.setLogDirectory(path.join(__dirname, 'log'));
|
||||
UnitTestHelpers.setWebuiPath(path.join(__dirname, 'static'));
|
||||
|
||||
restServer = new RestServer();
|
||||
restServer = new RestServer(0, urlPrefix ?? '');
|
||||
await restServer.start();
|
||||
const port = UnitTestHelpers.getPort(restServer);
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче