This commit is contained in:
liuzhe-lz 2022-03-28 17:55:08 +08:00 коммит произвёл GitHub
Родитель 3ac19db955
Коммит 5a7c6eca74
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 140 добавлений и 210 удалений

Просмотреть файл

@ -1,71 +1,39 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import assert from 'assert';
import os from 'os';
import assert from 'assert/strict';
import path from 'path';
const API_ROOT_URL: string = '/api/v1/nni';
import type { NniManagerArgs } from 'common/globals/arguments';
let singleton: ExperimentStartupInfo | null = null;
/**
 * Process-wide record of the parameters an experiment was started with
 * (experiment ID, port, log directory, log level, platform, URL prefix, ...).
 *
 * The instance is stored in the module-level `singleton` and retrieved with `getInstance()`.
 *
 * NOTE(review): this span shows every field twice (once with an initializer, once without),
 * two constructors, and an `apiRootUrl` getter without its own closing brace. It looks like
 * two revisions of the class interleaved -- confirm which lines are live before editing.
 **/
export class ExperimentStartupInfo {
public experimentId: string = '';
public newExperiment: boolean = true;
public basePort: number = -1;
public initialized: boolean = false;
public experimentId: string;
public newExperiment: boolean;
public basePort: number;
public logDir: string = '';
public logLevel: string = '';
public readonly: boolean = false;
public dispatcherPipe: string | null = null;
public platform: string = '';
public urlprefix: string = '';
public logLevel: string;
public readonly: boolean;
public dispatcherPipe: string | null;
public platform: string;
public urlprefix: string;
// Positional-parameter constructor.
// Optional parameters that are undefined (or, for strings, too short) leave the field at its initializer value.
constructor(
newExperiment: boolean,
experimentId: string,
basePort: number,
platform: string,
logDir?: string,
logLevel?: string,
readonly?: boolean,
dispatcherPipe?: string,
urlprefix?: string) {
this.newExperiment = newExperiment;
this.experimentId = experimentId;
this.basePort = basePort;
this.platform = platform;
// The log directory is always "<base>/<experimentId>";
// the base falls back to "~/nni-experiments" when `logDir` is missing or empty.
if (logDir !== undefined && logDir.length > 0) {
this.logDir = path.join(path.normalize(logDir), experimentId);
} else {
this.logDir = path.join(os.homedir(), 'nni-experiments', experimentId);
}
// NOTE(review): this check uses `length > 1` while the other string checks use `length > 0`,
// so a one-character log level is ignored -- confirm this is intended.
if (logLevel !== undefined && logLevel.length > 1) {
this.logLevel = logLevel;
}
if (readonly !== undefined) {
this.readonly = readonly;
}
// Loose `!=` also rejects null here.
if (dispatcherPipe != undefined && dispatcherPipe.length > 0) {
this.dispatcherPipe = dispatcherPipe;
}
if(urlprefix != undefined && urlprefix.length > 0){
this.urlprefix = urlprefix;
}
}
// REST API root path: API_ROOT_URL itself when there is no URL prefix,
// otherwise "/<urlprefix>" followed by API_ROOT_URL.
public get apiRootUrl(): string {
return this.urlprefix === '' ? API_ROOT_URL : `/${this.urlprefix}${API_ROOT_URL}`;
// Args-object constructor: copies the parsed command line arguments into the fields.
//   - newExperiment is true only for action "create"
//   - readonly is true only for action "view"
//   - logDir is "<experimentsDirectory>/<experimentId>"
//   - dispatcherPipe becomes null when the argument is absent
constructor(args: NniManagerArgs) {
this.experimentId = args.experimentId;
this.newExperiment = (args.action === 'create');
this.basePort = args.port;
this.logDir = path.join(args.experimentsDirectory, args.experimentId); // TODO: handle in globals
this.logLevel = args.logLevel;
this.readonly = (args.action === 'view');
this.dispatcherPipe = args.dispatcherPipe ?? null;
this.platform = args.mode as string;
this.urlprefix = args.urlPrefix;
}
// Returns the module-level singleton.
// The assertions fail while `singleton` is still null, i.e. before setExperimentStartupInfo() has assigned it.
public static getInstance(): ExperimentStartupInfo {
assert(singleton !== null);
assert.notEqual(singleton, null);
return singleton!;
}
}
@ -74,27 +42,8 @@ export function getExperimentStartupInfo(): ExperimentStartupInfo {
return ExperimentStartupInfo.getInstance();
}
export function setExperimentStartupInfo(
newExperiment: boolean,
experimentId: string,
basePort: number,
platform: string,
logDir?: string,
logLevel?: string,
readonly?: boolean,
dispatcherPipe?: string,
urlprefix?: string): void {
singleton = new ExperimentStartupInfo(
newExperiment,
experimentId,
basePort,
platform,
logDir,
logLevel,
readonly,
dispatcherPipe,
urlprefix
);
/**
 * Build an ExperimentStartupInfo from the parsed NNI manager arguments and
 * install it as the module-level singleton.
 *
 * @param args - Parsed command line arguments of NNI manager.
 **/
export function setExperimentStartupInfo(args: NniManagerArgs): void {
    const startupInfo = new ExperimentStartupInfo(args);
    singleton = startupInfo;
}
export function getExperimentId(): string {
@ -120,12 +69,3 @@ export function isReadonly(): boolean {
/**
 * Read the `dispatcherPipe` field of the current experiment startup info.
 *
 * @returns The stored pipe value, or `null` when none is stored.
 **/
export function getDispatcherPipe(): string | null {
    const { dispatcherPipe } = getExperimentStartupInfo();
    return dispatcherPipe;
}
/**
 * Read the `apiRootUrl` property of the current experiment startup info.
 *
 * @returns The REST API root path, including the URL prefix when one is set.
 **/
export function getAPIRootUrl(): string {
    const { apiRootUrl } = getExperimentStartupInfo();
    return apiRootUrl;
}
/**
 * Get the URL prefix of the current experiment as a path fragment.
 *
 * @returns An empty string when no prefix is set, otherwise the prefix with a leading "/".
 **/
export function getPrefixUrl(): string {
    const { urlprefix } = getExperimentStartupInfo();
    if (urlprefix === '') {
        return '';
    }
    return `/${urlprefix}`;
}

Просмотреть файл

@ -104,18 +104,6 @@ function randomSelect<T>(a: T[]): T {
return a[Math.floor(Math.random() * a.length)];
}
/**
 * Look up the value following a command line flag in `process.argv`.
 *
 * Only tokens from index 2 onwards are inspected, and the last token is never treated
 * as a flag because it has no value after it. The first matching flag wins.
 *
 * @param names - Accepted spellings of the flag, e.g. `['--log_level', '-ll']`.
 * @returns The token right after the first matching flag, or `''` when no flag matches.
 **/
function parseArg(names: string[]): string {
    // Candidate flag positions: everything after "node script" except the final token.
    const candidates = process.argv.slice(2, -1);
    const hit = candidates.findIndex((token) => names.includes(token));
    // `hit` is relative to index 2, so the value lives at `hit + 2 + 1`.
    return hit < 0 ? '' : process.argv[hit + 3];
}
function getCmdPy(): string {
let cmd = 'python3';
if (process.platform === 'win32') {
@ -165,9 +153,17 @@ function prepareUnitTest(): void {
Container.snapshot(Manager);
Container.snapshot(ExperimentManager);
const logLevel: string = parseArg(['--log_level', '-ll']);
setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel);
setExperimentStartupInfo({
port: 8080,
experimentId: 'unittest',
action: 'create',
experimentsDirectory: path.join(os.homedir(), 'nni-experiments'),
logLevel: 'info',
foreground: false,
urlPrefix: '',
mode: 'unittest',
dispatcherPipe: undefined,
});
mkDirPSync(getLogDir());
const sqliteFile: string = path.join(getDefaultDatabaseDir(), 'nni.sqlite');
@ -188,8 +184,6 @@ function cleanupUnitTest(): void {
Container.restore(DataStore);
Container.restore(Database);
Container.restore(ExperimentManager);
const logLevel: string = parseArg(['--log_level', '-ll']);
setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel);
}
let cachedIpv4Address: string | null = null;
@ -434,5 +428,5 @@ export function importModule(modulePath: string): any {
export {
countFilesRecursively, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, getExperimentsInfoPath,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin, withLockSync, getFreePort, isPortOpen,
mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine
mkDirP, mkDirPSync, delay, prepareUnitTest, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine
};

Просмотреть файл

@ -1,87 +1,98 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import 'app-module-path/register';
/**
* Entry point of NNI manager.
*
* NNI manager is normally started by "nni/experiment/launcher.py".
* It requires command line arguments defined as NniManagerArgs in "common/globals/arguments.ts".
*
* Example usage:
*
* node main.js \
* --port 8080 \
* --experiment-id ID \
* --action create \
* --experiments-directory /home/USER/nni-experiments \
* --log-level info \
* --foreground false \ (optional)
* --mode local (required for now, will be removed later)
**/
import 'app-module-path/register'; // so we can use absolute path to import
import fs from 'fs';
import { Container, Scope } from 'typescript-ioc';
import * as fs from 'fs';
import * as path from 'path';
import * as component from './common/component';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, setLogLevel, startLogging } from './common/log';
import { Manager, ExperimentStartUpMode } from './common/manager';
import { ExperimentManager } from './common/experimentManager';
import { TensorboardManager } from './common/tensorboardManager';
import { getLogDir, mkDirP, parseArg } from './common/utils';
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
import { NNIExperimentsManager } from './core/nniExperimentsManager';
import { NNITensorboardManager } from './core/nniTensorboardManager';
import { RestServer } from './rest_server';
import { parseArgs } from 'common/globals/arguments';
import * as component from 'common/component';
import { Database, DataStore } from 'common/datastore';
import { ExperimentManager } from 'common/experimentManager';
import { NniManagerArgs, parseArgs } from 'common/globals/arguments';
import { getLogger, setLogLevel, startLogging } from 'common/log';
import { Manager } from 'common/manager';
import { TensorboardManager } from 'common/tensorboardManager';
import { NNIDataStore } from 'core/nniDataStore';
import { NNIExperimentsManager } from 'core/nniExperimentsManager';
import { NNITensorboardManager } from 'core/nniTensorboardManager';
import { NNIManager } from 'core/nnimanager';
import { SqlDB } from 'core/sqlDatabase';
import { RestServer } from 'rest_server';
const args = parseArgs(process.argv.slice(2));
import path from 'path';
import { setExperimentStartupInfo } from 'common/experimentStartupInfo';
// TODO: this line should be inside initGlobals()
const args: NniManagerArgs = parseArgs(process.argv.slice(2));
/**
 * Start-up routine of NNI manager: binds the IOC container, sets the log level,
 * initializes the data store, and starts the REST server.
 *
 * NOTE(review): this span carries two function headers (`start` and `initContainer`) and two
 * copies of the container bindings, but only one closing brace. It looks like two revisions
 * of the routine interleaved -- confirm which lines are live before editing.
 **/
async function start(): Promise<void> {
getLogger('main').info('Start NNI manager');
// Bind each abstract service to its NNI implementation as a singleton.
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
Container.bind(TensorboardManager).to(NNITensorboardManager).scope(Scope.Singleton);
async function initContainer(): Promise<void> {
// Same five bindings as above, written one call per line.
Container.bind(Manager)
.to(NNIManager)
.scope(Scope.Singleton);
Container.bind(Database)
.to(SqlDB)
.scope(Scope.Singleton);
Container.bind(DataStore)
.to(NNIDataStore)
.scope(Scope.Singleton);
Container.bind(ExperimentManager)
.to(NNIExperimentsManager)
.scope(Scope.Singleton);
Container.bind(TensorboardManager)
.to(NNITensorboardManager)
.scope(Scope.Singleton);
// File logging is only started when not running in foreground mode.
const DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
if (!args.foreground) {
startLogging(DEFAULT_LOGFILE);
}
// eslint-disable-next-line @typescript-eslint/no-use-before-define
setLogLevel(args.logLevel);
// The data store is initialized before the REST server is started.
const ds: DataStore = component.get(DataStore);
await ds.init();
// Port and URL prefix are passed to the REST server explicitly.
const restServer = new RestServer(args.port, args.urlPrefix);
await restServer.start();
}
setExperimentStartupInfo(
args.action === 'create',
args.experimentId,
args.port,
args.mode,
args.experimentsDirectory,
args.logLevel,
args.action === 'view',
args.dispatcherPipe ?? '',
args.urlPrefix
);
mkDirP(getLogDir())
.then(async () => {
try {
await initContainer();
const restServer: RestServer = component.get(RestServer);
await restServer.start();
} catch (err) {
getLogger('main').error(`${err.stack}`);
throw err;
}
})
.catch((err: Error) => {
console.error(`Failed to create log dir: ${err.stack}`);
});
// Signal handler: asks the Manager component to stop the experiment.
// It is registered for SIGTERM, SIGBREAK and SIGINT elsewhere in this file.
// NOTE(review): two headers (`cleanUp` and `shutdown`) share one body here, which looks like
// the old and new name of the same function interleaved -- confirm which one is live.
function cleanUp(): void {
function shutdown(): void {
(component.get(Manager) as Manager).stopExperiment();
}
// Register callbacks to free training service resources on unexpected shutdown.
// A graceful stop should use REST API,
// because interrupts can cause strange behaviors in children processes.
process.on('SIGTERM', shutdown);
process.on('SIGBREAK', shutdown);
process.on('SIGINT', shutdown);
process.on('SIGTERM', cleanUp);
process.on('SIGBREAK', cleanUp);
process.on('SIGINT', cleanUp);
/* main */
// Top-level start-up sequence: publish the startup info, prepare the log directory,
// start file logging, set the log level, then launch `start()`.
// TODO: these should be handled inside globals module
setExperimentStartupInfo(args);
// Log files live in "<experimentsDirectory>/<experimentId>/log".
const logDirectory = path.join(args.experimentsDirectory, args.experimentId, 'log');
// `recursive: true` also creates missing parent directories.
fs.mkdirSync(logDirectory, { recursive: true });
startLogging(path.join(logDirectory, 'nnimanager.log'));
setLogLevel(args.logLevel);
start().then(() => {
getLogger('main').debug('start() returned.');
}).catch((error) => {
// Report the failure through the logger; if the logger itself throws, fall back to the console.
try {
getLogger('main').error('Failed to start:', error);
} catch (loggerError) {
console.log('Failed to start:', error);
console.log('Seems logger is faulty:', loggerError);
}
// A failed start always terminates the process with exit code 1.
process.exit(1);
});
// Node.js exits when there is no active handler,
// and we have registered a lot of handlers which are never cleaned up.
// So it runs forever until NNIManager calls `process.exit()`.

Просмотреть файл

@ -18,11 +18,10 @@
* 2. Refactor ClusterJobRestServer to an express-ws application so it doesn't require extra port.
* 3. Provide public API to register express app, so this can be decoupled with other modules' implementation.
* 4. Refactor NNIRestHandler. It's a mess.
* 5. Get rid of IOC.
* 6. Deal with log path mismatch between REST API and file system.
* 7. Strip slashes of URL prefix inside ExperimentStartupInfo.
* 5. Deal with log path mismatch between REST API and file system.
**/
import assert from 'assert/strict';
import type { Server } from 'http';
import type { AddressInfo } from 'net';
import path from 'path';
@ -32,7 +31,6 @@ import httpProxy from 'http-proxy';
import { Deferred } from 'ts-deferred';
import { Singleton } from 'common/component';
import { getBasePort, getPrefixUrl } from 'common/experimentStartupInfo';
import { Logger, getLogger } from 'common/log';
import { getLogDir } from 'common/utils';
import { createRestHandler } from './restHandler';
@ -50,25 +48,23 @@ export class RestServer {
private server: Server | null = null;
private logger: Logger = getLogger('RestServer');
// I would prefer to get port and urlPrefix by constructor parameters,
// but this is impossible due to limitation of IOC.
// NOTE(review): two constructor headers share one closing brace here -- a parameterless one that
// reads port and prefix from the experiment startup info, and one that receives them as arguments.
// It looks like two revisions interleaved; confirm which one is live.
constructor() {
this.port = getBasePort();
// Stripping slashes should be done inside ExperimentInfo, but I don't want to touch it for now.
this.urlPrefix = '/' + stripSlashes(getPrefixUrl());
// Parameterized form: `urlPrefix` must not start or end with "/" (asserted below).
// It is stored as given; the leading "/" is added where the prefix is used.
constructor(port: number, urlPrefix: string) {
assert(!urlPrefix.startsWith('/') && !urlPrefix.endsWith('/'));
this.port = port;
this.urlPrefix = urlPrefix;
}
// The promise is resolved when it's ready to serve requests.
// This is worth nothing for now,
// but for example if we connect to tuner using WebSocket then it must be launched after promise resolved.
public start(): Promise<void> {
this.logger.info(`Starting REST server at port ${this.port}, URL prefix: "${this.urlPrefix}"`);
this.logger.info(`Starting REST server at port ${this.port}, URL prefix: "/${this.urlPrefix}"`);
const app = express();
// FIXME: We should have a global handler for critical errors.
// `shutdown()` is not a callback and should not be passed to NNIRestHandler.
app.use(this.urlPrefix, rootRouter(this.shutdown.bind(this)));
app.all('*', (_req: Request, res: Response) => { res.status(404).send(`Outside prefix "${this.urlPrefix}"`); });
app.use('/' + this.urlPrefix, rootRouter(this.shutdown.bind(this)));
app.all('*', (_req: Request, res: Response) => { res.status(404).send(`Outside prefix "/${this.urlPrefix}"`); });
this.server = app.listen(this.port);
const deferred = new Deferred<void>();
@ -126,7 +122,7 @@ function rootRouter(stopCallback: () => Promise<void>): Router {
// The REST API path "/logs" does not match file system path "/log".
// Here we use an additional router to workaround this problem.
const logRouter = Router();
logRouter.get('*', express.static(getLogDir()));
logRouter.get('*', express.static(logDirectory ?? getLogDir()));
router.use('/logs', logRouter);
/* NAS model visualization */
@ -153,12 +149,9 @@ function netronProxy(): Router {
return router;
}
/**
 * Remove every leading and trailing "/" from a string.
 * Slashes in the middle of the string are kept.
 *
 * @param str - The string to trim, e.g. a URL prefix.
 * @returns `str` without leading or trailing slashes; `''` when it consists only of slashes.
 **/
function stripSlashes(str: string): string {
    let begin = 0;
    let end = str.length;
    // Skip the run of slashes at the front.
    while (begin < end && str[begin] === '/') {
        begin += 1;
    }
    // Skip the run of slashes at the back without crossing `begin`.
    while (end > begin && str[end - 1] === '/') {
        end -= 1;
    }
    return str.slice(begin, end);
}
let webuiPath: string = path.resolve('static');
let netronUrl: string = 'https://netron.app';
let logDirectory: string | undefined = undefined;
export namespace UnitTestHelpers {
export function getPort(server: RestServer): number {
@ -172,4 +165,8 @@ export namespace UnitTestHelpers {
export function setNetronUrl(mockUrl: string): void {
netronUrl = mockUrl;
}
// Unit test hook: overrides the directory served under the "/logs" route.
// While no override is set, the route serves getLogDir() instead.
// Note that the parameter name shadows the imported `path` module inside this function.
export function setLogDirectory(path: string): void {
logDirectory = path;
}
}

Просмотреть файл

@ -21,6 +21,8 @@ import { testExperimentManagerProvider } from '../mock/experimentManager';
import { TensorboardManager } from '../../common/tensorboardManager';
import { NNITensorboardManager } from '../../core/nniTensorboardManager';
let restServer: RestServer;
describe('Unit test for rest server', () => {
let ROOT_URL: string;
@ -32,7 +34,7 @@ describe('Unit test for rest server', () => {
Container.bind(TrainingService).to(MockedTrainingService);
Container.bind(ExperimentManager).provider(testExperimentManagerProvider);
Container.bind(TensorboardManager).to(NNITensorboardManager);
const restServer: RestServer = component.get(RestServer);
restServer = new RestServer(8080, '');
restServer.start().then(() => {
ROOT_URL = `http://localhost:8080/api/v1/nni`;
done();
@ -42,7 +44,7 @@ describe('Unit test for rest server', () => {
});
after(() => {
component.get<RestServer>(RestServer).shutdown();
restServer.shutdown();
cleanupUnitTest();
});

Просмотреть файл

@ -128,24 +128,10 @@ async function configRestServer(urlPrefix?: string) {
await restServer.shutdown();
}
// Set port, URL prefix, and log path.
// There should be a better way to do this.
// Maybe rewire? I can't get it work with TypeScript.
setExperimentStartupInfo(
true,
path.basename(__dirname), // hacking getLogDir()
0, // ask for a random idle port
'local',
path.dirname(__dirname),
undefined,
undefined,
undefined,
urlPrefix
);
UnitTestHelpers.setLogDirectory(path.join(__dirname, 'log'));
UnitTestHelpers.setWebuiPath(path.join(__dirname, 'static'));
restServer = new RestServer();
restServer = new RestServer(0, urlPrefix ?? '');
await restServer.start();
const port = UnitTestHelpers.getPort(restServer);