Add api endpoint and webportal page of job retry history (#3831)

This commit is contained in:
Mingliang Tao 2019-11-21 00:13:47 +08:00 коммит произвёл GitHub
Родитель bbb98167b1
Коммит 15ed057560
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
24 изменённых файлов: 1859 добавлений и 66 удалений

Просмотреть файл

@ -2889,6 +2889,172 @@ Status: 500
}
```
### `GET /api/v2/jobs/:frameworkName/job-attempts/healthz`
Check whether the job attempts API is healthy.
*Request*
```json
GET /api/v2/jobs/:frameworkName/job-attempts/healthz
```
*Response if succeeded*
```json
Status: 200
ok
```
*Response if the job attempts API is not available*
```json
Status: 501
Not healthy
```
### `GET /api/v2/jobs/:frameworkName/job-attempts`
Get all attempts of a certain job.
*Request*
```json
GET /api/v2/jobs/:frameworkName/job-attempts
```
*Response if succeeded*
```json
Status: 200
[
{
"jobName": string,
"frameworkName": string,
"userName": string,
"state": "FAILED",
"originState": "Completed",
"maxAttemptCount": 4,
"attemptIndex": 3,
"jobStartedTime": 1572592684000,
"attemptStartedTime": 1572592813000,
"attemptCompletedTime": 1572592840000,
"exitCode": 255,
"exitPhrase": "PAIRuntimeUnknownFailed",
"exitType": "Failed",
"diagnosticsSummary": string,
"totalGpuNumber": 1,
"totalTaskNumber": 1,
"totalTaskRoleNumber": 1,
"taskRoles": {
"taskrole": {
"taskRoleStatus": {
"name": "taskrole"
},
"taskStatuses": [
{
"taskIndex": 0,
"taskState": "FAILED",
"containerId": uuid string,
"containerIp": ip string,
"containerGpus": null,
"containerLog": url string,
"containerExitCode": 255
}
]
}
},
"isLatest": true
},
]
```
*Response if attempts not found*
```json
Status: 404
Not Found
```
*Response if a server error occurred*
```json
Status: 501
Internal Error
```
### `GET /api/v2/jobs/:frameworkName/job-attempts/:jobAttemptIndex`
Get a specific attempt of a job by its attempt index.
*Request*
```json
GET /api/v2/jobs/:frameworkName/job-attempts/:jobAttemptIndex
```
*Response if succeeded*
```json
Status: 200
{
"jobName": string,
"frameworkName": string,
"userName": string,
"state": "FAILED",
"originState": "Completed",
"maxAttemptCount": 4,
"attemptIndex": 3,
"jobStartedTime": 1572592684000,
"attemptStartedTime": 1572592813000,
"attemptCompletedTime": 1572592840000,
"exitCode": 255,
"exitPhrase": "PAIRuntimeUnknownFailed",
"exitType": "Failed",
"diagnosticsSummary": string,
"totalGpuNumber": 1,
"totalTaskNumber": 1,
"totalTaskRoleNumber": 1,
"taskRoles": {
"taskrole": {
"taskRoleStatus": {
"name": "taskrole"
},
"taskStatuses": [
{
"taskIndex": 0,
"taskState": "FAILED",
"containerId": uuid string,
"containerIp": ip string,
"containerGpus": null,
"containerLog": url string,
"containerExitCode": 255
}
]
}
},
"isLatest": true
}
```
*Response if attempts not found*
```json
Status: 404
Not Found
```
*Response if a server error occurred*
```json
Status: 501
Internal Error
```
## About legacy jobs
Because [Framework ACL](../../subprojects/frameworklauncher/yarn/doc/USERMANUAL.md#Framework_ACL) is enabled starting from this version,

Просмотреть файл

@ -24,6 +24,7 @@
"node": "^8.9.0"
},
"dependencies": {
"@elastic/elasticsearch": "^7.4.0",
"ajv": "^6.10.0",
"ajv-merge-patch": "~4.1.0",
"async": "~2.5.0",

Просмотреть файл

@ -0,0 +1,45 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// module dependencies
const asyncHandler = require('@pai/middlewares/v2/asyncHandler');
const jobAttempt = require('@pai/models/v2/job-attempt.js');
/**
 * Express handler for the job attempts health endpoint.
 * Replies 200 'ok' when the model reports healthy, otherwise 501 'Not healthy'.
 */
const healthCheck = asyncHandler(async (req, res) => {
  const healthy = await jobAttempt.healthCheck();
  if (healthy) {
    res.status(200).send('ok');
    return;
  }
  res.status(501).send('Not healthy');
});
/**
 * Express handler that lists the attempts of the job named by
 * req.params.frameworkName and relays the model's status and data as JSON.
 */
const list = asyncHandler(async (req, res) => {
  const {frameworkName} = req.params;
  const {status, data} = await jobAttempt.list(frameworkName);
  res.status(status).json(data);
});
/**
 * Express handler that returns one attempt of a job.
 * The :jobAttemptIndex route parameter is converted with Number() before it
 * is handed to the model; the model's status and data are relayed as JSON.
 */
const get = asyncHandler(async (req, res) => {
  const {frameworkName, jobAttemptIndex} = req.params;
  const attemptIndex = Number(jobAttemptIndex);
  const {status, data} = await jobAttempt.get(frameworkName, attemptIndex);
  res.status(status).json(data);
});
// module exports: the three job attempt request handlers
module.exports = {healthCheck, list, get};

Просмотреть файл

@ -0,0 +1,287 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// module dependencies
const _ = require('lodash');
const axios = require('axios');
const {Client} = require('@elastic/elasticsearch');
const base32 = require('base32');
const {Agent} = require('https');
const {isNil} = require('lodash');
const {convertToJobAttempt} = require('@pai/utils/frameworkConverter');
const launcherConfig = require('@pai/config/launcher');
const {apiserver} = require('@pai/config/kubernetes');
const createError = require('@pai/utils/error');
// Shared Elasticsearch client; stays undefined when ELASTICSEARCH_URI is not set.
const elasticSearchUri = process.env.ELASTICSEARCH_URI;
const elasticSearchClient = _.isNil(elasticSearchUri)
  ? undefined
  : new Client({node: elasticSearchUri});
/**
 * Normalize a framework name to fit the framework controller spec:
 * lower-case it and drop every character outside [a-z0-9].
 *
 * @param {string} name - Raw framework name.
 * @return {string} The normalized name.
 */
const convertName = (name) => {
  const lowered = name.toLowerCase();
  return lowered.replace(/[^a-z0-9]/g, '');
};
/**
 * Translate a job framework name into the name used when querying the launcher.
 * Names that start with 'unknown' or contain no '~' are treated as not
 * generated by PAI: the leading 'unknown' is stripped and the rest is
 * normalized with convertName. Every other name is base32 encoded.
 *
 * @param {string} name - Framework name as received from the API.
 * @return {string} The encoded framework name.
 */
const encodeName = (name) => {
  const generatedByPai = !name.startsWith('unknown') && name.includes('~');
  if (generatedByPai) {
    // base32 encode
    return base32.encode(name);
  }
  // framework is not generated by PAI
  return convertName(name.replace(/^unknown/g, ''));
};
// job attempts api only works in k8s launcher and when elastic search exists
/**
 * Report whether the job attempts API can serve requests.
 *
 * @return {Promise<boolean>} false when the launcher type is 'yarn', when no
 *   Elasticsearch client was created, or when fetching the 'framework' index
 *   throws or answers with a status other than 200; true otherwise.
 */
const healthCheck = async () => {
  // Compare the launcher type exactly once: chaining a second `=== 'yarn'`
  // compares a boolean with a string and is always false.
  if (launcherConfig.type === 'yarn') {
    return false;
  }
  if (_.isNil(elasticSearchClient)) {
    return false;
  }
  try {
    const result = await elasticSearchClient.indices.get({
      index: 'framework',
    });
    return result.statusCode === 200;
  } catch (e) {
    // Any failure to reach the index is reported as "not healthy" on purpose.
    return false;
  }
};
// list job attempts
/**
 * List all attempts of a job: the current one read from the launcher API
 * (flagged isLatest: true) followed by the attempts recorded in the
 * Elasticsearch 'framework' index (flagged isLatest: false).
 *
 * @param {string} frameworkName - Job framework name; encoded with encodeName
 *   before the launcher is queried.
 * @return {Promise<{status: number, data: ?Array<Object>}>}
 *   501 with null data when healthCheck() reports unhealthy;
 *   404 with null data when the launcher answers 404, the framework has no
 *   uid, or Elasticsearch returns no attempt bucket;
 *   200 with the attempt list otherwise.
 * @throws Rethrows request errors that carry no HTTP response, and throws a
 *   createError(status, 'UnknownError', message) for any other launcher status.
 */
const list = async (frameworkName) => {
  // healthCheck is an async function: it has to be called and awaited.
  // Testing the bare reference is always truthy and never yields 501.
  if (!(await healthCheck())) {
    return {status: 501, data: null};
  }

  const attemptData = [];
  let uid;

  // get latest framework from k8s API
  let response;
  try {
    response = await axios({
      method: 'get',
      url: launcherConfig.frameworkPath(encodeName(frameworkName)),
      headers: launcherConfig.requestHeaders,
      httpsAgent: apiserver.ca && new Agent({ca: apiserver.ca}),
    });
  } catch (error) {
    if (error.response != null) {
      // non-2xx answers are handled by the status checks below
      response = error.response;
    } else {
      throw error;
    }
  }

  if (response.status === 200) {
    // get UID from k8s framework API
    uid = response.data.metadata.uid;
    attemptData.push({
      ...(await convertToJobAttempt(response.data)),
      isLatest: true,
    });
  } else if (response.status === 404) {
    return {status: 404, data: null};
  } else {
    throw createError(response.status, 'UnknownError', response.data.message);
  }

  if (isNil(uid)) {
    return {status: 404, data: null};
  }

  // get history frameworks from elastic search:
  // keep only snapshots of this framework uid, group them by attempt id
  // (descending) and take the most recently collected snapshot of each group.
  // NOTE(review): the terms aggregation sets no `size`, so the Elasticsearch
  // default bucket limit applies -- confirm it covers jobs with many retries.
  const body = {
    query: {
      bool: {
        filter: {
          term: {
            'objectSnapshot.metadata.uid.keyword': uid,
          },
        },
      },
    },
    size: 0,
    aggs: {
      attemptID_group: {
        terms: {
          field: 'objectSnapshot.status.attemptStatus.id',
          order: {
            _key: 'desc',
          },
        },
        aggs: {
          collectTime_latest_hits: {
            top_hits: {
              sort: [
                {
                  collectTime: {
                    order: 'desc',
                  },
                },
              ],
              size: 1,
            },
          },
        },
      },
    },
  };

  const esResult = await elasticSearchClient.search({
    index: 'framework',
    body: body,
  });

  const buckets = esResult.body.aggregations.attemptID_group.buckets;
  if (_.isEmpty(buckets)) {
    return {status: 404, data: null};
  }

  const retryFrameworks = buckets.map((bucket) => {
    return bucket.collectTime_latest_hits.hits.hits[0]._source.objectSnapshot;
  });
  const jobRetries = await Promise.all(
    retryFrameworks.map((attemptFramework) => {
      return convertToJobAttempt(attemptFramework);
    }),
  );
  attemptData.push(
    ...jobRetries.map((jobRetry) => {
      return {...jobRetry, isLatest: false};
    }),
  );
  return {status: 200, data: attemptData};
};
/**
 * Get a single attempt of a job by its attempt index.
 *
 * The current attempt (index equal to the framework's
 * status.attemptStatus.id) is converted straight from the launcher API answer
 * and flagged isLatest: true. Earlier attempts are looked up in the
 * Elasticsearch 'framework' index and flagged isLatest: false.
 *
 * @param {string} frameworkName - Job framework name; encoded with encodeName
 *   before the launcher is queried.
 * @param {number} jobAttemptIndex - Index of the requested attempt.
 * @return {Promise<{status: number, data: ?Object}>}
 *   501 with null data when healthCheck() reports unhealthy;
 *   404 with null data when the framework or the attempt cannot be found;
 *   200 with the attempt otherwise.
 * @throws Rethrows request errors that carry no HTTP response, and throws a
 *   createError(status, 'UnknownError', message) for any other launcher status.
 */
const get = async (frameworkName, jobAttemptIndex) => {
  // healthCheck is an async function: it has to be called and awaited.
  // Testing the bare reference is always truthy and never yields 501.
  if (!(await healthCheck())) {
    return {status: 501, data: null};
  }

  let uid;
  let attemptFramework;
  let response;
  try {
    response = await axios({
      method: 'get',
      url: launcherConfig.frameworkPath(encodeName(frameworkName)),
      headers: launcherConfig.requestHeaders,
      httpsAgent: apiserver.ca && new Agent({ca: apiserver.ca}),
    });
  } catch (error) {
    if (error.response != null) {
      // non-2xx answers are handled by the status checks below
      response = error.response;
    } else {
      throw error;
    }
  }

  if (response.status === 200) {
    // get uid from k8s framework API
    uid = response.data.metadata.uid;
    attemptFramework = response.data;
  } else if (response.status === 404) {
    return {status: 404, data: null};
  } else {
    throw createError(response.status, 'UnknownError', response.data.message);
  }

  // The framework returned by the launcher describes the attempt whose index
  // is status.attemptStatus.id. Comparing against spec.retryPolicy.maxRetryCount
  // instead is only right once every retry has been used.
  const latestAttemptIndex = attemptFramework.status.attemptStatus.id;

  if (jobAttemptIndex < latestAttemptIndex) {
    if (isNil(uid)) {
      return {status: 404, data: null};
    }
    // get history frameworks from elastic search:
    // keep only snapshots of this framework uid and of the requested attempt,
    // then take the most recently collected one.
    const body = {
      query: {
        bool: {
          filter: {
            term: {
              'objectSnapshot.metadata.uid.keyword': uid,
            },
          },
        },
      },
      size: 0,
      aggs: {
        attemptID_group: {
          filter: {
            term: {
              'objectSnapshot.status.attemptStatus.id': jobAttemptIndex,
            },
          },
          aggs: {
            collectTime_latest_hits: {
              top_hits: {
                sort: [
                  {
                    collectTime: {
                      order: 'desc',
                    },
                  },
                ],
                size: 1,
              },
            },
          },
        },
      },
    };
    const esResult = await elasticSearchClient.search({
      index: 'framework',
      body: body,
    });

    const hits =
      esResult.body.aggregations.attemptID_group.collectTime_latest_hits.hits
        .hits;
    if (_.isEmpty(hits)) {
      return {status: 404, data: null};
    }
    attemptFramework = hits[0]._source.objectSnapshot;
    const attemptDetail = await convertToJobAttempt(attemptFramework);
    return {status: 200, data: {...attemptDetail, isLatest: false}};
  } else if (jobAttemptIndex === latestAttemptIndex) {
    // get latest frameworks from k8s API
    const attemptDetail = await convertToJobAttempt(attemptFramework);
    return {status: 200, data: {...attemptDetail, isLatest: true}};
  } else {
    // index beyond the current attempt (or not a number)
    return {status: 404, data: null};
  }
};
// module exports: health probe plus the two job attempt queries
module.exports = {healthCheck, list, get};

Просмотреть файл

@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// module dependencies
const express = require('express');
const controller = require('@pai/controllers/v2/job-attempt');
// mergeParams keeps the params captured by the parent router's mount path
// visible in req.params (the controller reads req.params.frameworkName).
const router = new express.Router({mergeParams: true});

/** GET /api/v2/jobs/:frameworkName/job-attempts/healthz - health check of the job attempts endpoint */
router.get('/healthz', controller.healthCheck);

/** GET /api/v2/jobs/:frameworkName/job-attempts - list all attempts of the job named frameworkName */
router.get('/', controller.list);

/** GET /api/v2/jobs/:frameworkName/job-attempts/:jobAttemptIndex - get one attempt by its index */
router.get('/:jobAttemptIndex', controller.get);

module.exports = router;

Просмотреть файл

@ -21,6 +21,7 @@ const express = require('express');
const token = require('@pai/middlewares/token');
const controller = require('@pai/controllers/v2/job');
const protocol = require('@pai/middlewares/v2/protocol');
const jobAttemptRouter = require('@pai/routes/v2/job-attempt.js');
const router = new express.Router();
@ -51,5 +52,7 @@ router.route('/:frameworkName/ssh')
/** GET /api/v2/jobs/:frameworkName/ssh - Get job ssh info */
.get(controller.getSshInfo);
router.use('/:frameworkName/job-attempts', jobAttemptRouter);
// module exports
module.exports = router;

Просмотреть файл

@ -0,0 +1,75 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// module dependencies
const elasticsearch = require('@elastic/elasticsearch');
// Elasticsearch client used by search().
// The original variable name is read first so existing deployments keep
// working; ELASTICSEARCH_URI (the name read by models/v2/job-attempt.js) is
// accepted as a fallback so both modules can share one setting.
const client = new elasticsearch.Client({
  nodes: process.env.Elasticsearch_URI || process.env.ELASTICSEARCH_URI,
});
/**
 * Run a search against Elasticsearch and reshape the answer into
 * a {status, data} pair.
 *
 * When req is exactly the string 'attemptID', data is the ObjectSnapshot of
 * the first hit. Otherwise data is {items: [...]} holding, for every bucket
 * of the attemptID_group aggregation, the ObjectSnapshot of the first
 * top hit of its first CollectTime_sort bucket.
 *
 * @param {string} [index='*'] - Index to search; also quoted in the 404 message.
 * @param {(string|Object)} [body='{}'] - Query passed through to client.search.
 * @param {string} [req] - Pass 'attemptID' to read plain hits instead of the
 *   attemptID_group aggregation.
 * @return {Promise<{status: number, data: Object}>} 200 with the snapshot(s),
 *   or 404 with a message when nothing matched.
 */
async function search(index = '*', body = '{}', req) {
  const esResult = await client.search({index, body});
  const notFound = () => ({
    status: 404,
    data: {
      message: `The specified ${index} is not found`,
    },
  });

  if (req === 'attemptID') {
    const hits = esResult.body.hits.hits;
    if (hits.length === 0) {
      return notFound();
    }
    return {status: 200, data: hits[0]._source.ObjectSnapshot};
  }

  const aggResults = esResult.body.aggregations.attemptID_group.buckets;
  if (aggResults.length === 0) {
    return notFound();
  }
  const items = aggResults.map(
    (bucket) =>
      bucket.CollectTime_sort.buckets[0].top.hits.hits[0]._source
        .ObjectSnapshot,
  );
  return {status: 200, data: {items}};
}
// module exports
module.exports = {search};

Просмотреть файл

@ -0,0 +1,364 @@
const zlib = require('zlib');
const axios = require('axios');
const {Agent} = require('https');
const _ = require('lodash');
const yaml = require('js-yaml');
const path = require('path');
const fs = require('fs');
const launcherConfig = require('@pai/config/launcher');
const {apiserver} = require('@pai/config/kubernetes');
const k8s = require('@pai/utils/k8sUtils');
const logger = require('@pai/config/logger');
const env = require('@pai/utils/env');
// Exit-spec keys used by generateExitSpec when a code has no entry of its own:
// codes greater than 0 borrow the spec stored under 256,
// every other code borrows the spec stored under -8000.
const positiveFallbackExitCode = 256;
const negativeFallbackExitCode = -8000;
/**
 * Load the job exit spec YAML file and index its entries by exit code.
 *
 * The file location comes from the environment variable named by
 * env.exitSpecPath (relative values are resolved against two directories
 * above this module); when it is unset, a fixed default path is used.
 *
 * @return {Object} Map from exit code to the spec entry declaring that code.
 */
const generateSpecMap = () => {
  const configuredPath = process.env[env.exitSpecPath];
  let specFile = '/k8s-job-exit-spec-configuration/k8s-job-exit-spec.yaml';
  if (configuredPath) {
    specFile = path.isAbsolute(configuredPath)
      ? configuredPath
      : path.resolve(__dirname, '../../', configuredPath);
  }
  const specList = yaml.safeLoad(fs.readFileSync(specFile));
  return specList.reduce((specMap, spec) => {
    specMap[spec.code] = spec;
    return specMap;
  }, {});
};
/**
 * Resolve the job name of a framework.
 *
 * @param {string} name - Framework name, used as-is when no label is available
 *   (the framework name has not been encoded in that case).
 * @param {?Object} labels - Framework labels; a truthy labels.jobName wins.
 * @return {string} labels.jobName when present, otherwise name.
 */
const decodeName = (name, labels) => {
  const labelledName = labels && labels.jobName;
  return labelledName ? labelledName : name;
};
/**
 * Decode a field stored as base64-encoded, gzip-compressed JSON.
 *
 * @param {?string} val - Encoded field, or null/undefined when absent.
 * @return {*} The parsed JSON value, or null when val is null/undefined.
 */
const decompressField = (val) => {
  if (val === null || val === undefined) {
    return null;
  }
  const gzipped = Buffer.from(val, 'base64');
  const json = zlib.gunzipSync(gzipped).toString();
  return JSON.parse(json);
};
/**
 * Extract the runtime error report embedded in a container message.
 *
 * Containers are scanned in order; those with code <= 0 or without a message
 * are skipped. The first message containing both
 * [PAI_RUNTIME_ERROR_START] and [PAI_RUNTIME_ERROR_END] ends the scan: the
 * text between the anchors is parsed as YAML and returned together with the
 * container name.
 *
 * @param {?Object} podCompletionStatus - Pod completion status; its
 *   `containers` list may be missing.
 * @return {?Object} The parsed report plus `name`, or null when the status is
 *   empty, has no container list, no container carries a report, or the
 *   report is not valid YAML (a warning is logged in that case).
 */
const extractRuntimeOutput = (podCompletionStatus) => {
  if (_.isEmpty(podCompletionStatus)) {
    return null;
  }

  // A status without a container list simply has no runtime output;
  // iterating a missing/null field would throw a TypeError instead.
  const containers = Array.isArray(podCompletionStatus.containers)
    ? podCompletionStatus.containers
    : [];

  const anchor1 = /\[PAI_RUNTIME_ERROR_START\]/;
  const anchor2 = /\[PAI_RUNTIME_ERROR_END\]/;

  let res = null;
  for (const container of containers) {
    if (container.code <= 0) {
      continue;
    }
    const message = container.message;
    if (message == null) {
      continue;
    }
    const match1 = message.match(anchor1);
    const match2 = message.match(anchor2);
    if (match1 !== null && match2 !== null) {
      const start = match1.index + match1[0].length;
      const end = match2.index;
      const output = message.substring(start, end).trim();
      try {
        res = {
          ...yaml.safeLoad(output),
          name: container.name,
        };
      } catch (error) {
        logger.warn('failed to format runtime output:', output, error);
      }
      // only the first container carrying both anchors is considered
      break;
    }
  }
  return res;
};
/**
 * Build the exit diagnostics of an attempt from its raw diagnostics text.
 *
 * When the text contains "matched: <json>" and the JSON parses, the summary
 * becomes the text up to and including "matched:" followed by a YAML dump of
 * that JSON, and the runtime output is extracted from it. The launcher field
 * is null when runtime output was found and equals the summary otherwise.
 * Without a parsable JSON tail the raw text is used for summary and launcher.
 *
 * @param {?string} diag - Raw diagnostics text.
 * @return {?{diagnosticsSummary: string, runtime: ?Object, launcher: ?string}}
 *   null when diag is empty.
 */
const generateExitDiagnostics = (diag) => {
  if (_.isEmpty(diag)) {
    return null;
  }

  // Default answer: the whole text is treated as launcher output.
  const diagnostics = {
    diagnosticsSummary: diag,
    runtime: null,
    launcher: diag,
  };

  const matched = diag.match(/matched: (.*)/);
  if (matched === null || matched.length < 2) {
    // No container info here
    return diagnostics;
  }

  let podStatus = null;
  try {
    podStatus = JSON.parse(matched[1]);
  } catch (error) {
    logger.warn('Get diagnostics info failed', error);
    return diagnostics;
  }

  const prefix = diag.substring(0, matched.index + 'matched:'.length);
  const summary = prefix + '\n' + yaml.safeDump(podStatus);
  diagnostics.diagnosticsSummary = summary;

  // Runtime output, when present, replaces the launcher output.
  diagnostics.runtime = extractRuntimeOutput(podStatus);
  diagnostics.launcher = diagnostics.runtime !== null ? null : summary;
  return diagnostics;
};
/**
 * Map a framework controller state to the job state exposed by the API.
 *
 * @param {string} state - Framework or task state reported by the launcher.
 * @param {?number} exitCode - Completion code; only consulted for the
 *   completed states (0 is success, -210 and -220 mean stopped).
 * @return {string} One of 'WAITING', 'RUNNING', 'SUCCEEDED', 'STOPPED',
 *   'FAILED' or 'UNKNOWN'.
 */
const convertState = (state, exitCode) => {
  const waitingStates = [
    'AttemptCreationPending',
    'AttemptCreationRequested',
    'AttemptPreparing',
  ];
  const runningStates = [
    'AttemptRunning',
    'AttemptDeletionPending',
    'AttemptDeletionRequested',
    'AttemptDeleting',
  ];
  const completedStates = ['AttemptCompleted', 'Completed'];

  if (waitingStates.includes(state)) {
    return 'WAITING';
  }
  if (runningStates.includes(state)) {
    return 'RUNNING';
  }
  if (completedStates.includes(state)) {
    if (exitCode === 0) {
      return 'SUCCEEDED';
    }
    return exitCode === -210 || exitCode === -220 ? 'STOPPED' : 'FAILED';
  }
  return 'UNKNOWN';
};
/**
 * Look up the exit spec describing an exit code.
 *
 * The spec map is regenerated through generateSpecMap() on every call.
 * Codes without an entry of their own reuse the positive fallback spec
 * (code > 0) or the negative fallback spec (any other code), with `code`
 * overwritten by the requested value.
 *
 * @param {?number} code - Exit code of the attempt.
 * @return {?Object} The matching spec, a fallback spec carrying `code`, or
 *   null when code is null/undefined.
 */
const generateExitSpec = (code) => {
  const specs = generateSpecMap();
  if (_.isNil(code)) {
    return null;
  }
  const exactSpec = specs[code];
  if (!_.isNil(exactSpec)) {
    return exactSpec;
  }
  const fallbackKey =
    code > 0 ? positiveFallbackExitCode : negativeFallbackExitCode;
  return {
    ...specs[fallbackKey],
    code,
  };
};
/**
 * Convert a framework object (from the launcher API or from a stored
 * snapshot) into the job attempt structure returned by the REST API.
 *
 * Side effect: when status.attemptStatus.taskRoleStatuses is missing it is
 * filled in on the given framework object by decompressing
 * status.attemptStatus.taskRoleStatusesCompressed.
 *
 * @param {Object} framework - Framework object with metadata, spec and status.
 * @return {Promise<Object>} The attempt: identity, state, timestamps in
 *   milliseconds, exit information and per task role task details.
 */
const convertToJobAttempt = async (framework) => {
  const {metadata, spec, status} = framework;
  const attemptStatus = status.attemptStatus;
  const completionStatus = attemptStatus.completionStatus;
  const labels = metadata.labels;
  const annotations = metadata.annotations;
  const toMillis = (timestamp) => new Date(timestamp).getTime();

  // identity
  const jobName = decodeName(metadata.name, labels);
  const frameworkName = metadata.name;
  const uid = metadata.uid;
  const userName = labels ? labels.userName : 'unknown';

  // state and retry bookkeeping
  const exitCode = completionStatus ? completionStatus.code : null;
  const state = convertState(
    status.state,
    exitCode,
    status.retryPolicyStatus.retryDelaySec,
  );
  const originState = status.state;
  const maxAttemptCount = spec.retryPolicy.maxRetryCount + 1;
  const attemptIndex = attemptStatus.id;

  // timestamps
  const jobStartedTime = toMillis(metadata.creationTimestamp);
  const attemptStartedTime = toMillis(attemptStatus.startTime);
  const attemptCompletedTime = toMillis(attemptStatus.completionTime);

  // resources
  const totalGpuNumber = annotations ? annotations.totalGpuNumber : 0;
  const totalTaskNumber = spec.taskRoles.reduce(
    (sum, taskRole) => sum + taskRole.taskNumber,
    0,
  );
  const totalTaskRoleNumber = spec.taskRoles.length;

  // exit information
  const exitDiagnostics = generateExitDiagnostics(
    completionStatus ? completionStatus.diagnostics : null,
  );
  const trigger =
    completionStatus && completionStatus.trigger
      ? completionStatus.trigger
      : null;
  const appExitTriggerMessage = trigger ? trigger.message : null;
  const appExitTriggerTaskRoleName = trigger ? trigger.taskRoleName : null;
  const appExitTriggerTaskIndex = trigger ? trigger.taskIndex : null;
  const appExitSpec = generateExitSpec(exitCode);
  let appExitDiagnostics = null;
  let appExitMessages = null;
  if (exitDiagnostics) {
    appExitDiagnostics = exitDiagnostics.diagnosticsSummary;
    appExitMessages = {
      container: null,
      runtime: exitDiagnostics.runtime,
      launcher: exitDiagnostics.launcher,
    };
  }

  // check fields which may be compressed
  if (attemptStatus.taskRoleStatuses == null) {
    attemptStatus.taskRoleStatuses = decompressField(
      attemptStatus.taskRoleStatusesCompressed,
    );
  }

  const exitPhrase = completionStatus ? completionStatus.phrase : null;
  const exitType = completionStatus ? completionStatus.type.name : null;

  // task roles are converted one after another, their tasks in parallel
  const taskRoles = {};
  for (const roleStatus of attemptStatus.taskRoleStatuses) {
    const roleName = roleStatus.name;
    const taskStatuses = await Promise.all(
      roleStatus.taskStatuses.map((taskStatus) =>
        convertTaskDetail(taskStatus, userName, jobName, roleName),
      ),
    );
    taskRoles[roleName] = {
      taskRoleStatus: {
        name: roleName,
      },
      taskStatuses,
    };
  }

  return {
    jobName,
    frameworkName,
    uid,
    userName,
    state,
    originState,
    maxAttemptCount,
    attemptIndex,
    jobStartedTime,
    attemptStartedTime,
    attemptCompletedTime,
    exitCode,
    exitPhrase,
    exitType,
    exitDiagnostics,
    appExitTriggerMessage,
    appExitTriggerTaskRoleName,
    appExitTriggerTaskIndex,
    appExitSpec,
    appExitDiagnostics,
    appExitMessages,
    totalGpuNumber,
    totalTaskNumber,
    totalTaskRoleNumber,
    taskRoles,
  };
};
/**
 * Convert the status of one task into the task detail returned by the API.
 *
 * The GPU bitmask is a best-effort lookup of the task's pod: with the hived
 * scheduler enabled it is built from the ids listed in the pod's
 * 'hivedscheduler.microsoft.com/pod-gpu-isolation' annotation, otherwise ids
 * 0..n-1 are mocked from the first container's 'nvidia.com/gpu' limit.
 * Any failure during the lookup leaves containerGpus at null.
 *
 * @param {Object} taskStatus - Task status from the framework.
 * @param {string} userName - Owner of the job, used in the log URL.
 * @param {string} jobName - Job name, used in the log URL.
 * @param {string} taskRoleName - Task role name, used in the log URL.
 * @return {Promise<Object>} Task index, state, container id/ip/gpus, log URL
 *   and container exit code.
 */
const convertTaskDetail = async (
  taskStatus,
  userName,
  jobName,
  taskRoleName,
) => {
  // get container gpus
  let containerGpus;
  try {
    const {data: pod} = await axios({
      method: 'get',
      url: launcherConfig.podPath(taskStatus.attemptStatus.podName),
      headers: launcherConfig.requestHeaders,
      httpsAgent: apiserver.ca && new Agent({ca: apiserver.ca}),
    });
    if (launcherConfig.enabledHived) {
      const isolation =
        pod.metadata.annotations[
          'hivedscheduler.microsoft.com/pod-gpu-isolation'
        ];
      let gpuMask = 0;
      for (const gpuId of isolation.split(',')) {
        gpuMask = gpuMask + Math.pow(2, gpuId);
      }
      containerGpus = gpuMask;
    } else {
      const gpuNumber = k8s.atoi(
        pod.spec.containers[0].resources.limits['nvidia.com/gpu'],
      );
      // mock GPU ids from 0 to (gpuNumber - 1)
      containerGpus = Math.pow(2, gpuNumber) - 1;
    }
  } catch (err) {
    containerGpus = null;
  }

  const attempt = taskStatus.attemptStatus;
  const completionStatus = attempt.completionStatus;
  const exitCode = completionStatus ? completionStatus.code : null;
  return {
    taskIndex: taskStatus.index,
    taskState: convertState(
      taskStatus.state,
      exitCode,
      taskStatus.retryPolicyStatus.retryDelaySec,
    ),
    containerId: attempt.podUID,
    containerIp: attempt.podHostIP,
    containerGpus,
    containerLog: `http://${attempt.podHostIP}:${process.env.LOG_MANAGER_PORT}/log-manager/${userName}/${jobName}/${taskRoleName}/${attempt.podUID}/`,
    containerExitCode: exitCode,
  };
};
// module exports
module.exports = {convertToJobAttempt};

Просмотреть файл

@ -2,6 +2,18 @@
# yarn lockfile v1
"@elastic/elasticsearch@^7.4.0":
version "7.4.0"
resolved "https://registry.yarnpkg.com/@elastic/elasticsearch/-/elasticsearch-7.4.0.tgz#57f4066acf25e9d4e9b4f6376088433aae6f25d4"
integrity sha512-HpEKHH6mHQRvea3lw4NNJw9ZUS1KmkpwWKHucaHi1svDn+/fEAwY0wD8egL1vZJo4ZmWfCQMjVqGL+Hoy1HYRw==
dependencies:
debug "^4.1.1"
decompress-response "^4.2.0"
into-stream "^5.1.0"
ms "^2.1.1"
once "^1.4.0"
pump "^3.0.0"
accepts@~1.3.5:
version "1.3.5"
resolved "https://registry.yarnpkg.com/accepts/-/accepts-1.3.5.tgz#eb777df6011723a3b14e8a72c0805c8e86746bd2"
@ -767,6 +779,13 @@ debug@^3.1.0, debug@^3.2.6:
dependencies:
ms "^2.1.1"
debug@^4.1.1:
version "4.1.1"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.1.1.tgz#3b72260255109c6b589cee050f1d516139664791"
integrity sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==
dependencies:
ms "^2.1.1"
decamelize@^1.1.1:
version "1.2.0"
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
@ -776,6 +795,13 @@ decode-uri-component@^0.2.0:
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.0.tgz#eb3913333458775cb84cd1a1fae062106bb87545"
integrity sha1-6zkTMzRYd1y4TNGh+uBiEGu4dUU=
decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"
deep-eql@^0.1.3:
version "0.1.3"
resolved "https://registry.yarnpkg.com/deep-eql/-/deep-eql-0.1.3.tgz#ef558acab8de25206cd713906d74e56930eb69f2"
@ -897,6 +923,13 @@ encodeurl@~1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/encodeurl/-/encodeurl-1.0.2.tgz#ad3ff4c86ec2d029322f5a02c3a9a606c95b3f59"
end-of-stream@^1.1.0:
version "1.4.4"
resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0"
integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
dependencies:
once "^1.4.0"
error-ex@^1.2.0:
version "1.3.2"
resolved "https://registry.yarnpkg.com/error-ex/-/error-ex-1.3.2.tgz#b4ac40648107fdcdcfae242f428bea8a14d4f1bf"
@ -1329,6 +1362,14 @@ fresh@0.5.2:
version "0.5.2"
resolved "https://registry.yarnpkg.com/fresh/-/fresh-0.5.2.tgz#3d8cadd90d976569fa835ab1f8e4b23a105605a7"
from2@^2.3.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/from2/-/from2-2.3.0.tgz#8bfb5502bde4a4d36cfdeea007fcca21d7e382af"
integrity sha1-i/tVAr3kpNNs/e6gB/zKIdfjgq8=
dependencies:
inherits "^2.0.1"
readable-stream "^2.0.0"
fs-extra@~7.0.1:
version "7.0.1"
resolved "https://registry.yarnpkg.com/fs-extra/-/fs-extra-7.0.1.tgz#4f189c44aa123b895f722804f55ea23eadc348e9"
@ -1639,6 +1680,11 @@ inherits@2, inherits@2.0.3, inherits@^2.0.3, inherits@~2.0.1, inherits@~2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.3.tgz#633c2c83e3da42a502f52466022480f4208261de"
inherits@^2.0.1:
version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
inquirer@^3.0.6:
version "3.3.0"
resolved "https://registry.yarnpkg.com/inquirer/-/inquirer-3.3.0.tgz#9dd2f2ad765dcab1ff0443b491442a20ba227dc9"
@ -1658,6 +1704,14 @@ inquirer@^3.0.6:
strip-ansi "^4.0.0"
through "^2.3.6"
into-stream@^5.1.0:
version "5.1.1"
resolved "https://registry.yarnpkg.com/into-stream/-/into-stream-5.1.1.tgz#f9a20a348a11f3c13face22763f2d02e127f4db8"
integrity sha512-krrAJ7McQxGGmvaYbB7Q1mcA+cRwg9Ij2RfWIeVesNBgVDZmzY/Fa4IpZUT3bmdRzMzdf/mzltCG2Dq99IZGBA==
dependencies:
from2 "^2.3.0"
p-is-promise "^3.0.0"
invariant@^2.2.2:
version "2.2.4"
resolved "https://registry.yarnpkg.com/invariant/-/invariant-2.2.4.tgz#610f3c92c9359ce1db616e538008d23ff35158e6"
@ -2313,6 +2367,11 @@ mimic-fn@^1.0.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/mimic-fn/-/mimic-fn-1.2.0.tgz#820c86a39334640e99516928bd03fca88057d022"
mimic-response@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.0.0.tgz#996a51c60adf12cb8a87d7fb8ef24c2f3d5ebb46"
integrity sha512-8ilDoEapqA4uQ3TwS0jakGONKXVJqpy+RpM+3b7pLdOjghCrEiGp9SRkFbUHAmZW9vdnrENWHjaweIoTIJExSQ==
minimatch@^3.0.2, minimatch@^3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
@ -2557,7 +2616,7 @@ on-headers@~1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/on-headers/-/on-headers-1.0.1.tgz#928f5d0f470d49342651ea6794b0857c100693f7"
once@^1.3.0:
once@^1.3.0, once@^1.3.1, once@^1.4.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
dependencies:
@ -2608,6 +2667,11 @@ p-finally@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/p-finally/-/p-finally-1.0.0.tgz#3fbcfb15b899a44123b34b6dcc18b724336a2cae"
p-is-promise@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/p-is-promise/-/p-is-promise-3.0.0.tgz#58e78c7dfe2e163cf2a04ff869e7c1dba64a5971"
integrity sha512-Wo8VsW4IRQSKVXsJCn7TomUaVtyfjVDn3nUP7kE967BQk0CwFpdbZs0X0uk5sW9mkBa9eNM7hCMaG93WUAwxYQ==
p-limit@^1.1.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-1.3.0.tgz#b86bd5f0c25690911c7590fcbfc2010d54b3ccb8"
@ -2762,6 +2826,14 @@ psl@^1.1.24:
version "1.1.29"
resolved "https://registry.yarnpkg.com/psl/-/psl-1.1.29.tgz#60f580d360170bb722a797cc704411e6da850c67"
pump@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/pump/-/pump-3.0.0.tgz#b4a2116815bde2f4e1ea602354e8c75565107a64"
integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
dependencies:
end-of-stream "^1.1.0"
once "^1.3.1"
punycode@2.x.x, punycode@^2.1.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
@ -2823,7 +2895,7 @@ read-pkg@^1.0.0:
normalize-package-data "^2.3.2"
path-type "^1.0.0"
readable-stream@^2.0.5, readable-stream@^2.2.2:
readable-stream@^2.0.0, readable-stream@^2.0.5, readable-stream@^2.2.2:
version "2.3.6"
resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz#b11c27d88b8ff1fbe070643cf94b0c79ae1b0aaf"
dependencies:

Просмотреть файл

@ -69,6 +69,7 @@ const config = (env, argv) => ({
submit_v1: './src/app/job/job-submit-v1/job-submit.component.js',
jobList: './src/app/job/job-view/fabric/job-list.jsx',
jobDetail: './src/app/job/job-view/fabric/job-detail.jsx',
jobRetry: './src/app/job/job-view/fabric/job-retry.jsx',
virtualClusters: './src/app/vc/vc.component.js',
services: './src/app/cluster-view/services/services.component.js',
hardware: './src/app/cluster-view/hardware/hardware.component.js',
@ -319,6 +320,10 @@ const config = (env, argv) => ({
filename: 'job-detail.html',
chunks: ['layout', 'jobDetail'],
}),
generateHtml({
filename: 'job-retry.html',
chunks: ['layout', 'jobRetry'],
}),
generateHtml({
filename: 'virtual-clusters.html',
chunks: ['layout', 'virtualClusters'],

Просмотреть файл

@ -63,6 +63,10 @@ spec:
- name: AUTHN_METHOD
value: OIDC
{% endif %}
{%- if cluster_cfg['cluster']['common']['job-history'] == "true" %}
- name: JOB_HISTORY
value: "true"
{%- endif %}
- name: PROM_SCRAPE_TIME
value: {{ cluster_cfg['prometheus']['scrape_interval'] * 10 }}s
- name: WEBPORTAL_PLUGINS

Просмотреть файл

@ -73,8 +73,7 @@ export function getJobDuration(jobInfo) {
}
}
export function getJobDurationString(jobInfo) {
const dur = getJobDuration(jobInfo);
export function getDurationString(dur) {
if (!isNil(dur)) {
if (dur.days > 0) {
return dur.toFormat(`d'd' h'h' m'm' s's'`);

Просмотреть файл

@ -12,6 +12,7 @@ window.ENV = {
logType: '${LOG_TYPE}',
alertManagerUri: '${ALERT_MANAGER_URI}/alert-manager',
launcherType: '${LAUNCHER_TYPE}',
jobHistory: '${JOB_HISTORY}',
};
window.PAI_PLUGINS = [${WEBPORTAL_PLUGINS}][0] || [];

Просмотреть файл

@ -31,7 +31,8 @@ import React, { useCallback, useState } from 'react';
import Card from '../../components/card';
import {
getJobDurationString,
getJobDuration,
getDurationString,
getJobModifiedTimeString,
getHumanizedJobStateString,
isLowGpuUsageJob,
@ -70,9 +71,9 @@ const AbnormalJobList = ({ jobs, style }) => {
onRender(job) {
const { legacy, name, namespace, username } = job;
const href = legacy
? `/job-detail.html?jobName=${name}`
? `/job-detail.html?jobname=${name}`
: `/job-detail.html?username=${namespace ||
username}&jobName=${name}`;
username}&jobname=${name}`;
return <Link href={href}>{name}</Link>;
},
},
@ -126,11 +127,11 @@ const AbnormalJobList = ({ jobs, style }) => {
if (isLongRunJob(job)) {
return (
<div style={{ color: palette.red }}>
{getJobDurationString(job)}
{getDurationString(getJobDuration(job))}
</div>
);
}
return getJobDurationString(job);
return getDurationString(getJobDuration(job));
},
},
{

Просмотреть файл

@ -33,7 +33,8 @@ import React from 'react';
import Card from '../../components/card';
import {
getJobDurationString,
getJobDuration,
getDurationString,
getJobModifiedTimeString,
getHumanizedJobStateString,
getJobModifiedTime,
@ -99,8 +100,8 @@ const jobListColumns = [
onRender(job) {
const { legacy, name, namespace, username } = job;
const href = legacy
? `/job-detail.html?jobName=${name}`
: `/job-detail.html?username=${namespace || username}&jobName=${name}`;
? `/job-detail.html?jobname=${name}`
: `/job-detail.html?username=${namespace || username}&jobname=${name}`;
return <Link href={href}>{name}</Link>;
},
},
@ -123,7 +124,7 @@ const jobListColumns = [
headerClassName: FontClassNames.medium,
isResizable: true,
onRender(job) {
return getJobDurationString(job);
return getDurationString(getJobDuration(job));
},
},
{

Просмотреть файл

@ -201,7 +201,7 @@ export const SubmissionSection = props => {
try {
await populateProtocolWithDataCli(user, protocol, jobData);
await submitJob(protocol.toYaml());
window.location.href = `/job-detail.html?username=${user}&jobName=${protocol.name}`;
window.location.href = `/job-detail.html?username=${user}&jobname=${protocol.name}`;
} catch (err) {
alert(err);
}

Просмотреть файл

@ -22,7 +22,8 @@ import Filter from './Filter';
import Ordering from './Ordering';
import StatusBadge from '../../../../components/status-badge';
import {
getJobDurationString,
getJobDuration,
getDurationString,
isStoppable,
} from '../../../../components/util/job';
import StopJobConfirm from './StopJobConfirm';
@ -107,8 +108,8 @@ export default function Table() {
onRender(job) {
const { legacy, name, namespace, username } = job;
const href = legacy
? `/job-detail.html?jobName=${name}`
: `/job-detail.html?username=${namespace || username}&jobName=${name}`;
? `/job-detail.html?jobname=${name}`
: `/job-detail.html?username=${namespace || username}&jobname=${name}`;
return <Link href={href}>{name}</Link>;
},
});
@ -145,7 +146,7 @@ export default function Table() {
headerClassName: FontClassNames.medium,
isResizable: true,
onRender(job) {
return getJobDurationString(job);
return getDurationString(getJobDuration(job));
},
});
const virtualClusterColumn = applySortProps({

Просмотреть файл

@ -47,16 +47,18 @@ import t from '../../../../../components/tachyons.scss';
import Card from './card';
import Context from './context';
import Timer from './timer';
import { getTensorBoardUrl, getJobMetricsUrl, checkAttemptAPI } from '../conn';
import {
getTensorBoardUrl,
getJobMetricsUrl,
openJobAttemptsPage,
} from '../conn';
import { printDateTime, isJobV2 } from '../util';
printDateTime,
isJobV2,
HISTORY_API_ERROR_MESSAGE,
HISTORY_DISABLE_MESSAGE,
} from '../util';
import MonacoPanel from '../../../../../components/monaco-panel';
import StatusBadge from '../../../../../components/status-badge';
import {
getJobDurationString,
getJobDuration,
getDurationString,
getHumanizedJobStateString,
isStoppable,
} from '../../../../../components/util/job';
@ -92,6 +94,7 @@ export default class Summary extends React.Component {
modalTitle: '',
autoReloadInterval: 10 * 1000,
hideDialog: true,
isRetryHealthy: false,
};
this.onChangeInterval = this.onChangeInterval.bind(this);
@ -101,6 +104,16 @@ export default class Summary extends React.Component {
this.showJobConfig = this.showJobConfig.bind(this);
this.showStopJobConfirm = this.showStopJobConfirm.bind(this);
this.setHideDialog = this.setHideDialog.bind(this);
this.checkRetryHealthy = this.checkRetryHealthy.bind(this);
this.checkRetryLink = this.checkRetryLink.bind(this);
}
async componentDidMount() {
if (await this.checkRetryHealthy()) {
this.setState({ isRetryHealthy: true });
} else {
this.setState({ isRetryHealthy: false });
}
}
onChangeInterval(e, item) {
@ -256,6 +269,17 @@ export default class Summary extends React.Component {
return result;
}
async checkRetryHealthy() {
if (config.launcherType !== 'k8s') {
return false;
}
if (!(await checkAttemptAPI())) {
return false;
}
return true;
}
renderHintMessage() {
const { jobInfo } = this.props;
if (!jobInfo) {
@ -322,12 +346,29 @@ export default class Summary extends React.Component {
}
}
checkRetryLink() {
const { jobInfo } = this.props;
const { isRetryHealthy } = this.state;
if (
config.jobHistory !== 'true' ||
!isRetryHealthy ||
isNil(jobInfo.jobStatus.retries) ||
jobInfo.jobStatus.retries === 0
) {
return false;
} else {
return true;
}
}
render() {
const {
autoReloadInterval,
modalTitle,
monacoProps,
hideDialog,
isRetryHealthy,
} = this.state;
const { className, jobInfo, reloading, onStopJob, onReload } = this.props;
const { rawJobConfig } = this.context;
@ -335,7 +376,7 @@ export default class Summary extends React.Component {
const params = new URLSearchParams(window.location.search);
const namespace = params.get('username');
const jobName = params.get('jobName');
const jobName = params.get('jobname');
return (
<div className={className}>
@ -457,24 +498,23 @@ export default class Summary extends React.Component {
<div className={t.ml4}>
<div className={c(t.gray, FontClassNames.medium)}>Duration</div>
<div className={c(t.mt3, FontClassNames.mediumPlus)}>
{getJobDurationString(jobInfo.jobStatus)}
{getDurationString(getJobDuration(jobInfo.jobStatus))}
</div>
</div>
<div className={t.ml4}>
<div className={c(t.gray, FontClassNames.medium)}>Retries</div>
{config.launcherType === 'k8s' ||
isNil(jobInfo.jobStatus.retries) ? (
<div className={c(t.mt3, FontClassNames.mediumPlus)}>
{jobInfo.jobStatus.retries}
</div>
) : (
{this.checkRetryLink() ? (
<Link
onClick={() => openJobAttemptsPage(jobInfo.jobStatus.retries)}
href={`job-retry.html?username=${namespace}&jobname=${jobName}`}
>
<div className={c(t.mt3, FontClassNames.mediumPlus)}>
{jobInfo.jobStatus.retries}
</div>
</Link>
) : (
<div className={c(t.mt3, FontClassNames.mediumPlus)}>
{jobInfo.jobStatus.retries}
</div>
)}
</div>
</div>
@ -533,6 +573,75 @@ export default class Summary extends React.Component {
>
Go to TensorBoard Page
</Link>
<div className={c(t.bl, t.mh3)}></div>
<div className={c(t.flex)}>
<Link
styles={{ root: [FontClassNames.mediumPlus] }}
href={`job-retry.html?username=${namespace}&jobname=${jobName}`}
disabled={!this.checkRetryLink()}
target='_blank'
>
Go to Retry History Page
</Link>
{config.jobHistory !== 'true' && (
<div className={t.ml2}>
<TooltipHost
calloutProps={{
isBeakVisible: false,
}}
tooltipProps={{
onRenderContent: () => (
<div className={c(t.flex, t.itemsCenter)}>
{HISTORY_DISABLE_MESSAGE}
</div>
),
}}
directionalHint={DirectionalHint.topLeftEdge}
>
<div>
<Icon
iconName='Info'
styles={{
root: [
{ fontSize: IconFontSizes.medium },
ColorClassNames.neutralSecondary,
],
}}
/>
</div>
</TooltipHost>
</div>
)}
{config.jobHistory === 'true' && !isRetryHealthy && (
<div className={t.ml2}>
<TooltipHost
calloutProps={{
isBeakVisible: false,
}}
tooltipProps={{
onRenderContent: () => (
<div className={c(t.flex, t.itemsCenter)}>
{HISTORY_API_ERROR_MESSAGE}
</div>
),
}}
directionalHint={DirectionalHint.topLeftEdge}
>
<div>
<Icon
iconName='Warning'
styles={{
root: [
{ fontSize: IconFontSizes.medium },
ColorClassNames.neutralSecondary,
],
}}
/>
</div>
</TooltipHost>
</div>
)}
</div>
</div>
<div>
<span>

Просмотреть файл

@ -23,8 +23,8 @@ import { checkToken } from '../../../../user/user-auth/user-auth.component';
import config from '../../../../config/webportal.config';
const params = new URLSearchParams(window.location.search);
const namespace = params.get('username');
const jobName = params.get('jobName');
const userName = params.get('username');
const jobName = params.get('jobname');
const absoluteUrlRegExp = /^[a-z][a-z\d+.-]*:/;
export class NotFoundError extends Error {
@ -34,9 +34,52 @@ export class NotFoundError extends Error {
}
}
/**
 * Probe the job-attempts health endpoint of the rest-server for the job
 * identified by the `username` / `jobname` URL parameters.
 *
 * @returns {Promise<boolean>} true only when the endpoint answers with
 *   HTTP 200; false for any other status and also when the request itself
 *   fails (fetch rejects on network-level errors).
 */
export async function checkAttemptAPI() {
  const healthEndpoint = `${config.restServerUri}/api/v2/jobs/${userName}~${jobName}/job-attempts/healthz`;
  try {
    const healthRes = await fetch(healthEndpoint);
    return healthRes.status === 200;
  } catch (err) {
    // An unreachable endpoint is, by definition, not healthy; report that
    // instead of making every caller handle a rejected promise.
    return false;
  }
}
/**
 * Load the earlier attempts (retries) of the current job.
 *
 * The attempts API is health-checked first; the attempt list is requested
 * only when that check passes. The attempt flagged `isLatest` is dropped
 * from the result.
 *
 * @returns {Promise<{isSucceeded: boolean, errorMessage: ?string, jobRetries: ?Array<object>}>}
 *   On success `jobRetries` holds the non-latest attempts and `errorMessage`
 *   is null; on failure `jobRetries` is null and `errorMessage` says why.
 */
export async function fetchJobRetries() {
  // Every unsuccessful outcome has the same shape; only the message differs.
  const failure = errorMessage => ({
    isSucceeded: false,
    errorMessage,
    jobRetries: null,
  });

  const apiHealthy = await checkAttemptAPI();
  if (!apiHealthy) {
    return failure('Attempts API is not working!');
  }

  // NOTE(review): this list endpoint is under /api/v1/ while the health
  // probe in checkAttemptAPI uses /api/v2/ — confirm which version the
  // rest-server actually serves.
  const response = await fetch(
    `${config.restServerUri}/api/v1/jobs/${userName}~${jobName}/job-attempts`,
  );

  switch (response.status) {
    case 200: {
      const attempts = await response.json();
      return {
        isSucceeded: true,
        errorMessage: null,
        jobRetries: attempts.filter(attempt => !attempt.isLatest),
      };
    }
    case 404:
      return failure('Could not find any attempts of this job!');
    default:
      return failure('Some errors occured!');
  }
}
export async function fetchJobInfo() {
const url = namespace
? `${config.restServerUri}/api/v1/jobs/${namespace}~${jobName}`
const url = userName
? `${config.restServerUri}/api/v1/jobs/${userName}~${jobName}`
: `${config.restServerUri}/api/v1/jobs/${jobName}`;
const res = await fetch(url);
const json = await res.json();
@ -48,8 +91,8 @@ export async function fetchJobInfo() {
}
export async function fetchRawJobConfig() {
const url = namespace
? `${config.restServerUri}/api/v1/jobs/${namespace}~${jobName}/config`
const url = userName
? `${config.restServerUri}/api/v1/jobs/${userName}~${jobName}/config`
: `${config.restServerUri}/api/v1/jobs/${jobName}/config`;
const res = await fetch(url);
const text = await res.text();
@ -66,8 +109,8 @@ export async function fetchRawJobConfig() {
}
export async function fetchJobConfig() {
const url = namespace
? `${config.restServerUri}/api/v2/jobs/${namespace}~${jobName}/config`
const url = userName
? `${config.restServerUri}/api/v2/jobs/${userName}~${jobName}/config`
: `${config.restServerUri}/api/v1/jobs/${jobName}/config`;
const res = await fetch(url);
const text = await res.text();
@ -84,8 +127,8 @@ export async function fetchJobConfig() {
}
export async function fetchSshInfo() {
const url = namespace
? `${config.restServerUri}/api/v1/jobs/${namespace}~${jobName}/ssh`
const url = userName
? `${config.restServerUri}/api/v1/jobs/${userName}~${jobName}/ssh`
: `${config.restServerUri}/api/v1/jobs/${jobName}/ssh`;
const res = await fetch(url);
const json = await res.json();
@ -135,13 +178,13 @@ export function getJobMetricsUrl(jobInfo) {
to = jobInfo.jobStatus.completedTime;
}
return `${config.grafanaUri}/dashboard/db/joblevelmetrics?var-job=${
namespace ? `${namespace}~${jobName}` : jobName
userName ? `${userName}~${jobName}` : jobName
}&from=${from}&to=${to}`;
}
export async function stopJob() {
const url = namespace
? `${config.restServerUri}/api/v1/jobs/${namespace}~${jobName}/executionType`
const url = userName
? `${config.restServerUri}/api/v1/jobs/${userName}~${jobName}/executionType`
: `${config.restServerUri}/api/v1/jobs/${jobName}/executionType`;
const token = checkToken();
const res = await fetch(url, {
@ -224,23 +267,3 @@ export async function getContainerLog(logUrl) {
throw new Error(`Log not available`);
}
}
/**
 * Open the YARN web portal with a pre-filled table state.
 *
 * A JSON state object is stored in sessionStorage under the key 'apps'
 * and `config.yarnWebPortalUri` is then opened in a new window. The state
 * carries the job identifier as search text and `retryCount + 1` as its
 * end index.
 * NOTE(review): the key names look like a saved DataTables state that the
 * YARN page reads back — confirm against that page.
 *
 * @param {number} retryCount - retry count of the job.
 */
export function openJobAttemptsPage(retryCount) {
  // Jobs with a namespace are identified as "<namespace>~<jobName>".
  const searchText = namespace ? `${namespace}~${jobName}` : jobName;

  const tableState = {
    iCreate: 1,
    iStart: 0,
    iEnd: retryCount + 1,
    iLength: 20,
    aaSorting: [[0, 'desc', 1]],
    oSearch: {
      bCaseInsensitive: true,
      sSearch: searchText,
      bRegex: false,
      bSmart: true,
    },
    abVisCols: [],
  };

  sessionStorage.setItem('apps', JSON.stringify(tableState));
  window.open(config.yarnWebPortalUri);
}

Просмотреть файл

@ -70,3 +70,8 @@ export function getTaskConfig(rawJobConfig, name) {
}
return null;
}
// Tooltip text shown when the job history feature is switched off.
export const HISTORY_DISABLE_MESSAGE =
  'The job history was not enabled when deploying.';
// Tooltip text shown when job history is enabled but its API health check fails.
export const HISTORY_API_ERROR_MESSAGE =
  'The job history API is not healthy right now.';

Просмотреть файл

@ -0,0 +1,93 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import 'core-js/stable';
import 'regenerator-runtime/runtime';
import 'whatwg-fetch';
import { isNil } from 'lodash';
import {
initializeIcons,
Fabric,
Stack,
getTheme,
} from 'office-ui-fabric-react';
import React, { useEffect, useState } from 'react';
import ReactDOM from 'react-dom';
import Top from './job-retry/top';
import { SpinnerLoading } from '../../../components/loading';
import { JobRetryCard } from './job-retry/job-retry-card';
import { fetchJobRetries } from './job-detail/conn';
// Register the Office UI Fabric icon set at module load.
initializeIcons();
// Theme spacing tokens; `spacing.l1` is used for the page margin.
const { spacing } = getTheme();
const JobRetryPage = () => {
const [loading, setLoading] = useState(true);
const [jobRetries, setJobRetries] = useState(null);
useEffect(() => {
reload(true);
}, []);
const reload = async alertFlag => {
let errorMessage;
try {
const result = await fetchJobRetries();
if (result.isSucceeded) {
setJobRetries(result.jobRetries);
} else {
errorMessage = result.errorMessage;
}
} catch (err) {
errorMessage = `fetch job status failed: ${err.message}`;
}
if (alertFlag === true && !isNil(errorMessage)) {
alert(errorMessage);
}
setLoading(false);
};
return (
<Fabric
style={{
height: '100%',
margin: `${spacing.l1} auto`,
maxWidth: 1200,
}}
>
{loading && <SpinnerLoading />}
{!loading && (
<Stack gap='m'>
<Top />
<Stack gap='l1'>
{jobRetries.map(jobRetry => {
return (
<JobRetryCard key={jobRetry.attemptIndex} jobRetry={jobRetry} />
);
})}
</Stack>
</Stack>
)}
</Fabric>
);
};
// Mount the page into the element with id "content-wrapper".
ReactDOM.render(<JobRetryPage />, document.getElementById('content-wrapper'));
// Mark the job-view sidebar entry as the active one.
document.getElementById('sidebar-menu--job-view').classList.add('active');

Просмотреть файл

@ -0,0 +1,149 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import { FontClassNames, getTheme } from '@uifabric/styling';
import c from 'classnames';
import { capitalize, isNil } from 'lodash';
import { Link } from 'office-ui-fabric-react';
import {
DetailsList,
SelectionMode,
DetailsListLayoutMode,
} from 'office-ui-fabric-react/lib/DetailsList';
import PropTypes from 'prop-types';
import React from 'react';
import t from '../../../../components/tachyons.scss';
import StatusBadge from '../../../../components/status-badge';
const { palette } = getTheme();
export const ContainerList = ({ taskStatuses }) => {
const columns = [
{
key: 'number',
name: 'No.',
headerClassName: FontClassNames.medium,
minWidth: 50,
maxWidth: 50,
isResizable: true,
onRender: (item, idx) => {
return (
!isNil(idx) && <div className={FontClassNames.mediumPlus}>{idx}</div>
);
},
},
{
key: 'name',
name: 'Container ID',
headerClassName: FontClassNames.medium,
minWidth: 100,
maxWidth: 500,
isResizable: true,
onRender: item => {
const id = item.containerId;
return (
!isNil(id) && (
<div className={c(t.truncate, FontClassNames.mediumPlus)}>{id}</div>
)
);
},
},
{
key: 'containerIP',
name: 'Container IP',
headerClassName: FontClassNames.medium,
minWidth: 100,
maxWidth: 100,
isResizable: true,
onRender: (item, idx) => {
return (
<div className={FontClassNames.mediumPlus}>{item.containerIp}</div>
);
},
},
{
key: 'status',
name: 'Status',
headerClassName: FontClassNames.medium,
minWidth: 100,
maxWidth: 100,
isResizable: true,
onRender: item => <StatusBadge status={capitalize(item.taskState)} />,
},
{
key: 'userLog',
name: 'User Log',
headerClassName: FontClassNames.medium,
minWidth: 100,
maxWidth: 100,
onRender: item => {
const logUrl = item.containerLog;
const allLogUrl = `${logUrl}user.pai.all`;
return (
!isNil(logUrl) && (
<Link
styles={{ root: [FontClassNames.mediumPlus] }}
href={allLogUrl}
target='_blank'
>
User Log
</Link>
)
);
},
},
{
key: 'logFolder',
name: 'Log Folder',
headerClassName: FontClassNames.medium,
minWidth: 100,
maxWidth: 100,
onRender: item => {
const logUrl = item.containerLog;
return (
!isNil(logUrl) && (
<Link
styles={{ root: [FontClassNames.mediumPlus] }}
href={logUrl}
target='_blank'
>
Log Folder
</Link>
)
);
},
},
];
return (
<div style={{ backgroundColor: palette.neutralLight }}>
<DetailsList
columns={columns}
disableSelectionZone
items={taskStatuses}
layoutMode={DetailsListLayoutMode.justified}
selectionMode={SelectionMode.none}
/>
</div>
);
};
ContainerList.propTypes = {
taskStatuses: PropTypes.arrayOf(PropTypes.object),
};

Просмотреть файл

@ -0,0 +1,313 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import { FontClassNames, ColorClassNames, getTheme } from '@uifabric/styling';
import c from 'classnames';
import { Stack, IconButton, Link } from 'office-ui-fabric-react';
import PropTypes from 'prop-types';
import React, { useState } from 'react';
import { Interval, DateTime } from 'luxon';
import { capitalize, isNil, get } from 'lodash';
import styled from 'styled-components';
import yaml from 'js-yaml';
import { getDurationString } from '../../../../components/util/job';
import StatusBadge from '../../../../components/status-badge';
import { ContainerList } from './container-list';
import { printDateTime } from '../job-detail/util';
import MonacoPanel from '../../../../components/monaco-panel';
// Theme tokens shared by the styled cards and inline styles in this module.
const { spacing, palette } = getTheme();
/**
 * Format how long a job attempt ran (or has been running so far).
 *
 * @param {object} attempt - attempt record; `attemptStartedTime` and
 *   `attemptCompletedTime` are read as epoch milliseconds.
 * @returns {string} the formatted duration, or 'N/A' when the attempt has no
 *   (truthy) start time. An attempt without a completion time is measured
 *   up to the current UTC time.
 */
function getAttemptDurationString(attempt) {
  const startedAt = attempt.attemptStartedTime
    ? DateTime.fromMillis(attempt.attemptStartedTime)
    : null;
  const finishedAt = attempt.attemptCompletedTime
    ? DateTime.fromMillis(attempt.attemptCompletedTime)
    : DateTime.utc();

  if (!startedAt) {
    return 'N/A';
  }

  const elapsed = Interval.fromDateTimes(startedAt, finishedAt).toDuration([
    'days',
    'hours',
    'minutes',
    'seconds',
  ]);
  return getDurationString(elapsed);
}
// Outer container of one retry attempt: light grey background with a soft shadow.
const RetryCard = styled.div`
background: #f8f8f8;
box-shadow: rgba(0, 0, 0, 0.06) 0px 2px 4px, rgba(0, 0, 0, 0.05) 0px 0.5px 1px;
`;
// Inner container of one task role: white, padded by the theme's `l1` spacing,
// with the same shadow as RetryCard.
const TaskRoleCard = styled.div`
padding: ${spacing.l1};
background: ${palette.white};
box-shadow: rgba(0, 0, 0, 0.06) 0px 2px 4px, rgba(0, 0, 0, 0.05) 0px 0.5px 1px;
`;
const TaskRole = ({ name, taskrole }) => {
const [isExpanded, setIsExpanded] = useState(false);
return (
<TaskRoleCard>
<Stack gap='s1'>
<Stack
horizontal
horizontalAlign='space-between'
verticalAlign='center'
>
<div className={c(FontClassNames.medium)}>
<span style={{ marginRight: spacing.s1 }}>TaslRole Name:</span>
<span>{`${name} (${taskrole.taskStatuses.length})`}</span>
</div>
<div>
{isExpanded ? (
<IconButton
iconProps={{ iconName: 'ChevronUp' }}
onClick={() => setIsExpanded(false)}
/>
) : (
<IconButton
iconProps={{ iconName: 'ChevronDown' }}
onClick={() => setIsExpanded(true)}
/>
)}
</div>
</Stack>
{isExpanded && <ContainerList taskStatuses={taskrole.taskStatuses} />}
</Stack>
</TaskRoleCard>
);
};
TaskRole.propTypes = {
name: PropTypes.string,
taskrole: PropTypes.object,
};
export const JobRetryCard = ({ jobRetry }) => {
const [monacoProps, setMonacoProps] = useState(null);
const [modalTitle, setModalTile] = useState('');
const showEditor = (title, props) => {
setMonacoProps(props);
setModalTile(title);
};
const dismissEditor = () => {
setMonacoProps(null);
setModalTile('');
};
const showExitDiagnostics = () => {
const result = [];
// trigger info
result.push('[Exit Trigger Info]');
result.push('');
result.push(
`ExitTriggerMessage: ${get(jobRetry, 'appExitTriggerMessage')}`,
);
result.push(
`ExitTriggerTaskRole: ${get(jobRetry, 'appExitTriggerTaskRoleName')}`,
);
result.push(
`ExitTriggerTaskIndex: ${get(jobRetry, 'appExitTriggerTaskIndex')}`,
);
const userExitCode = get(
jobRetry,
'appExitMessages.runtime.originalUserExitCode',
);
if (userExitCode) {
// user exit code
result.push(`UserExitCode: ${userExitCode}`);
}
result.push('');
// exit spec
const spec = jobRetry.appExitSpec;
if (spec) {
// divider
result.push(Array.from({ length: 80 }, () => '-').join(''));
result.push('');
// content
result.push('[Exit Spec]');
result.push('');
result.push(yaml.safeDump(spec));
result.push('');
}
// diagnostics
const diag = jobRetry.appExitDiagnostics;
if (diag) {
// divider
result.push(Array.from({ length: 80 }, () => '-').join(''));
result.push('');
// content
result.push('[Exit Diagnostics]');
result.push('');
result.push(diag);
result.push('');
}
showEditor('Exit Diagnostics', {
language: 'text',
value: result.join('\n'),
});
};
return (
<RetryCard>
<Stack gap='l1'>
<Stack horizontal verticalAlign='center' gap='l1' padding='l1 l1 0 l1'>
<div
className={c(FontClassNames.large)}
style={{ marginRight: spacing.l1 }}
>
<span style={{ marginRight: spacing.s1 }}>Retry Index:</span>
<span>{jobRetry.attemptIndex}</span>
</div>
</Stack>
<Stack horizontal verticalAlign='baseline' gap='l1' padding='0 l1 0 l1'>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Status:
</div>
<StatusBadge status={capitalize(jobRetry.state)} />
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Start Time:
</div>
<div className={c(FontClassNames.mediumPlus)}>
{printDateTime(DateTime.fromMillis(jobRetry.attemptStartedTime))}
</div>
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Duration:
</div>
<div className={c(FontClassNames.mediumPlus)}>
{getAttemptDurationString(jobRetry)}
</div>
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Exit Code:
</div>
<div className={c(FontClassNames.mediumPlus)}>
{`${jobRetry.exitCode}`}
</div>
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Exit Phrase:
</div>
<div className={c(FontClassNames.mediumPlus)}>
{`${jobRetry.exitPhrase}`}
</div>
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Exit Type:
</div>
<div className={c(FontClassNames.mediumPlus)}>
{`${jobRetry.exitType}`}
</div>
</div>
<div>
<div
className={c(
FontClassNames.medium,
ColorClassNames.neutralSecondary,
)}
style={{ marginBottom: spacing.s1 }}
>
Exit Diagnostics:
</div>
<Link
styles={{ root: [FontClassNames.mediumPlus] }}
href='#'
onClick={showExitDiagnostics}
>
View Exit Diagnostics
</Link>
</div>
</Stack>
<Stack gap='m'>
{Object.keys(jobRetry.taskRoles).map(name => (
<TaskRole
key={name}
name={name}
taskrole={jobRetry.taskRoles[name]}
/>
))}
</Stack>
</Stack>
<MonacoPanel
isOpen={!isNil(monacoProps)}
onDismiss={dismissEditor}
title={modalTitle}
monacoProps={monacoProps}
/>
</RetryCard>
);
};
JobRetryCard.propTypes = {
jobRetry: PropTypes.object,
};

Просмотреть файл

@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
// to permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import React from 'react';
import { Stack, ActionButton } from 'office-ui-fabric-react';
const params = new URLSearchParams(window.location.search);
const username = params.get('username');
const jobname = params.get('jobname');
const Top = () => (
<Stack>
<div>
<ActionButton
iconProps={{ iconName: 'revToggleKey' }}
href={`job-detail.html?username=${username}&jobname=${jobname}`}
>
Back to Job Detail
</ActionButton>
</div>
</Stack>
);
export default Top;