This commit is contained in:
Tilman Kamp 2018-03-14 18:43:43 +01:00
Родитель b3d0aa0f39
Коммит 8591c7471e
5 изменённых файлов: 181 добавлений и 55 удалений

2
.gitignore поставляемый
Просмотреть файл

@ -1,5 +1,5 @@
config config
src/clusterParser.js
db.json db.json
# Logs # Logs

Просмотреть файл

@ -4,7 +4,8 @@
"description": "Machine learning job scheduler", "description": "Machine learning job scheduler",
"main": "app.js", "main": "app.js",
"scripts": { "scripts": {
"start": "node src/service.js" "start": "node src/service.js",
"postinstall": "node node_modules/pegjs/bin/pegjs src/clusterParser.pegjs"
}, },
"repository": { "repository": {
"type": "git", "type": "git",
@ -33,11 +34,11 @@
"dependencies": { "dependencies": {
"bcrypt": "^1.0.3", "bcrypt": "^1.0.3",
"body-parser": "^1.18.2", "body-parser": "^1.18.2",
"combined-stream": "^1.0.6",
"commander": "^2.14.1", "commander": "^2.14.1",
"express": "^4.16.2", "express": "^4.16.2",
"jsonwebtoken": "^8.1.1", "jsonwebtoken": "^8.1.1",
"morgan": "^1.9.0", "morgan": "^1.9.0",
"pegjs": "^0.10.0",
"process": "^0.11.10", "process": "^0.11.10",
"readable-stream": "^2.3.5", "readable-stream": "^2.3.5",
"request": "^2.83.0" "request": "^2.83.0"

Просмотреть файл

@ -17,10 +17,11 @@ exports.initApp = function(app) {
app.put('/aliases/:id', function(req, res) { app.put('/aliases/:id', function(req, res) {
if (req.user.admin) { if (req.user.admin) {
if (req.body && req.body.model) { console.log(req.body)
if (req.body && req.body.name) {
db.aliases[req.params.id] = { db.aliases[req.params.id] = {
id: req.params.id, id: req.params.id,
model: req.body.model name: req.body.name
} }
res.status(200).send() res.status(200).send()
} else { } else {

27
src/clusterParser.pegjs Normal file
Просмотреть файл

@ -0,0 +1,27 @@
// Grammar for cluster requests, e.g. "2:[2:cuda,cpu],[cpu]".
// Result: an array of { count, process } objects, where `process` is an
// array of { count, name } resource requirements.
start
  = cluster

// Comma-separated list of process groups.
// `left` is a single object, so it has to be wrapped before concatenation.
cluster
  = left:processGroup "," right:cluster { return [left].concat(right); }
  / solo:processGroup { return [solo]; }

// Optional "<count>:" multiplier in front of a process; defaults to 1.
processGroup
  = left:integer ":" right:process { return { count: left, process: right }; }
  / solo:process { return { count: 1, process: solo }; }

// A process is a bracketed list of resources.
process
  = "[" solo:resourceList "]" { return solo; }

// Comma-separated list of resource groups.
// `left` is a single object, so it has to be wrapped before concatenation.
resourceList
  = left:resourceGroup "," right:resourceList { return [left].concat(right); }
  / solo:resourceGroup { return [solo]; }

// Optional "<count>:" multiplier in front of a resource name; defaults to 1.
resourceGroup
  = left:integer ":" right:resource { return { count: left, name: right }; }
  / solo:resource { return { count: 1, name: solo }; }

// Resource name: starts with a letter, followed by letters or digits.
// `$` returns the whole matched text, so trailing digits are kept
// (a label on `[a-zA-Z]+` alone would drop everything after the letters).
resource
  = $([a-zA-Z]+ [a-zA-Z0-9]*)

// Non-negative decimal integer.
integer
  = digits:[0-9]+ { return parseInt(digits.join(""), 10); }

Просмотреть файл

@ -1,5 +1,6 @@
const store = require('./store.js') const store = require('./store.js')
const node = require('./nodes.js') const nodes = require('./nodes.js')
const parseClusterRequest = require('./clusterParser.js').parse
var exports = module.exports = {} var exports = module.exports = {}
var db = store.root var db = store.root
@ -17,62 +18,128 @@ exports.initDb = function() {
} }
/**
 * Collects the jobs that are referenced by resources of nodes whose state is
 * at least `nodes.STATE_ONLINE`.
 *
 * The reservation code in this file indexes `node.resources[type]` as a list
 * of resource entries, so each type is scanned entry by entry here as well.
 * `[].concat(...)` also accepts a single resource object per type.
 *
 * @returns {Object} map of job id -> entry of `db.jobs`
 */
function _getRunningJobs() {
    const jobs = {}
    for (const nodeId of Object.keys(db.nodes)) {
        const node = db.nodes[nodeId]
        if (node.state >= nodes.STATE_ONLINE) {
            for (const resourceType of Object.keys(node.resources || {})) {
                for (const resource of [].concat(node.resources[resourceType])) {
                    if (resource && resource.job) {
                        jobs[resource.job] = db.jobs[resource.job]
                    }
                }
            }
        }
    }
    return jobs
}
/**
 * Lists the process ids that a job occupies, grouped by node.
 *
 * Only nodes whose state is at least `nodes.STATE_ONLINE` are inspected.
 * The reservation code in this file indexes `node.resources[type]` as a list
 * of resource entries, so each type is scanned entry by entry here as well.
 * `[].concat(...)` also accepts a single resource object per type.
 *
 * @param {Object} job - job whose `id` is compared against each resource's `job`
 * @returns {Object} map of node id -> array of pids; the pids come back as
 *     strings because they are de-duplicated through object keys
 */
function _getJobProcesses(job) {
    const processes = {}
    for (const nodeId of Object.keys(db.nodes)) {
        const node = db.nodes[nodeId]
        if (node.state >= nodes.STATE_ONLINE) {
            // Object keys serve as a set: several resources may share one pid.
            const nodeProcesses = {}
            for (const resourceType of Object.keys(node.resources || {})) {
                for (const resource of [].concat(node.resources[resourceType])) {
                    if (resource && resource.job == job.id && resource.pid) {
                        nodeProcesses[resource.pid] = true
                    }
                }
            }
            const pids = Object.keys(nodeProcesses)
            if (pids.length > 0) {
                processes[nodeId] = (processes[nodeId] || []).concat(pids)
            }
        }
    }
    return processes
}
/**
 * Recursively folds `source` into `target`, in place.
 *
 * Keys that are missing (or falsy) in `target` are taken over from `source`
 * by reference. Keys whose value in `target` is an object are merged
 * recursively. Any other existing value in `target` is left untouched.
 *
 * @param {Object} target - reservation that receives the entries
 * @param {Object} source - reservation whose entries are added
 */
function _mergeReservation(target, source) {
    for (const key of Object.keys(source)) {
        const existing = target[key]
        if (!existing) {
            target[key] = source[key]
            continue
        }
        if (typeof existing === 'object') {
            _mergeReservation(existing, source[key])
        }
    }
}
/**
 * Marks one resource as reserved inside a reservation object.
 *
 * The reservation is a nested map: node id -> resource type -> resource
 * index -> true. Missing levels are created on demand.
 *
 * @param {Object} reservation - reservation to mutate
 * @param {string} nodeId - id of the node that owns the resource
 * @param {string} resourceType - key of the resource group on the node
 * @param {number|string} resourceIndex - index of the resource in its group
 */
function _reserve(reservation, nodeId, resourceType, resourceIndex) {
    const nodeEntry = reservation[nodeId] || {}
    reservation[nodeId] = nodeEntry
    const typeEntry = nodeEntry[resourceType] || {}
    nodeEntry[resourceType] = typeEntry
    typeEntry[resourceIndex] = true
}
/**
 * Looks up whether a resource is marked in a reservation object.
 *
 * Walks node id -> resource type -> resource index and returns the first
 * falsy value met on the way, or the stored marker at the end. The result is
 * therefore truthy/falsy rather than a strict boolean.
 *
 * @param {Object} reservation - reservation to inspect
 * @param {string} nodeId - id of the node that owns the resource
 * @param {string} resourceType - key of the resource group on the node
 * @param {number|string} resourceIndex - index of the resource in its group
 * @returns {*} truthy when the resource is reserved, falsy otherwise
 */
function _isReserved(reservation, nodeId, resourceType, resourceIndex) {
    const nodeEntry = reservation[nodeId]
    if (!nodeEntry) {
        return nodeEntry
    }
    const typeEntry = nodeEntry[resourceType]
    if (!typeEntry) {
        return typeEntry
    }
    return typeEntry[resourceIndex]
}
/**
 * Tries to reserve every resource one process needs on a single node.
 *
 * Requested resource names are first translated through `db.aliases`.
 * A node resource qualifies when its name matches, it is not yet marked in
 * `reservation` (earlier processes) nor in this process's own reservation,
 * and it is either not bound to a job or `state == 0` was requested.
 *
 * @param {Object} node - node entry with `id` and `resources`
 *     (resource type -> list of { name, job, ... })
 * @param {Object} reservation - resources already reserved for other processes
 * @param {Array<{count: number, name: string}>} resourceList - requirements
 * @param {number} [state] - when 0, resources occupied by a job still count
 *     as available (presumably a pure capacity check — confirm with callers);
 *     when omitted, only unoccupied resources are used
 * @returns {Object|null} reservation for this process only (node id ->
 *     resource type -> resource index -> true), or null if the node is
 *     unusable or cannot satisfy the complete list
 */
function _reserveProcessOnNode(node, reservation, resourceList, state) {
    if (!node || !node.resources) {
        return null
    }
    const nodeReservation = {}
    for (const resource of resourceList) {
        let resourceCounter = resource.count
        const alias = db.aliases[resource.name]
        const name = alias ? alias.name : resource.name
        for (const resourceType of Object.keys(node.resources)) {
            const nodeResources = node.resources[resourceType]
            for (let resourceIndex = 0; resourceIndex < nodeResources.length && resourceCounter > 0; resourceIndex++) {
                const nodeResource = nodeResources[resourceIndex]
                if (nodeResource.name == name &&
                    !_isReserved(reservation, node.id, resourceType, resourceIndex) &&
                    // Also skip what this very process already took, so two
                    // list entries with the same name get distinct resources.
                    !_isReserved(nodeReservation, node.id, resourceType, resourceIndex) &&
                    (!nodeResource.job || state == 0)) {
                    _reserve(nodeReservation, node.id, resourceType, resourceIndex)
                    resourceCounter--
                }
            }
        }
        if (resourceCounter > 0) {
            // A partial match is useless: the process needs all its resources.
            return null
        }
    }
    return nodeReservation
}
/**
 * Finds the first node that can host one process and returns the
 * reservation for it.
 *
 * Nodes are tried in `Object.keys(db.nodes)` order; nodes whose state is
 * below `state` are skipped. A plain loop is used because a `return` inside
 * a `forEach` callback would not leave this function.
 *
 * @param {Object} reservation - resources already reserved for other processes
 * @param {Array<{count: number, name: string}>} resourceList - requirements
 * @param {number} state - minimum node state; also forwarded to
 *     `_reserveProcessOnNode`
 * @returns {Object|null} reservation of the process, or null if no node fits
 */
function _reserveProcess(reservation, resourceList, state) {
    for (const nodeId of Object.keys(db.nodes)) {
        const node = db.nodes[nodeId]
        if (node.state >= state) {
            const nodeReservation = _reserveProcessOnNode(node, reservation, resourceList, state)
            if (nodeReservation) {
                return nodeReservation
            }
        }
    }
    return null
}
/**
 * Reserves resources for a whole cluster request.
 *
 * Each entry of `clusterRequest` asks for `count` processes with the same
 * resource list. Every process is reserved on top of what was reserved
 * before, and the first failure aborts the whole request. Plain loops are
 * used because a `return` inside a `forEach` callback would not leave this
 * function.
 *
 * @param {Array<{count: number, process: Array}>} clusterRequest - parsed request
 * @param {number} state - forwarded to `_reserveProcess`
 * @returns {Object|null} combined reservation (node id -> resource type ->
 *     resource index -> true), or null if any process cannot be placed
 */
function _reserveCluster(clusterRequest, state) {
    const reservation = {}
    for (const processRequest of clusterRequest) {
        for (let i = 0; i < processRequest.count; i++) {
            const processReservation = _reserveProcess(reservation, processRequest.process, state)
            if (!processReservation) {
                return null
            }
            _mergeReservation(reservation, processReservation)
        }
    }
    return reservation
}
/**
 * Binds every resource named in a reservation to a job.
 *
 * The reservation maps node id -> resource type -> resource index. The
 * matching entries live under `node.resources[resourceType]`, which is the
 * same place the reservation code reads them from.
 *
 * @param {Object} reservation - node id -> resource type -> resource index -> true
 * @param {number} jobNumber - job id to store on each resource; 0 releases
 *     the resource and also resets its `pid` to 0
 */
function _allocate(reservation, jobNumber) {
    for (const nodeId of Object.keys(reservation)) {
        const node = db.nodes[nodeId]
        for (const resourceType of Object.keys(reservation[nodeId])) {
            const resources = node.resources[resourceType]
            for (const resourceIndex of Object.keys(reservation[nodeId][resourceType])) {
                const resource = resources[resourceIndex]
                resource.job = jobNumber
                if (jobNumber == 0) {
                    resource.pid = 0
                }
            }
        }
    }
}
@ -80,6 +147,14 @@ function _deallocate(reservation) {
_allocate(reservation, 0) _allocate(reservation, 0)
} }
// Placeholder: the body is empty, so calling this currently does nothing
// with `job`.
function _startJob(job) {
}
// Placeholder: the body is empty, so calling this currently does nothing
// with `job`.
function _stopJob(job) {
}
exports.initApp = function(app) { exports.initApp = function(app) {
app.get('/jobs/:state', function(req, res) { app.get('/jobs/:state', function(req, res) {
res.status(200).send() res.status(200).send()
@ -87,18 +162,25 @@ exports.initApp = function(app) {
app.post('/jobs', function(req, res) { app.post('/jobs', function(req, res) {
store.lockAutoRelease('jobs', function() { store.lockAutoRelease('jobs', function() {
var id = db.jobIdCounter++ let id = db.jobIdCounter++
var job = req.body let job = req.body
var allocation = _getAllocation(job.numNodes, job.numGpus, _getEmptyClusterAllocation()) var clusterRequest
if (allocation) { try {
clusterRequest = parseClusterRequest(job.clusterRequest)
} catch (ex) {
console.log(ex)
res.status(400).send({ message: 'Problem parsing allocation' })
return
}
let reservation = _reserveCluster(clusterRequest, nodes.STATE_UNKNOWN)
if (reservation) {
db.jobs[id] = { db.jobs[id] = {
id: id, id: id,
user: req.user.id,
origin: job.origin, origin: job.origin,
hash: job.hash, hash: job.hash,
diff: job.diff, diff: job.diff,
description: job.description || (req.user.id + ' - ' + new Date().toISOString()), description: job.description || (req.user.id + ' - ' + new Date().toISOString()),
numNodes: job.numNodes,
numGpus: job.numGpus
} }
db.schedule.push(id) db.schedule.push(id)
res.status(200).send({ id: id }) res.status(200).send({ id: id })
@ -117,7 +199,22 @@ exports.initApp = function(app) {
}) })
// DELETE /jobs/:id removes a job and its entry in the schedule.
// Responds 200 on success, 403 if the caller is neither the job's owner nor
// an admin, and 404 if no such job exists.
app.delete('/jobs/:id', function(req, res) {
    var id = Number(req.params.id)
    var dbjob = db.jobs[id]
    if (dbjob) {
        // Jobs record their owner in `user`; `dbjob.id` is the job's own id
        // and must not be compared with a user id.
        if (req.user.id == dbjob.user || req.user.admin) {
            delete db.jobs[id]
            let scheduleIndex = db.schedule.indexOf(id)
            if (scheduleIndex >= 0) {
                db.schedule.splice(scheduleIndex, 1)
            }
            res.status(200).send()
        } else {
            res.status(403).send()
        }
    } else {
        res.status(404).send()
    }
})
} }