зеркало из https://github.com/mozilla/snakepit.git
This commit is contained in:
Родитель
b3d0aa0f39
Коммит
8591c7471e
|
@ -1,5 +1,5 @@
|
||||||
config
|
config
|
||||||
|
src/clusterParser.js
|
||||||
db.json
|
db.json
|
||||||
|
|
||||||
# Logs
|
# Logs
|
||||||
|
|
|
@ -4,7 +4,8 @@
|
||||||
"description": "Machine learning job scheduler",
|
"description": "Machine learning job scheduler",
|
||||||
"main": "app.js",
|
"main": "app.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node src/service.js"
|
"start": "node src/service.js",
|
||||||
|
"postinstall": "node node_modules/pegjs/bin/pegjs src/clusterParser.pegjs"
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
@ -33,11 +34,11 @@
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"bcrypt": "^1.0.3",
|
"bcrypt": "^1.0.3",
|
||||||
"body-parser": "^1.18.2",
|
"body-parser": "^1.18.2",
|
||||||
"combined-stream": "^1.0.6",
|
|
||||||
"commander": "^2.14.1",
|
"commander": "^2.14.1",
|
||||||
"express": "^4.16.2",
|
"express": "^4.16.2",
|
||||||
"jsonwebtoken": "^8.1.1",
|
"jsonwebtoken": "^8.1.1",
|
||||||
"morgan": "^1.9.0",
|
"morgan": "^1.9.0",
|
||||||
|
"pegjs": "^0.10.0",
|
||||||
"process": "^0.11.10",
|
"process": "^0.11.10",
|
||||||
"readable-stream": "^2.3.5",
|
"readable-stream": "^2.3.5",
|
||||||
"request": "^2.83.0"
|
"request": "^2.83.0"
|
||||||
|
|
|
@ -17,10 +17,11 @@ exports.initApp = function(app) {
|
||||||
|
|
||||||
app.put('/aliases/:id', function(req, res) {
|
app.put('/aliases/:id', function(req, res) {
|
||||||
if (req.user.admin) {
|
if (req.user.admin) {
|
||||||
if (req.body && req.body.model) {
|
console.log(req.body)
|
||||||
|
if (req.body && req.body.name) {
|
||||||
db.aliases[req.params.id] = {
|
db.aliases[req.params.id] = {
|
||||||
id: req.params.id,
|
id: req.params.id,
|
||||||
model: req.body.model
|
name: req.body.name
|
||||||
}
|
}
|
||||||
res.status(200).send()
|
res.status(200).send()
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
start
|
||||||
|
= cluster
|
||||||
|
|
||||||
|
cluster
|
||||||
|
= left:processGroup "," right:cluster { return left.concat(right); }
|
||||||
|
/ solo:processGroup { return [solo]; }
|
||||||
|
|
||||||
|
processGroup
|
||||||
|
= left:integer ":" right:process { return { count: left, process: right }; }
|
||||||
|
/ solo:process { return { count: 1, process: solo } }
|
||||||
|
|
||||||
|
process
|
||||||
|
= "[" solo:resourceList "]" { return solo }
|
||||||
|
|
||||||
|
resourceList
|
||||||
|
= left:resourceGroup "," right:resourceList { return left.concat(right); }
|
||||||
|
/ solo:resourceGroup { return [solo]; }
|
||||||
|
|
||||||
|
resourceGroup
|
||||||
|
= left:integer ":" right:resource { return { count: left, name: right }; }
|
||||||
|
/ solo:resource { return { count: 1, name: solo } }
|
||||||
|
|
||||||
|
resource
|
||||||
|
= chars:[a-zA-Z]+[a-zA-Z0-9]* { return chars.join(""); }
|
||||||
|
|
||||||
|
integer
|
||||||
|
= digits:[0-9]+ { return parseInt(digits.join(""), 10); }
|
197
src/jobs.js
197
src/jobs.js
|
@ -1,5 +1,6 @@
|
||||||
const store = require('./store.js')
|
const store = require('./store.js')
|
||||||
const node = require('./nodes.js')
|
const nodes = require('./nodes.js')
|
||||||
|
const parseClusterRequest = require('./clusterParser.js').parse
|
||||||
|
|
||||||
var exports = module.exports = {}
|
var exports = module.exports = {}
|
||||||
var db = store.root
|
var db = store.root
|
||||||
|
@ -17,62 +18,128 @@ exports.initDb = function() {
|
||||||
}
|
}
|
||||||
|
|
||||||
function _getRunningJobs() {
|
function _getRunningJobs() {
|
||||||
var jobs = []
|
var jobs = {}
|
||||||
Object.keys(db.nodes).forEach(id => {
|
Object.keys(db.nodes).forEach(nodeId => {
|
||||||
let node = db.nodes[id]
|
let node = db.nodes[nodeId]
|
||||||
if (node.state >= nodes.STATE_ACTIVE) {
|
if (node.state >= nodes.STATE_ONLINE) {
|
||||||
let gpuCounter = numGpus
|
Object.keys(node.resources).forEach(resourceType => {
|
||||||
gpuReservation = []
|
let resource = node.resources[resourceType]
|
||||||
for(let gpu = 0; gpu < node.gpus.length; gpu++) {
|
if (resource.job) {
|
||||||
if (node.gpus[gpu].job == 0 || state == 0) {
|
jobs[resource.job] = db.jobs[resource.job]
|
||||||
gpuReservation.push(gpu)
|
|
||||||
gpuCounter--
|
|
||||||
if (gpuCounter == 0) {
|
|
||||||
reservation.push({ node: id, gpuReservation: gpuReservation })
|
|
||||||
nodeCounter--
|
|
||||||
if (nodeCounter == 0) return reservation
|
|
||||||
gpuCounter = numGpus
|
|
||||||
gpuReservation = []
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
})
|
||||||
gpuCounter = numGpus
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
return jobs
|
return jobs
|
||||||
}
|
}
|
||||||
|
|
||||||
function _reserve(numNodes, numGpus, state) {
|
function _getJobProcesses(job) {
|
||||||
let reservation = []
|
var processes = {}
|
||||||
let nodeCounter = numNodes
|
Object.keys(db.nodes).forEach(nodeId => {
|
||||||
Object.keys(db.nodes).forEach(id => {
|
let node = db.nodes[nodeId]
|
||||||
let node = db.nodes[id]
|
if (node.state >= nodes.STATE_ONLINE) {
|
||||||
if (node.state >= state) {
|
let nodeProcesses = {}
|
||||||
let gpuCounter = numGpus
|
Object.keys(node.resources).forEach(resourceType => {
|
||||||
gpuReservation = []
|
let resource = node.resources[resourceType]
|
||||||
for(let gpu = 0; gpu < node.gpus.length; gpu++) {
|
if (resource.job == job.id && resource.pid) {
|
||||||
if (node.gpus[gpu].job == 0 || state == 0) {
|
nodeProcesses[resource.pid] = true
|
||||||
gpuReservation.push(gpu)
|
}
|
||||||
gpuCounter--
|
})
|
||||||
if (gpuCounter == 0) {
|
Object.keys(nodeProcesses).forEach(pid => {
|
||||||
reservation.push({ node: id, gpuReservation: gpuReservation })
|
let pids = processes[nodeId] = processes[nodeId] || []
|
||||||
nodeCounter--
|
pids.push(pid)
|
||||||
if (nodeCounter == 0) return reservation
|
})
|
||||||
gpuCounter = numGpus
|
}
|
||||||
gpuReservation = []
|
})
|
||||||
|
return processes
|
||||||
|
}
|
||||||
|
|
||||||
|
function _mergeReservation(target, source) {
|
||||||
|
Object.keys(source).forEach(key => {
|
||||||
|
if (!target[key]) {
|
||||||
|
target[key] = source[key]
|
||||||
|
} else if (typeof target[key] === 'object') {
|
||||||
|
_mergeReservation(target[key], source[key])
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
function _reserve(reservation, nodeId, resourceType, resourceIndex) {
|
||||||
|
let node = reservation[nodeId] = reservation[nodeId] || {}
|
||||||
|
let resource = node[resourceType] = node[resourceType] || {}
|
||||||
|
resource[resourceIndex] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
function _isReserved(reservation, nodeId, resourceType, resourceIndex) {
|
||||||
|
return reservation[nodeId] && reservation[nodeId][resourceType] && reservation[nodeId][resourceType][resourceIndex]
|
||||||
|
}
|
||||||
|
|
||||||
|
function _reserveProcessOnNode(node, reservation, resourceList) {
|
||||||
|
var nodeReservation = {}
|
||||||
|
if (!node || !node.resources) {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
for (let resource of resourceList) {
|
||||||
|
let resourceCounter = resource.count
|
||||||
|
let name = db.aliases[resource.name] ? db.aliases[resource.name].name : resource.name
|
||||||
|
Object.keys(node.resources).forEach(resourceType => {
|
||||||
|
if (resourceCounter > 0) {
|
||||||
|
let nodeResources = node.resources[resourceType]
|
||||||
|
for(let resourceIndex = 0; resourceIndex < nodeResources.length && resourceCounter > 0; resourceIndex++) {
|
||||||
|
let nodeResource = nodeResources[resourceIndex]
|
||||||
|
if (nodeResource.name == name &&
|
||||||
|
!_isReserved(reservation, node.id, resourceType, resourceIndex) &&
|
||||||
|
(!nodeResource.job || state == 0)) {
|
||||||
|
_reserve(nodeReservation, node.id, resourceType, resourceIndex)
|
||||||
|
resourceCounter--
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
gpuCounter = numGpus
|
})
|
||||||
|
}
|
||||||
|
return nodeReservation
|
||||||
|
}
|
||||||
|
|
||||||
|
function _reserveProcess(reservation, resourceList, state) {
|
||||||
|
Object.keys(db.nodes).forEach(nodeId => {
|
||||||
|
let node = db.nodes[nodeId]
|
||||||
|
if (node.state >= state) {
|
||||||
|
let nodeReservation = _reserveProcessOnNode(node, reservation, resourceList)
|
||||||
|
if (nodeReservation) {
|
||||||
|
return nodeReservation
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
return false
|
return null
|
||||||
|
}
|
||||||
|
|
||||||
|
function _reserveCluster(clusterRequest, state) {
|
||||||
|
let reservation = {}
|
||||||
|
clusterRequest.forEach(processRequest => {
|
||||||
|
for(let i=0; i<processRequest.count; i++) {
|
||||||
|
let processReservation = _reserveProcess(reservation, processRequest.process, state)
|
||||||
|
if (processReservation) {
|
||||||
|
_mergeReservation(reservation, processReservation)
|
||||||
|
} else {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
return reservation
|
||||||
}
|
}
|
||||||
|
|
||||||
function _allocate(reservation, jobNumber) {
|
function _allocate(reservation, jobNumber) {
|
||||||
reservation.forEach(instanceReservation => {
|
Object.keys(reservation).forEach(nodeId => {
|
||||||
var node = db.nodes[instanceReservation.node]
|
let node = db.nodes[nodeId]
|
||||||
instanceReservation.gpuReservation.forEach(reservedGpu => node.gpus[reservedGpu].job = jobNumber)
|
Object.keys(reservation[nodeId]).forEach(resourceType => {
|
||||||
|
let resources = node[resourceType]
|
||||||
|
Object.keys(reservation[nodeId][resourceType]).forEach(resourceIndex => {
|
||||||
|
resources[resourceIndex].job = jobNumber
|
||||||
|
if (jobNumber == 0) {
|
||||||
|
resources[resourceIndex].pid = 0
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -80,6 +147,14 @@ function _deallocate(reservation) {
|
||||||
_allocate(reservation, 0)
|
_allocate(reservation, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function _startJob(job) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function _stopJob(job) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
exports.initApp = function(app) {
|
exports.initApp = function(app) {
|
||||||
app.get('/jobs/:state', function(req, res) {
|
app.get('/jobs/:state', function(req, res) {
|
||||||
res.status(200).send()
|
res.status(200).send()
|
||||||
|
@ -87,18 +162,25 @@ exports.initApp = function(app) {
|
||||||
|
|
||||||
app.post('/jobs', function(req, res) {
|
app.post('/jobs', function(req, res) {
|
||||||
store.lockAutoRelease('jobs', function() {
|
store.lockAutoRelease('jobs', function() {
|
||||||
var id = db.jobIdCounter++
|
let id = db.jobIdCounter++
|
||||||
var job = req.body
|
let job = req.body
|
||||||
var allocation = _getAllocation(job.numNodes, job.numGpus, _getEmptyClusterAllocation())
|
var clusterRequest
|
||||||
if (allocation) {
|
try {
|
||||||
|
clusterRequest = parseClusterRequest(job.clusterRequest)
|
||||||
|
} catch (ex) {
|
||||||
|
console.log(ex)
|
||||||
|
res.status(400).send({ message: 'Problem parsing allocation' })
|
||||||
|
return
|
||||||
|
}
|
||||||
|
let reservation = _reserveCluster(clusterRequest, nodes.STATE_UNKNOWN)
|
||||||
|
if (reservation) {
|
||||||
db.jobs[id] = {
|
db.jobs[id] = {
|
||||||
id: id,
|
id: id,
|
||||||
|
user: req.user.id,
|
||||||
origin: job.origin,
|
origin: job.origin,
|
||||||
hash: job.hash,
|
hash: job.hash,
|
||||||
diff: job.diff,
|
diff: job.diff,
|
||||||
description: job.description || (req.user.id + ' - ' + new Date().toISOString()),
|
description: job.description || (req.user.id + ' - ' + new Date().toISOString()),
|
||||||
numNodes: job.numNodes,
|
|
||||||
numGpus: job.numGpus
|
|
||||||
}
|
}
|
||||||
db.schedule.push(id)
|
db.schedule.push(id)
|
||||||
res.status(200).send({ id: id })
|
res.status(200).send({ id: id })
|
||||||
|
@ -117,7 +199,22 @@ exports.initApp = function(app) {
|
||||||
})
|
})
|
||||||
|
|
||||||
app.delete('/jobs/:id', function(req, res) {
|
app.delete('/jobs/:id', function(req, res) {
|
||||||
res.status(200).send()
|
var id = Number(req.params.id)
|
||||||
|
var dbjob = db.jobs[id]
|
||||||
|
if (dbjob) {
|
||||||
|
if (req.user.id == dbjob.id || req.user.admin) {
|
||||||
|
delete db.jobs[id]
|
||||||
|
let scheduleIndex = db.schedule.indexOf(id)
|
||||||
|
if (scheduleIndex >= 0) {
|
||||||
|
db.schedule.splice(scheduleIndex, 1)
|
||||||
|
}
|
||||||
|
res.status(200).send()
|
||||||
|
} else {
|
||||||
|
res.status(403).send()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
res.status(404).send()
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче