[medium] add "idle" status to agents that stop sending heartbeats

This commit is contained in:
Julien Vehent 2014-12-19 12:51:15 -05:00
Родитель ddbf9745ee
Коммит bd6d224822
5 изменённых файлов: 49 добавлений и 13 удалений

Просмотреть файл

@ -362,7 +362,7 @@ GET <root>/search
Status depends on the type. Below are the available statuses per type:
- `action`: init, preparing, invalid, inflight, completed
- `agent`: online, upgraded, destroyed, offline
- `agent`: online, upgraded, destroyed, offline, idle
- `command`: prepared, sent, success, timeout, cancelled, expired, failed
- `investigator`: active, disabled

Просмотреть файл

@ -391,7 +391,7 @@ http://localhost:1664/api/v1/investigator/create/</code></pre>
<blockquote>
<ul>
<li><cite>action</cite>: init, preparing, invalid, inflight, completed</li>
<li><cite>agent</cite>: online, upgraded, destroyed, offline</li>
<li><cite>agent</cite>: online, upgraded, destroyed, offline, idle</li>
<li><cite>command</cite>: prepared, sent, success, timeout, cancelled, expired, failed</li>
<li><cite>investigator</cite>: active, disabled</li>
</ul>

Просмотреть файл

@ -12,6 +12,7 @@ const (
AgtStatusUpgraded string = "upgraded"
AgtStatusDestroyed string = "destroyed"
AgtStatusOffline string = "offline"
AgtStatusIdle string = "idle"
)
// Agent stores the description of an agent and serves as a canvas

Просмотреть файл

@ -171,11 +171,11 @@ func (db *DB) ActiveAgentsByTarget(target string) (agents []mig.Agent, err error
_ = txn.Rollback()
return
}
rows, err := txn.Query(`SELECT DISTINCT ON (queueloc) id, name, queueloc, os, version, pid,
rows, err := txn.Query(fmt.Sprintf(`SELECT DISTINCT ON (queueloc) id, name, queueloc, os, version, pid,
starttime, destructiontime, heartbeattime, status
FROM agents
WHERE agents.status = 'online' AND (` + target + `)
ORDER BY agents.queueloc, agents.heartbeattime DESC`)
WHERE agents.status IN ('%s', '%s') AND (%s)
ORDER BY agents.queueloc, agents.heartbeattime DESC`, mig.AgtStatusOnline, mig.AgtStatusIdle, target))
if err != nil {
_ = txn.Rollback()
err = fmt.Errorf("Error while finding agents: '%v'", err)
@ -312,13 +312,24 @@ func (db *DB) CountDisappearedAgents(seenSince, activeSince time.Time) (sum floa
return
}
// MarkOfflineAgents updates the status of agents that have not sent a heartbeat since pointInTime
// MarkOfflineAgents updates the status of idle agents that have not sent a heartbeat since pointInTime
func (db *DB) MarkOfflineAgents(pointInTime time.Time) (err error) {
_, err = db.c.Exec(`UPDATE agents SET status=$1
WHERE heartbeattime<$2 AND status!=$3`,
mig.AgtStatusOffline, pointInTime, mig.AgtStatusOffline)
WHERE heartbeattime<$2 AND status=$3`,
mig.AgtStatusOffline, pointInTime, mig.AgtStatusIdle)
if err != nil {
return fmt.Errorf("Failed to mark agents as offline in database: '%v'", err)
}
return
}
// MarkIdleAgents flags stale online agents as idle.
//
// Every row of the agents table whose status equals mig.AgtStatusOnline and
// whose heartbeattime is strictly earlier than pointInTime has its status set
// to mig.AgtStatusIdle. Rows in any other status are left untouched.
//
// A non-nil error is returned when the UPDATE statement fails; the underlying
// database error is embedded in the message.
func (db *DB) MarkIdleAgents(pointInTime time.Time) (err error) {
	// $1 = new status, $2 = heartbeat cutoff, $3 = status the agent must currently have.
	const stmt = `UPDATE agents SET status=$1
WHERE heartbeattime<$2 AND status=$3`
	if _, execErr := db.c.Exec(stmt, mig.AgtStatusIdle, pointInTime, mig.AgtStatusOnline); execErr != nil {
		return fmt.Errorf("Failed to mark agents as idle in database: '%v'", execErr)
	}
	return nil
}

Просмотреть файл

@ -46,7 +46,11 @@ func spoolInspection(ctx Context) (err error) {
if err != nil {
panic(err)
}
err = timeoutAgents(ctx)
err = markOfflineAgents(ctx)
if err != nil {
panic(err)
}
err = markIdleAgents(ctx)
if err != nil {
panic(err)
}
@ -215,13 +219,13 @@ func cleanDir(ctx Context, targetDir string) (err error) {
return
}
// timeoutAgents updates the status of agents that are no longer heartbeating to "offline"
func timeoutAgents(ctx Context) (err error) {
// markOfflineAgents updates the status of idle agents that passed the agent timeout to "offline"
func markOfflineAgents(ctx Context) (err error) {
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("timeoutAgents() -> %v", e)
err = fmt.Errorf("markOfflineAgents() -> %v", e)
}
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving timeoutAgents()"}.Debug()
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving markOfflineAgents()"}.Debug()
}()
timeOutPeriod, err := time.ParseDuration(ctx.Agent.TimeOut)
if err != nil {
@ -234,3 +238,23 @@ func timeoutAgents(ctx Context) (err error) {
}
return
}
// markIdleAgents asks the database to flag as idle the agents that have
// stopped sending heartbeats.
//
// The cutoff is "now minus five heartbeat periods", where the heartbeat period
// is parsed from ctx.Agent.HeartbeatFreq. Failures are raised as panics and
// converted by the deferred handler into a returned error prefixed with the
// function name. A debug log entry is always emitted on exit.
func markIdleAgents(ctx Context) (err error) {
	defer func() {
		// Turn any panic raised below (or by a callee) into the returned error.
		if r := recover(); r != nil {
			err = fmt.Errorf("markIdleAgents() -> %v", r)
		}
		ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving markIdleAgents()"}.Debug()
	}()

	heartbeat, parseErr := time.ParseDuration(ctx.Agent.HeartbeatFreq)
	if parseErr != nil {
		panic(parseErr)
	}

	// Agents whose last heartbeat is older than five heartbeat periods are candidates.
	cutoff := time.Now().Add(-5 * heartbeat)
	if dbErr := ctx.DB.MarkIdleAgents(cutoff); dbErr != nil {
		panic(dbErr)
	}
	return nil
}