mig/mig-scheduler/agents_management.go

165 строки
4.9 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// Contributor: Julien Vehent jvehent@mozilla.com [:ulfr]
package main
import (
"encoding/json"
"fmt"
"github.com/mozilla/mig"
"time"
)
// killDupAgents sends kill actions for duplicate agents found on the agent
// queue location queueLoc. Where multiple agents exist on the same host, we
// attempt to kill the older agents.
//
// The function works in two passes over the agents returned by
// ctx.DB.ActiveAgentsByQueue (called with a cutoff of one heartbeat period
// before now):
//
//  1. Agents whose status is "destroyed" get their destruction order re-issued
//     if it was sent more than three heartbeat periods ago; otherwise they are
//     left alone.
//  2. If more than one agent remains once the "destroyed" ones are
//     subtracted, then either the oldest "online" agent is sent a kill order
//     (when ctx.Agent.KillDupAgents is set), or a warning requesting manual
//     inspection is logged.
//
// Failures while issuing kill actions are raised as panics and converted by
// the deferred handler into an error prefixed with "killDupAgents() -> ".
// Errors from parsing the heartbeat frequency or from the database lookup are
// returned as-is.
func killDupAgents(queueLoc string, ctx Context) (err error) {
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("killDupAgents() -> %v", e)
		}
		ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving killDupAgents()"}.Debug()
	}()
	hbfreq, err := time.ParseDuration(ctx.Agent.HeartbeatFreq)
	if err != nil {
		return err
	}
	pointInTime := time.Now().Add(-hbfreq)
	agents, err := ctx.DB.ActiveAgentsByQueue(queueLoc, pointInTime)
	if err != nil {
		// do not act on a result set that came back alongside an error
		return err
	}
	agentsCount := len(agents)
	if agentsCount < 2 {
		// zero or one agent on this queue: nothing can be a duplicate
		return nil
	}

	// A destruction order older than 3 heartbeats is considered failed. The
	// heartbeat frequency does not change while we iterate, so the deadline
	// is computed once rather than per agent.
	retryDeadline := time.Now().Add(-3 * hbfreq)
	destroyedAgents := 0
	leftAloneAgents := 0
	for _, agent := range agents {
		if agent.Status != "destroyed" {
			continue
		}
		if !agent.DestructionTime.Before(retryDeadline) {
			// destruction was requested recently, give it time to complete
			leftAloneAgents++
			continue
		}
		// the previous destruction failed, reissue a destruction order
		if err = issueKillAction(agent, ctx); err != nil {
			panic(err)
		}
		destroyedAgents++
		desc := fmt.Sprintf("Re-issuing destruction action for "+
			"agent '%s' with PID '%d'.", agent.Name, agent.PID)
		ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}.Debug()
	}

	remainingAgents := agentsCount - destroyedAgents - leftAloneAgents
	if remainingAgents <= 1 {
		return nil
	}

	// there's still some agents left. if killdupagents is not set, only
	// request a manual inspection.
	if !ctx.Agent.KillDupAgents {
		// Build a list of relevant agent names to include in the manual
		// inspection notification
		var namelist string
		for _, agent := range agents {
			if namelist == "" {
				namelist = agent.Name
			} else {
				namelist += ", " + agent.Name
			}
		}
		desc := fmt.Sprintf("found %v agents running on %v. Require "+
			"manual inspection (%v).", remainingAgents, queueLoc, namelist)
		ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}.Warning()
		return nil
	}

	// Select the online agent with the earliest start time. The candidate is
	// only ever taken from online agents, so an agent that is already marked
	// as destroyed (and was deliberately left alone above) is never sent a
	// second kill order from here.
	oldestIdx := -1
	for i := range agents {
		if agents[i].Status != "online" {
			continue
		}
		if oldestIdx < 0 || agents[i].StartTime.Before(agents[oldestIdx].StartTime) {
			oldestIdx = i
		}
	}
	if oldestIdx < 0 {
		// no online agent to act upon
		return nil
	}
	oldest := agents[oldestIdx]
	desc := fmt.Sprintf("Issuing destruction action for agent '%s' "+
		"with PID '%d'.", oldest.Name, oldest.PID)
	// NOTE(review): unlike the other log entries in this function, no
	// severity method is applied to this one; left unchanged.
	ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}
	if err = issueKillAction(oldest, ctx); err != nil {
		panic(err)
	}
	// throttling to prevent issuing too many kill orders at the same time
	time.Sleep(5 * time.Second)
	return nil
}
// issueKillAction issues an `agentdestroy` action targeted to a specific agent
// and updates the status of the agent in the database.
//
// The action is built with a single `agentdestroy` operation carrying the PID
// and version of the agent, signed with the scheduler PGP key, serialized to
// JSON and written into the new-actions spool directory. Once the file is
// written, the agent is marked as destroyed through ctx.DB.MarkAgentDestroyed.
//
// Any failure along the way is raised as a panic and converted by the deferred
// handler into an error prefixed with "issueKillAction() -> ".
func issueKillAction(agent mig.Agent, ctx Context) (err error) {
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("issueKillAction() -> %v", e)
		}
		ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving issueKillAction()"}.Debug()
	}()
	// generate an `agentdestroy` action for this agent. Both ends of the
	// validity window are derived from the same instant: it opens 60 seconds
	// in the past and closes 30 minutes in the future.
	now := time.Now()
	killAction := mig.Action{
		ID:   mig.GenID(),
		Name: fmt.Sprintf("Kill agent %s", agent.Name),
		// the target selects on the queue location; the PID and version
		// passed as parameters below presumably let the module identify the
		// exact agent process to terminate — TODO confirm against the module
		Target:        fmt.Sprintf("queueloc='%s'", agent.QueueLoc),
		ValidFrom:     now.Add(-60 * time.Second).UTC(),
		ExpireAfter:   now.Add(30 * time.Minute).UTC(),
		SyntaxVersion: 2,
	}
	var opparams struct {
		PID     int    `json:"pid"`
		Version string `json:"version"`
	}
	opparams.PID = agent.PID
	opparams.Version = agent.Version
	killOperation := mig.Operation{
		Module:     "agentdestroy",
		Parameters: opparams,
	}
	killAction.Operations = append(killAction.Operations, killOperation)
	// sign the action with the scheduler PGP key
	secring, err := getSecring(ctx)
	if err != nil {
		panic(err)
	}
	pgpsig, err := killAction.Sign(ctx.PGP.PrivKeyID, secring)
	if err != nil {
		panic(err)
	}
	killAction.PGPSignatures = append(killAction.PGPSignatures, pgpsig)
	var jsonAction []byte
	jsonAction, err = json.Marshal(killAction)
	if err != nil {
		panic(err)
	}
	// write the action to the spool for scheduling; the file is named after
	// the action ID formatted without decimals
	dest := fmt.Sprintf("%s/%.0f.json", ctx.Directories.Action.New, killAction.ID)
	err = safeWrite(ctx, dest, jsonAction)
	if err != nil {
		panic(err)
	}
	// mark the agent as `destroyed` in the database
	err = ctx.DB.MarkAgentDestroyed(agent)
	if err != nil {
		panic(err)
	}
	// carry the operation ID like every other log entry in this file, so the
	// message can be correlated with the operation that triggered it
	ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: fmt.Sprintf("issued kill action for agent '%s' "+
		"with PID '%d'", agent.Name, agent.PID)}.Warning()
	return nil
}