зеркало из https://github.com/mozilla/mig.git
165 строки
4.9 KiB
Go
165 строки
4.9 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
//
|
|
// Contributor: Julien Vehent jvehent@mozilla.com [:ulfr]
|
|
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"github.com/mozilla/mig"
|
|
"time"
|
|
)
|
|
|
|
// Given an agent queue location queueLoc, send kill actions for duplicate
|
|
// agents. Where multiple agents exist on the same host, we will attempt
|
|
// to kill the older agents.
|
|
func killDupAgents(queueLoc string, ctx Context) (err error) {
|
|
defer func() {
|
|
if e := recover(); e != nil {
|
|
err = fmt.Errorf("killDupAgents() -> %v", e)
|
|
}
|
|
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving killDupAgents()"}.Debug()
|
|
}()
|
|
hbfreq, err := time.ParseDuration(ctx.Agent.HeartbeatFreq)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
pointInTime := time.Now().Add(-hbfreq)
|
|
agents, err := ctx.DB.ActiveAgentsByQueue(queueLoc, pointInTime)
|
|
agentsCount := len(agents)
|
|
if agentsCount < 2 {
|
|
return
|
|
}
|
|
destroyedAgents := 0
|
|
leftAloneAgents := 0
|
|
for _, agent := range agents {
|
|
if agent.Status == "destroyed" {
|
|
// if the agent has already been marked as destroyed, check if
|
|
// that was done longer than 3 heartbeats ago. If it did, the
|
|
// destruction failed, and we need to reissue a destruction order
|
|
hbFreq, err := time.ParseDuration(ctx.Agent.HeartbeatFreq)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
pointInTime := time.Now().Add(-hbFreq * 3)
|
|
if agent.DestructionTime.Before(pointInTime) {
|
|
err = issueKillAction(agent, ctx)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
destroyedAgents++
|
|
desc := fmt.Sprintf("Re-issuing destruction action for "+
|
|
"agent '%s' with PID '%d'.", agent.Name, agent.PID)
|
|
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}.Debug()
|
|
} else {
|
|
leftAloneAgents++
|
|
}
|
|
}
|
|
}
|
|
|
|
remainingAgents := agentsCount - destroyedAgents - leftAloneAgents
|
|
if remainingAgents > 1 {
|
|
// there's still some agents left. if killdupagents is set, issue kill orders
|
|
if ctx.Agent.KillDupAgents {
|
|
oldest := agents[0]
|
|
for _, agent := range agents {
|
|
if agent.Status != "online" {
|
|
continue
|
|
}
|
|
if agent.StartTime.Before(oldest.StartTime) {
|
|
oldest = agent
|
|
}
|
|
}
|
|
desc := fmt.Sprintf("Issuing destruction action for agent '%s' "+
|
|
"with PID '%d'.", oldest.Name, oldest.PID)
|
|
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}
|
|
err = issueKillAction(oldest, ctx)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
// throttling to prevent issuing too many kill orders at the same time
|
|
time.Sleep(5 * time.Second)
|
|
} else {
|
|
// Build a list of relevant agent names to include in the manual inspection
|
|
// notification
|
|
var namelist string
|
|
for _, agent := range agents {
|
|
if namelist == "" {
|
|
namelist = agent.Name
|
|
} else {
|
|
namelist += ", " + agent.Name
|
|
}
|
|
}
|
|
desc := fmt.Sprintf("found %v agents running on %v. Require "+
|
|
"manual inspection (%v).", remainingAgents, queueLoc, namelist)
|
|
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: desc}.Warning()
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// issueKillAction issues an `agentdestroy` action targeted to a specific agent
|
|
// and updates the status of the agent in the database
|
|
func issueKillAction(agent mig.Agent, ctx Context) (err error) {
|
|
defer func() {
|
|
if e := recover(); e != nil {
|
|
err = fmt.Errorf("issueKillAction() -> %v", e)
|
|
}
|
|
ctx.Channels.Log <- mig.Log{OpID: ctx.OpID, Desc: "leaving issueKillAction()"}.Debug()
|
|
}()
|
|
// generate an `agentdestroy` action for this agent
|
|
killAction := mig.Action{
|
|
ID: mig.GenID(),
|
|
Name: fmt.Sprintf("Kill agent %s", agent.Name),
|
|
Target: fmt.Sprintf("queueloc='%s'", agent.QueueLoc),
|
|
ValidFrom: time.Now().Add(-60 * time.Second).UTC(),
|
|
ExpireAfter: time.Now().Add(30 * time.Minute).UTC(),
|
|
SyntaxVersion: 2,
|
|
}
|
|
var opparams struct {
|
|
PID int `json:"pid"`
|
|
Version string `json:"version"`
|
|
}
|
|
opparams.PID = agent.PID
|
|
opparams.Version = agent.Version
|
|
killOperation := mig.Operation{
|
|
Module: "agentdestroy",
|
|
Parameters: opparams,
|
|
}
|
|
killAction.Operations = append(killAction.Operations, killOperation)
|
|
|
|
// sign the action with the scheduler PGP key
|
|
secring, err := getSecring(ctx)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
pgpsig, err := killAction.Sign(ctx.PGP.PrivKeyID, secring)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
killAction.PGPSignatures = append(killAction.PGPSignatures, pgpsig)
|
|
var jsonAction []byte
|
|
jsonAction, err = json.Marshal(killAction)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
// write the action to the spool for scheduling
|
|
dest := fmt.Sprintf("%s/%.0f.json", ctx.Directories.Action.New, killAction.ID)
|
|
err = safeWrite(ctx, dest, jsonAction)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
// mark the agent as `destroyed` in the database
|
|
err = ctx.DB.MarkAgentDestroyed(agent)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
ctx.Channels.Log <- mig.Log{Desc: fmt.Sprintf("issued kill action for agent '%s' "+
|
|
"with PID '%d'", agent.Name, agent.PID)}.Warning()
|
|
return
|
|
}
|