cmd/coordinator, all: fix more things related to multi-zone buildlets

This fixes several issues left over from CL 210498 and CL 210237.

I renamed the Zone field to ControlZone, both to make its purpose
clearer and to force compilation errors wherever Zone was previously
used, which revealed several spots that had been missed.
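
To illustrate the effect of the rename (hypothetical snippet, not code
from this CL):

	type Environment struct {
		ControlZone string // formerly Zone
	}

	var env Environment
	env.Zone = "us-central1-f" // no longer compiles: env.Zone undefined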

Updates golang/go#35987

Change-Id: I2f890727ece86d093a90a3b47701caa58de6ccbc
Reviewed-on: https://go-review.googlesource.com/c/build/+/210541
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Alexander Rakoczy <alex@golang.org>
Author: Brad Fitzpatrick <bradfitz@golang.org>
Date: 2019-12-09 18:51:29 +00:00
Parent: c1b987df2a
Commit: 326548a346
9 changed files with 50 additions and 54 deletions

@@ -65,10 +65,10 @@ type Environment struct {
 	// disabled and the coordinator serves on 8119.
 	IsProd bool
 
-	// Zone is the GCE zone that the coordinator instance and Kubernetes cluster
+	// ControlZone is the GCE zone that the coordinator instance and Kubernetes cluster
 	// will run in. This field may be overridden as necessary without impacting
 	// other fields.
-	Zone string
+	ControlZone string
 
 	// VMZones are the GCE zones that the VMs will be deployed to. These
 	// GCE zones will be periodically cleaned by deleting old VMs. The zones
@@ -137,14 +137,14 @@ func (e Environment) ComputePrefix() string {
 // The Zone value will be returned if VMZones is not set.
 func (e Environment) RandomVMZone() string {
 	if len(e.VMZones) == 0 {
-		return e.Zone
+		return e.ControlZone
 	}
 	return e.VMZones[rand.Intn(len(e.VMZones))]
 }
 
 // Region returns the GCE region, derived from its zone.
 func (e Environment) Region() string {
-	return e.Zone[:strings.LastIndex(e.Zone, "-")]
+	return e.ControlZone[:strings.LastIndex(e.ControlZone, "-")]
 }
 
 // SnapshotURL returns the absolute URL of the .tar.gz containing a
@@ -227,7 +227,7 @@ var Staging = &Environment{
 	ProjectName:   "go-dashboard-dev",
 	ProjectNumber: 302018677728,
 	IsProd:        true,
-	Zone:          "us-central1-f",
+	ControlZone:   "us-central1-f",
 	VMZones:       []string{"us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"},
 	StaticIP:      "104.154.113.235",
 	MachineType:   "n1-standard-1",
@@ -258,7 +258,7 @@ var Production = &Environment{
 	ProjectName:   "symbolic-datum-552",
 	ProjectNumber: 872405196845,
 	IsProd:        true,
-	Zone:          "us-central1-f",
+	ControlZone:   "us-central1-f",
 	VMZones:       []string{"us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"},
 	StaticIP:      "107.178.219.46",
 	MachineType:   "n1-standard-4",
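
A minimal sketch of how zone selection behaves after the rename, using the values above (illustrative only):

	env := &buildenv.Environment{ControlZone: "us-central1-f"}
	fmt.Println(env.RandomVMZone()) // VMZones is empty: falls back to ControlZone, "us-central1-f"

	env.VMZones = []string{"us-central1-a", "us-central1-b"}
	fmt.Println(env.RandomVMZone()) // picks one of the two VM zones at random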

@@ -10,45 +10,41 @@ import (
 func TestEnvironmentNextZone(t *testing.T) {
 	testCases := []struct {
-		name     string
-		env      Environment
-		wantZone []string // desired zone should appear in this slice
+		name      string
+		env       Environment
+		wantOneOf []string // desired zone should appear in this slice
 	}{
 		{
 			name: "zones-not-set",
 			env: Environment{
-				Zone:    "kentucky",
-				VMZones: []string{},
+				ControlZone: "kentucky",
+				VMZones:     []string{},
 			},
-			wantZone: []string{"kentucky"},
+			wantOneOf: []string{"kentucky"},
 		},
 		{
 			name: "zone-and-zones-set",
 			env: Environment{
-				Zone:    "kentucky",
-				VMZones: []string{"texas", "california", "washington"},
+				ControlZone: "kentucky",
+				VMZones:     []string{"texas", "california", "washington"},
 			},
-			wantZone: []string{"texas", "california", "washington"},
+			wantOneOf: []string{"texas", "california", "washington"},
 		},
 		{
 			name: "zones-only-contains-one-entry",
 			env: Environment{
-				Zone:    "kentucky",
-				VMZones: []string{"texas"},
+				ControlZone: "kentucky",
+				VMZones:     []string{"texas"},
 			},
-			wantZone: []string{"texas"},
+			wantOneOf: []string{"texas"},
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			e := Environment{
-				Zone:    tc.env.Zone,
-				VMZones: tc.env.VMZones,
-			}
-			got := e.RandomVMZone()
-			if !containsString(got, tc.wantZone) {
-				t.Errorf("got=%q; want %v", got, tc.wantZone)
+			got := tc.env.RandomVMZone()
+			if !containsString(got, tc.wantOneOf) {
+				t.Errorf("got=%q; want %v", got, tc.wantOneOf)
 			}
 		})
 	}
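
The test relies on a containsString helper that this hunk doesn't show; a plausible shape for it (an assumption, not necessarily the repository's exact code):

	func containsString(s string, list []string) bool {
		for _, v := range list {
			if s == v {
				return true
			}
		}
		return false
	}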

@@ -71,7 +71,7 @@ type VMOpts struct {
 	// OnInstanceCreated optionally specifies a hook to run synchronously
 	// after the computeService.Instances.Get call.
-	OnGotInstanceInfo func()
+	OnGotInstanceInfo func(*compute.Instance)
 
 	// OnBeginBuildletProbe optionally specifies a hook to run synchronously
 	// before StartNewVM tries to hit buildletURL to see if it's up yet.
@@ -98,6 +98,7 @@ func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instN
 	if opts.Zone == "" {
 		opts.Zone = buildEnv.RandomVMZone()
 	}
+	zone := opts.Zone
 	if opts.DeleteIn == 0 {
 		opts.DeleteIn = 30 * time.Minute
 	}
@@ -110,12 +111,6 @@ func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instN
 		return nil, fmt.Errorf("host %q is type %q; want either a VM or container type", hostType, hconf.PoolName())
 	}
 
-	zone := opts.Zone
-	if zone == "" {
-		// TODO: automatic? maybe that's not useful.
-		// For now just return an error.
-		return nil, errors.New("buildlet: missing required Zone option")
-	}
 	projectID := opts.ProjectID
 	if projectID == "" {
 		return nil, errors.New("buildlet: missing required ProjectID option")
@@ -328,7 +323,9 @@ OpLoop:
 		buildletURL = "http://" + intIP
 		ipPort = intIP + ":80"
 	}
-	condRun(opts.OnGotInstanceInfo)
+	if opts.OnGotInstanceInfo != nil {
+		opts.OnGotInstanceInfo(inst)
+	}
 
 	const timeout = 5 * time.Minute
 	var alive bool
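
With the widened hook signature, callers can inspect the created instance, most usefully its zone. A minimal sketch of a consumer, mirroring the debugnewvm change later in this commit (hypothetical code):

	opts := buildlet.VMOpts{
		OnGotInstanceInfo: func(inst *compute.Instance) {
			// inst.Zone is a full resource URL ending in the zone name.
			log.Printf("instance %s is in %s", inst.Name, path.Base(inst.Zone))
		},
	}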

@@ -116,7 +116,7 @@ func initGCE() error {
 	// Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a".
 	projectZone = path.Base(projectZone)
-	buildEnv.Zone = projectZone
+	buildEnv.ControlZone = projectZone
 
 	if buildEnv.StaticIP == "" {
 		buildEnv.StaticIP, err = metadata.ExternalIP()
@@ -336,7 +336,7 @@ func (p *gceBuildletPool) GetBuildlet(ctx context.Context, hostType string, lg l
 			waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName)
 			curSpan = waitBuildlet
 		},
-		OnGotInstanceInfo: func() {
+		OnGotInstanceInfo: func(*compute.Instance) {
 			lg.LogEventTime("got_instance_info", "waiting_for_buildlet...")
 		},
 		Zone: zone,
@@ -354,12 +354,12 @@
 	waitBuildlet.Done(nil)
 	bc.SetDescription("GCE VM: " + instName)
 	bc.SetOnHeartbeatFailure(func() {
-		p.putBuildlet(bc, hostType, instName)
+		p.putBuildlet(bc, hostType, zone, instName)
 	})
 	return bc, nil
 }
 
-func (p *gceBuildletPool) putBuildlet(bc *buildlet.Client, hostType, instName string) error {
+func (p *gceBuildletPool) putBuildlet(bc *buildlet.Client, hostType, zone, instName string) error {
 	// TODO(bradfitz): add the buildlet to a freelist (of max N
 	// items) for up to 10 minutes since when it got started if
 	// it's never seen a command execution failure, and we can
@@ -369,7 +369,7 @@ func (p *gceBuildletPool) putBuildlet(bc *buildlet.Client, hostType, instName st
 	// buildlet client library between Close, Destroy/Halt, and
 	// tracking execution errors. That was all half-baked before
 	// and thus removed. Now Close always destroys everything.
-	deleteVM(buildEnv.Zone, instName)
+	deleteVM(zone, instName)
 	p.setInstanceUsed(instName, false)
 
 	hconf, ok := dashboard.Hosts[hostType]
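
The point of threading zone through putBuildlet: the zone is now chosen per VM (RandomVMZone may pick any of several), so deletion must target the zone the VM was actually created in, not the coordinator's own buildEnv.ControlZone. Roughly:

	zone := buildEnv.RandomVMZone() // decided per VM, before creation
	bc.SetOnHeartbeatFailure(func() {
		p.putBuildlet(bc, hostType, zone, instName) // zone captured at creation time
	})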

@@ -59,7 +59,7 @@ func initKube() error {
 	var err error
 	buildletsKubeClient, err = gke.NewClient(ctx,
 		buildEnv.KubeBuild.Name,
-		gke.OptZone(buildEnv.Zone),
+		gke.OptZone(buildEnv.ControlZone),
 		gke.OptProject(buildEnv.ProjectName),
 		gke.OptTokenSource(gcpCreds.TokenSource))
 	if err != nil {
@@ -68,7 +68,7 @@
 	goKubeClient, err = gke.NewClient(ctx,
 		buildEnv.KubeTools.Name,
-		gke.OptZone(buildEnv.Zone),
+		gke.OptZone(buildEnv.ControlZone),
 		gke.OptProject(buildEnv.ProjectName),
 		gke.OptTokenSource(gcpCreds.TokenSource))
 	if err != nil {
@@ -437,7 +437,7 @@ func (p *kubeBuildletPool) cleanUpOldPods(ctx context.Context) {
 		}
 		if err == nil && time.Now().Unix() > unixDeadline {
 			stats.DeletedOld++
-			log.Printf("cleanUpOldPods: Deleting expired pod %q in zone %q ...", pod.Name, buildEnv.Zone)
+			log.Printf("cleanUpOldPods: Deleting expired pod %q in zone %q ...", pod.Name, buildEnv.ControlZone)
 			err = buildletsKubeClient.DeletePod(ctx, pod.Name)
 			if err != nil {
 				log.Printf("cleanUpOldPods: problem deleting old pod %q: %v", pod.Name, err)

@@ -90,9 +90,6 @@ func main() {
 	}
 	env = buildenv.FromFlags()
-	if *zone != "" {
-		env.Zone = *zone
-	}
 
 	ctx := context.Background()
@@ -106,15 +103,20 @@
 	name := fmt.Sprintf("debug-temp-%d", time.Now().Unix())
 	log.Printf("Creating %s (with VM image %s)", name, vmImageSummary)
+	var zoneSelected string
 	bc, err := buildlet.StartNewVM(creds, env, name, *hostType, buildlet.VMOpts{
+		Zone:                *zone,
 		OnInstanceRequested: func() { log.Printf("instance requested") },
 		OnInstanceCreated: func() {
 			log.Printf("instance created")
 			if *serial {
-				go watchSerial(name)
+				go watchSerial(zoneSelected, name)
 			}
 		},
-		OnGotInstanceInfo: func() { log.Printf("got instance info") },
+		OnGotInstanceInfo: func(inst *compute.Instance) {
+			zoneSelected = inst.Zone
+			log.Printf("got instance info; running in %v", zoneSelected)
+		},
 		OnBeginBuildletProbe: func(buildletURL string) {
 			log.Printf("About to hit %s to see if buildlet is up yet...", buildletURL)
 		},
@@ -213,11 +215,11 @@
 // gcloud compute connect-to-serial-port --zone=xxx $NAME
 // but in Go and works. For some reason, gcloud doesn't work as a
 // child process and has weird errors.
-func watchSerial(name string) {
+func watchSerial(zone, name string) {
 	start := int64(0)
 	indent := strings.Repeat(" ", len("2017/07/25 06:37:14 SERIAL: "))
 	for {
-		sout, err := computeSvc.Instances.GetSerialPortOutput(env.ProjectName, env.Zone, name).Start(start).Do()
+		sout, err := computeSvc.Instances.GetSerialPortOutput(env.ProjectName, zone, name).Start(start).Do()
 		if err != nil {
 			log.Printf("serial output error: %v", err)
 			return

@@ -37,7 +37,7 @@ resources:
 - name: "{{ .Kube.Name }}"
   type: container.v1.cluster
   properties:
-    zone: "{{ .Env.Zone }}"
+    zone: "{{ .Env.ControlZone }}"
     cluster:
       initial_node_count: {{ .Kube.MinNodes }}
       network: "default"

@@ -56,12 +56,12 @@ func main() {
 	case "kubectl":
 		env := getEnv()
 		curCtx := kubeCurrentContext()
-		wantCtx := fmt.Sprintf("gke_%s_%s_go", env.ProjectName, env.Zone)
+		wantCtx := fmt.Sprintf("gke_%s_%s_go", env.ProjectName, env.ControlZone)
 		if curCtx != wantCtx {
 			log.SetFlags(0)
 			log.Fatalf("Wrong kubectl context; currently using %q; want %q\nRun:\n  gcloud container clusters get-credentials --project=%s --zone=%s go",
 				curCtx, wantCtx,
-				env.ProjectName, env.Zone,
+				env.ProjectName, env.ControlZone,
 			)
 		}
 		// gcloud container clusters get-credentials --zone=us-central1-f go
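
Plugging in the production values shown earlier in this commit, the expected context works out to:

	wantCtx := fmt.Sprintf("gke_%s_%s_go", "symbolic-datum-552", "us-central1-f")
	// "gke_symbolic-datum-552_us-central1-f_go"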

@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"log"
+	"path"
 	"sort"
 	"strings"
 	"time"
@@ -88,7 +89,7 @@ func (c *Client) MakeBasepinDisks(ctx context.Context) error {
 			return err
 		}
 		if err := c.AwaitOp(ctx, op); err != nil {
-			log.Fatalf("basepin: failed to create: %v", err)
+			return fmt.Errorf("basepin: failed to create: %v", err)
 		}
 	}
 	log.Printf("basepin: created %d images in %v", len(needed), zone)
@@ -103,10 +104,10 @@ func (c *Client) AwaitOp(ctx context.Context, op *compute.Operation) error {
 	svc := c.Compute()
 	opName := op.Name
 	// TODO: move logging to Client c.logger. and add Client.WithLogger shallow copier.
-	log.Printf("Waiting on operation %v", opName)
+	log.Printf("Waiting on operation %v (in %q)", opName, op.Zone)
 	for {
 		time.Sleep(2 * time.Second)
-		op, err := svc.ZoneOperations.Get(c.Env.ProjectName, c.Env.Zone, opName).Do()
+		op, err := svc.ZoneOperations.Get(c.Env.ProjectName, path.Base(op.Zone), opName).Do()
 		if err != nil {
 			return fmt.Errorf("Failed to get op %s: %v", opName, err)
 		}
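
A compute.Operation's Zone field is a full resource URL, which is why path.Base is needed before calling ZoneOperations.Get (illustrative URL):

	zoneURL := "https://www.googleapis.com/compute/v1/projects/symbolic-datum-552/zones/us-central1-f"
	fmt.Println(path.Base(zoneURL)) // "us-central1-f"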