// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package buildlet

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	"golang.org/x/build/buildenv"
	"golang.org/x/build/dashboard"
	"golang.org/x/oauth2"
	"golang.org/x/oauth2/google"
	"google.golang.org/api/compute/v1"
)

// GCEGate optionally specifies a function to run before any GCE API call.
// It's intended to be used to bound QPS rate to GCE.
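//
// For example, a caller that wants to cap this package's GCE API usage at
// roughly 10 QPS could install a ticker-based gate like the sketch below
// (the interval is an illustrative choice, not something this package
// requires):
//
//	gate := time.NewTicker(100 * time.Millisecond)
//	buildlet.GCEGate = func() { <-gate.C }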
var GCEGate func()

func apiGate() {
	if GCEGate != nil {
		GCEGate()
	}
}

// ErrQuotaExceeded matches errors.Is when VM creation fails with a
// quota error. Currently, it only supports GCE quota errors.
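//
// A typical caller-side check looks like the sketch below, where the
// arguments are placeholders for values the caller already has:
//
//	_, err := buildlet.StartNewVM(creds, env, instName, hostType, opts)
//	if errors.Is(err, buildlet.ErrQuotaExceeded) {
//		// Back off, or retry in another zone.
//	}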
var ErrQuotaExceeded = errors.New("quota exceeded")

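// GCEError is returned by StartNewVM when a GCE instance-creation operation
// completes with errors; it wraps the individual operation errors.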
type GCEError struct {
	OpErrors []*compute.OperationErrorErrors
}

func (q *GCEError) Error() string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%d GCE operation errors: ", len(q.OpErrors))
	for i, e := range q.OpErrors {
		if i != 0 {
			buf.WriteString("; ")
		}
		b, err := json.Marshal(e)
		if err != nil {
			fmt.Fprintf(&buf, "json.Marshal(OpErrors[%d]): %v", i, err)
			continue
		}
		buf.Write(b)
	}
	return buf.String()
}

func (q *GCEError) Is(target error) bool {
	for _, err := range q.OpErrors {
		if target == ErrQuotaExceeded && err.Code == "QUOTA_EXCEEDED" {
			return true
		}
	}
	return false
}

// StartNewVM boots a new VM on GCE and returns a buildlet client
// configured to speak to it.
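//
// A minimal call looks roughly like the sketch below; the instance name and
// host type are placeholders, and the zero VMOpts leans on the defaults
// filled in below (project and zone from buildEnv, a 30-minute delete-at
// deadline):
//
//	client, err := buildlet.StartNewVM(creds, env, "buildlet-linux-foo", "host-linux-amd64", buildlet.VMOpts{})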
func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instName, hostType string, opts VMOpts) (Client, error) {
	ctx := context.TODO()
	computeService, _ := compute.New(oauth2.NewClient(ctx, creds.TokenSource))

	if opts.Description == "" {
		opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
	}
	if opts.ProjectID == "" {
		opts.ProjectID = buildEnv.ProjectName
	}
	if opts.Zone == "" {
		opts.Zone = buildEnv.RandomVMZone()
	}
	zone := opts.Zone
	if opts.DeleteIn == 0 {
		opts.DeleteIn = 30 * time.Minute
	}

	hconf, ok := dashboard.Hosts[hostType]
	if !ok {
		return nil, fmt.Errorf("invalid host type %q", hostType)
	}
	if !hconf.IsVM() && !hconf.IsContainer() {
		return nil, fmt.Errorf("host %q is type %q; want either a VM or container type", hostType, hconf.PoolName())
	}

	projectID := opts.ProjectID
	if projectID == "" {
		return nil, errors.New("buildlet: missing required ProjectID option")
	}

	prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID
	machType := prefix + "/zones/" + zone + "/machineTypes/" + hconf.MachineType()
	diskType := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd"
	if hconf.RegularDisk {
		diskType = "" // a spinning disk
	}

	srcImage := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + hconf.VMImage
	minCPU := hconf.MinCPUPlatform
	if hconf.IsContainer() {
		if hconf.NestedVirt {
			minCPU = "Intel Cascade Lake" // n2 vms (which support NestedVirtualization) are either Ice Lake or Cascade Lake.
		}
		if vm := hconf.ContainerVMImage(); vm != "" {
			srcImage = "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + vm
		} else {
			var err error
			srcImage, err = cosImage(ctx, computeService, hconf.CosArchitecture())
			if err != nil {
				return nil, fmt.Errorf("error finding Container-Optimized OS image: %v", err)
			}
		}
	}

	instance := &compute.Instance{
		Name:           instName,
		Description:    opts.Description,
		MachineType:    machType,
		MinCpuPlatform: minCPU,
		Disks: []*compute.AttachedDisk{
			{
				AutoDelete: true,
				Boot:       true,
				Type:       "PERSISTENT",
				InitializeParams: &compute.AttachedDiskInitializeParams{
					DiskName:    instName,
					SourceImage: srcImage,
					DiskType:    diskType,
					DiskSizeGb:  opts.DiskSizeGB,
				},
			},
		},
		Tags: &compute.Tags{
			// Warning: do NOT list "http-server" or "allow-ssh" (our
			// project's custom tag to allow ssh access) here; the
			// buildlet provides full remote code execution.
			// The https-server is authenticated, though.
			Items: []string{"https-server"},
		},
		Metadata: &compute.Metadata{},
		NetworkInterfaces: []*compute.NetworkInterface{{
			Network: prefix + "/global/networks/default-vpc",
		}},

		// Prior to git rev 1b1e086fd, we used preemptible
		// instances, as we were helping test the feature. It was
		// removed after git rev a23395d because we hadn't been
		// using it for some time. Our VMs are so short-lived that
		// the feature doesn't really help anyway. But if we ever
		// find we want it again, this comment is here to point to
		// code that might be useful to partially resurrect.
		Scheduling: &compute.Scheduling{Preemptible: false},
	}

	// Container builders use the COS image, which defaults to logging to Cloud Logging.
	// Permission is granted to this service account.
	if hconf.IsContainer() && buildEnv.COSServiceAccount != "" {
		instance.ServiceAccounts = []*compute.ServiceAccount{
			{
				Email:  buildEnv.COSServiceAccount,
				Scopes: []string{compute.CloudPlatformScope},
			},
		}
	}

	addMeta := func(key, value string) {
		instance.Metadata.Items = append(instance.Metadata.Items, &compute.MetadataItems{
			Key:   key,
			Value: &value,
		})
	}
	// The buildlet-binary-url is the URL of the buildlet binary
	// which the VMs are configured to download at boot and run.
	// This lets us update the buildlet more easily than
	// rebuilding the whole VM image.
	addMeta("buildlet-binary-url", hconf.BuildletBinaryURL(buildenv.ByProjectID(opts.ProjectID)))
	addMeta("buildlet-host-type", hostType)
	if !opts.TLS.IsZero() {
		addMeta("tls-cert", opts.TLS.CertPEM)
		addMeta("tls-key", opts.TLS.KeyPEM)
		addMeta("password", opts.TLS.Password())
	}
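	// On the amd64 Container-Optimized OS image, the konlet agent reads the
	// "gce-container-declaration" metadata key to decide which container to
	// run; the spec below runs the buildlet image privileged, with a tmpfs
	// mounted at /workdir.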
	if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchAMD64 {
		addMeta("gce-container-declaration", fmt.Sprintf(`spec:
  containers:
    - name: buildlet
      image: 'gcr.io/%s/%s'
      volumeMounts:
        - name: tmpfs-0
          mountPath: /workdir
      securityContext:
        privileged: true
      stdin: false
      tty: false
  restartPolicy: Always
  volumes:
    - name: tmpfs-0
      emptyDir:
        medium: Memory
`, opts.ProjectID, hconf.ContainerImage))
		addMeta("user-data", `#cloud-config

runcmd:
- sysctl -w kernel.core_pattern=core
`)
	} else if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchARM64 {
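		// Unlike the amd64 path above, this branch starts the buildlet
		// container through a systemd unit written by cloud-init, which runs
		// the image with docker on the ARM64 Container-Optimized OS image.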
		addMeta("user-data", fmt.Sprintf(`#cloud-config

write_files:
- path: /etc/systemd/system/buildlet.service
  permissions: 0644
  owner: root:root
  content: |
    [Unit]
    Description=Start buildlet container
    Wants=gcr-online.target
    After=gcr-online.target

    [Service]
    Environment="HOME=/home/buildlet"
    ExecStart=/usr/bin/docker run --rm --name=buildlet --privileged -p 80:80 gcr.io/%s/%s
    ExecStop=/usr/bin/docker stop buildlet
    ExecStopPost=/usr/bin/docker rm buildlet
    RemainAfterExit=true
    Type=oneshot

runcmd:
- systemctl daemon-reload
- systemctl start buildlet.service
- sysctl -w kernel.core_pattern=core
`, opts.ProjectID, hconf.ContainerImage))
	}

	if opts.DeleteIn > 0 {
		// In case the VM gets away from us (generally: if the
		// coordinator dies while a build is running), then we
		// set this attribute of when it should be killed so
		// we can kill it later when the coordinator is
		// restarted. The cleanUpOldVMs goroutine loop handles
		// that killing.
		addMeta("delete-at", fmt.Sprint(time.Now().Add(opts.DeleteIn).Unix()))
	}

	for k, v := range opts.Meta {
		addMeta(k, v)
	}

	apiGate()
	op, err := computeService.Instances.Insert(projectID, zone, instance).Do()
	if err != nil {
		return nil, fmt.Errorf("Failed to create instance: %v", err)
	}
	condRun(opts.OnInstanceRequested)
	createOp := op.Name

	// Wait for instance create operation to succeed.
OpLoop:
	for {
		time.Sleep(2 * time.Second)
		apiGate()
		op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do()
		if err != nil {
			return nil, fmt.Errorf("failed to get op %s: %v", createOp, err)
		}
		switch op.Status {
		case "PENDING", "RUNNING":
			continue
		case "DONE":
			if op.Error != nil {
				err := &GCEError{OpErrors: make([]*compute.OperationErrorErrors, len(op.Error.Errors))}
				copy(err.OpErrors, op.Error.Errors)
				return nil, err
			}
			break OpLoop
		default:
			return nil, fmt.Errorf("unknown create status %q: %+v", op.Status, op)
		}
	}
	condRun(opts.OnInstanceCreated)

	apiGate()
	inst, err := computeService.Instances.Get(projectID, zone, instName).Do()
	if err != nil {
		return nil, fmt.Errorf("Error getting instance %s details after creation: %v", instName, err)
	}

	// Finds its internal and/or external IP addresses.
	intIP, extIP := instanceIPs(inst)

	// Wait for it to boot and its buildlet to come up.
	var buildletURL string
	var ipPort string
	if !opts.TLS.IsZero() {
		if extIP == "" {
			return nil, errors.New("didn't find its external IP address")
		}
		buildletURL = "https://" + extIP
		ipPort = extIP + ":443"
	} else {
		if intIP == "" {
			return nil, errors.New("didn't find its internal IP address")
		}
		buildletURL = "http://" + intIP
		ipPort = intIP + ":80"
	}
	if opts.OnGotInstanceInfo != nil {
		opts.OnGotInstanceInfo(inst)
	}
	var closeFunc func()
	if opts.UseIAPTunnel {
		var localPort string
		var err error
		localPort, closeFunc, err = createIAPTunnel(ctx, inst)
		if err != nil {
			return nil, fmt.Errorf("creating IAP tunnel: %v", err)
		}
		buildletURL = "http://localhost:" + localPort
		ipPort = "127.0.0.1:" + localPort
	}
	client, err := buildletClient(ctx, buildletURL, ipPort, &opts)
	if err != nil {
		return nil, err
	}
	if closeFunc != nil {
		return &extraCloseClient{client, closeFunc}, nil
	}
	return client, nil
}

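// extraCloseClient wraps a Client and runs an extra cleanup function (such as
// shutting down an IAP tunnel) when the client is closed.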
type extraCloseClient struct {
	Client
	close func()
}

func (e *extraCloseClient) Close() error {
	defer e.close()
	return e.Client.Close()
}

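// createIAPTunnel starts an IAP tunnel to port 80 of inst using the gcloud
// CLI. It returns the local port the tunnel listens on and a function that
// shuts the tunnel down.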
func createIAPTunnel(ctx context.Context, inst *compute.Instance) (string, func(), error) {
	// Allocate a local listening port.
	ln, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		return "", nil, err
	}
	localAddr := ln.Addr().(*net.TCPAddr)
	ln.Close()
	// Start the gcloud command. For some reason, when gcloud is run with a
	// pipe for stdout, it doesn't log the success message, so we can only
	// check for success empirically.
	m := regexp.MustCompile(`/projects/([^/]+)/zones/([^/]+)`).FindStringSubmatch(inst.Zone)
	if m == nil {
		return "", nil, fmt.Errorf("unexpected inst.Zone: %q", inst.Zone)
	}
	project, zone := m[1], m[2]
	tunnelCmd := exec.CommandContext(ctx,
		"gcloud", "compute", "start-iap-tunnel", "--iap-tunnel-disable-connection-check",
		"--project", project, "--zone", zone, inst.Name, "80", "--local-host-port", localAddr.String())

	// hideWriter hides the underlying io.Writer from os/exec, bypassing the
	// special case where os/exec will let a subprocess share the fd to an
	// *os.File. Using hideWriter will result in goroutines that copy from a
	// fresh pipe and write to the writer in the parent Go program. That
	// guarantees that if the subprocess leaves background processes lying
	// around, they will not keep lingering references to the parent Go
	// program's stdout and stderr.
	//
	// Prior to this, it was common for ./debugnewvm | cat to never finish,
	// because debugnewvm left some gcloud helper processes behind, and cat
	// (or any other program) would never observe EOF on its input pipe.
	// We now try to shut gcloud down more carefully with os.Interrupt below,
	// but hideWriter guarantees that lingering processes won't hang
	// pipelines.
	type hideWriter struct{ io.Writer }
	tunnelCmd.Stderr = hideWriter{os.Stderr}
	tunnelCmd.Stdout = hideWriter{os.Stdout}

	if err := tunnelCmd.Start(); err != nil {
		return "", nil, err
	}
	// The tunnel process is now running. Either it's going to fail after a
	// bit, or it'll start listening on its port. Because we told it not to
	// check the connection above, the connections won't be functional, but
	// we can dial.
	errc := make(chan error, 1)
	go func() { errc <- tunnelCmd.Wait() }()
	for start := time.Now(); time.Since(start) < 60*time.Second; time.Sleep(5 * time.Second) {
		// Check if the server crashed.
		select {
		case err := <-errc:
			return "", nil, err
		default:
		}
		// Check if it's healthy.
		conn, err := net.DialTCP("tcp", nil, localAddr)
		if err == nil {
			conn.Close()
			kill := func() {
				// gcloud compute start-iap-tunnel is a group of Python processes,
				// so send an interrupt to try for an orderly shutdown of the process tree
				// before killing the process outright.
				tunnelCmd.Process.Signal(os.Interrupt)
				time.Sleep(2 * time.Second)
				tunnelCmd.Process.Kill()
			}
			return fmt.Sprint(localAddr.Port), kill, nil
		}
	}
	return "", nil, fmt.Errorf("iap tunnel startup timed out")
}

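// VM describes a GCE VM instance created to run a buildlet, along with the
// details needed to connect to it.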
type VM struct {
	// Name is the name of the GCE VM instance.
	// For example, it's of the form "mote-bradfitz-plan9-386-foo",
	// and not "plan9-386-foo".
	Name   string
	IPPort string
	TLS    KeyPair
	Type   string // buildlet type
}

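// instanceIPs returns inst's internal (10.x.x.x) and external (NAT) IP
// addresses, if any.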
func instanceIPs(inst *compute.Instance) (intIP, extIP string) {
	for _, iface := range inst.NetworkInterfaces {
		if strings.HasPrefix(iface.NetworkIP, "10.") {
			intIP = iface.NetworkIP
		}
		for _, accessConfig := range iface.AccessConfigs {
			if accessConfig.Type == "ONE_TO_ONE_NAT" {
				extIP = accessConfig.NatIP
			}
		}
	}
	return
}

var (
	cosListMu     sync.Mutex
	cosCachedTime time.Time
	cosCache      = map[dashboard.CosArch]*cosCacheEntry{}
)

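// cosCacheEntry records the most recently fetched Container-Optimized OS
// image for one architecture and when it was fetched.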
type cosCacheEntry struct {
	cachedTime  time.Time
	cachedImage string
}

// cosImage returns the GCP VM image name of the latest stable
// Container-Optimized OS image. It caches results for 15 minutes.
func cosImage(ctx context.Context, svc *compute.Service, arch dashboard.CosArch) (string, error) {
	const cacheDuration = 15 * time.Minute
	cosListMu.Lock()
	defer cosListMu.Unlock()

	cosQuery := func(a dashboard.CosArch) (string, error) {
		imList, err := svc.Images.List("cos-cloud").Filter(fmt.Sprintf("(family eq %q)", string(a))).Context(ctx).Do()
		if err != nil {
			return "", err
		}
		if imList.NextPageToken != "" {
			return "", fmt.Errorf("too many images; pagination not supported")
		}
		ims := imList.Items
		if len(ims) == 0 {
			return "", errors.New("no image found")
		}
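		// Prefer non-deprecated images, newest first.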
		sort.Slice(ims, func(i, j int) bool {
			if ims[i].Deprecated == nil && ims[j].Deprecated != nil {
				return true
			}
			return ims[i].CreationTimestamp > ims[j].CreationTimestamp
		})
		return ims[0].SelfLink, nil
	}
	c, ok := cosCache[arch]
	if !ok {
		image, err := cosQuery(arch)
		if err != nil {
			return "", err
		}
		cosCache[arch] = &cosCacheEntry{
			cachedTime:  time.Now(),
			cachedImage: image,
		}
		return image, nil
	}
	if c.cachedImage != "" && c.cachedTime.After(time.Now().Add(-cacheDuration)) {
		return c.cachedImage, nil
	}
	image, err := cosQuery(arch)
	if err != nil {
		return "", err
	}
	c.cachedImage = image
	c.cachedTime = time.Now()
	return image, nil
}