// Mirror of https://github.com/golang/build.git (405 lines, 11 KiB, Go).
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The retrybuilds command clears build failures from the build.golang.org dashboard
// to force them to be rebuilt.
//
// Valid usage modes:
//
//	retrybuilds -loghash=f45f0eb8
//	retrybuilds -builder=openbsd-amd64
//	retrybuilds -builder=openbsd-amd64 -hash=6fecb7
//	retrybuilds -redo-flaky
//	retrybuilds -redo-flaky -builder=linux-amd64-clang
//	retrybuilds -substr="failed to find foo"
//	retrybuilds -substr="failed to find foo" -builder=linux-amd64-stretch
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"crypto/hmac"
|
|
"crypto/md5"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/build/buildenv"
|
|
"golang.org/x/build/cmd/coordinator/protos"
|
|
"golang.org/x/build/internal/iapclient"
|
|
"golang.org/x/build/internal/secret"
|
|
"google.golang.org/grpc/codes"
|
|
"google.golang.org/grpc/metadata"
|
|
"google.golang.org/grpc/status"
|
|
)
|
|
|
|
// Command-line flags, grouped by what they control. main decides which
// of the selection flags take effect for a given invocation.
var (
	// Which failures to wipe.
	builder   = flag.String("builder", "", "builder to wipe a result for. Empty means all.")
	hash      = flag.String("hash", "", "Hash to wipe. If empty, all will be wiped.")
	logHash   = flag.String("loghash", "", "If non-empty, clear the build that failed with this loghash prefix")
	substr    = flag.String("substr", "", "if non-empty, redoes all build failures whose failure logs contain this substring")
	redoFlaky = flag.Bool("redo-flaky", false, "Reset all flaky builds. If builder is empty, the master key is required.")
	branch    = flag.String("branch", "master", "branch to find flakes from (for use with -redo-flaky)")

	// How to authenticate.
	masterKeyFile = flag.String("masterkey", filepath.Join(os.Getenv("HOME"), "keys", "gobuilder-master.key"), "path to Go builder master key. If present, the key argument is not necessary")
	keyFile       = flag.String("key", "", "path to key file")
	sendMasterKey = flag.Bool("sendmaster", false, "send the master key in request instead of a builder-specific key; allows overriding actions of revoked keys")

	// Where to send requests, and whether to send them at all.
	builderPrefix = flag.String("builder-prefix", "https://build.golang.org", "builder URL prefix")
	grpcHost      = flag.String("grpc-host", "build.golang.org:443", "use gRPC for communicating with the Coordinator API")
	dryRun        = flag.Bool("dry-run", false, "just report what would've been done, without changing anything")
)
|
|
|
|
// Failure identifies one failed build, as parsed by failures from a
// three-column line of the dashboard's failures listing.
type Failure struct {
	Builder string // builder name (second column of the listing)
	Hash    string // hash of the failing build (first column of the listing)
	LogURL  string // URL of the failure log (third column of the listing)
}
|
|
|
|
func main() {
|
|
log.SetFlags(0)
|
|
buildenv.RegisterStagingFlag()
|
|
flag.Parse()
|
|
|
|
*builderPrefix = strings.TrimSuffix(*builderPrefix, "/")
|
|
ctx := context.Background()
|
|
cc, err := iapclient.GRPCClient(ctx, *grpcHost)
|
|
if err != nil {
|
|
log.Fatalf("grpc.DialContext(_, %q, _) = %v, wanted no error", *grpcHost, err)
|
|
}
|
|
cl := client{
|
|
coordinator: protos.NewCoordinatorClient(cc),
|
|
}
|
|
|
|
if *logHash != "" {
|
|
substr := "/log/" + *logHash
|
|
for _, f := range failures() {
|
|
if strings.Contains(f.LogURL, substr) {
|
|
log.Printf("Restarting %+v", f)
|
|
cl.wipe(f.Builder, f.Hash)
|
|
}
|
|
}
|
|
log.Printf("wiped %d matching failures\n", cl.wiped)
|
|
return
|
|
}
|
|
if *substr != "" {
|
|
foreachFailure(func(f Failure, failLog string) {
|
|
if strings.Contains(failLog, *substr) {
|
|
log.Printf("Restarting %+v", f)
|
|
cl.wipe(f.Builder, f.Hash)
|
|
}
|
|
})
|
|
log.Printf("wiped %d matching failures\n", cl.wiped)
|
|
return
|
|
}
|
|
if *redoFlaky {
|
|
foreachFailure(func(f Failure, failLog string) {
|
|
if isFlaky(failLog) {
|
|
log.Printf("Restarting flaky %+v", f)
|
|
cl.wipe(f.Builder, f.Hash)
|
|
}
|
|
})
|
|
log.Printf("wiped %d matching failures\n", cl.wiped)
|
|
return
|
|
}
|
|
if *builder == "" {
|
|
log.Fatalf("Missing -builder, -redo-flaky, -substr, or -loghash flag.")
|
|
}
|
|
if *hash == "" {
|
|
for _, f := range failures() {
|
|
if f.Builder != *builder {
|
|
continue
|
|
}
|
|
log.Printf("Restarting %+v", f)
|
|
cl.wipe(f.Builder, f.Hash)
|
|
}
|
|
log.Printf("wiped %d matching failures\n", cl.wiped)
|
|
return
|
|
}
|
|
fullHash := fullHash(*hash)
|
|
log.Printf("Restarting %q", fullHash)
|
|
cl.wipe(*builder, fullHash)
|
|
log.Printf("wiped %d matching failures\n", cl.wiped)
|
|
}
|
|
|
|
func foreachFailure(fn func(f Failure, failLog string)) {
|
|
gate := make(chan bool, 50)
|
|
var wg sync.WaitGroup
|
|
for _, f := range failures() {
|
|
f := f
|
|
if *builder != "" && f.Builder != *builder {
|
|
continue
|
|
}
|
|
gate <- true
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
defer func() { <-gate }()
|
|
res, err := http.Get(f.LogURL)
|
|
if err != nil {
|
|
log.Fatalf("Error fetching %s: %v", f.LogURL, err)
|
|
}
|
|
failLog, err := io.ReadAll(res.Body)
|
|
res.Body.Close()
|
|
if err != nil {
|
|
log.Fatalf("Error reading %s: %v", f.LogURL, err)
|
|
}
|
|
fn(f, string(failLog))
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
// flakePhrases lists substrings of a failure log that isFlaky accepts as
// evidence of a flaky failure. A single occurrence of any one of them
// anywhere in the log is enough.
var flakePhrases = []string{
	// Disk and output-stream trouble.
	"No space left on device",
	"no space left on device", // solaris case apparently
	"fatal error: error in backend: IO failure on output stream",
	"Boffset: unknown state 0",
	"Bseek: unknown state 0",

	// Repository fetching and test scheduling.
	"error exporting repository: exit status",
	"remote error: User Is Over Quota",
	"fatal: remote did not send all necessary objects",
	"Failed to schedule \"", // e.g. Failed to schedule "go_test:archive/tar" test after 3 tries.

	// Name lookups.
	"lookup _xmpp-server._tcp.google.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout",
	"lookup _xmpp-server._tcp.google.com on",
	"lookup gmail.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout",
	"lookup gmail.com on 8.8.8.8:53",
	"lookup www.mit.edu on ",

	// Builder environment.
	"undefined: runtime.SetMutexProfileFraction", // ppc64 builders had not-quite-go1.8 bootstrap
	"make.bat: The parameter is incorrect",

	// Killed processes and memory.
	"killed",
	"memory",
	"allocate",
	"Killed",

	// Tool invocations exiting with status 1.
	"Error running API checker: exit status 1",
	"/compile: exit status 1",
	"cmd/link: exit status 1",
}
|
|
|
|
func isFlaky(failLog string) bool {
|
|
if strings.Count(strings.TrimSpace(failLog), "\n") < 2 {
|
|
return true
|
|
}
|
|
if strings.HasPrefix(failLog, "exit status ") {
|
|
return true
|
|
}
|
|
if strings.HasPrefix(failLog, "timed out after ") {
|
|
return true
|
|
}
|
|
if strings.HasPrefix(failLog, "Failed to schedule ") {
|
|
return true
|
|
}
|
|
for _, phrase := range flakePhrases {
|
|
if strings.Contains(failLog, phrase) {
|
|
return true
|
|
}
|
|
}
|
|
numLines := strings.Count(failLog, "\n")
|
|
if numLines < 20 && strings.Contains(failLog, "error: exit status") {
|
|
return true
|
|
}
|
|
// e.g. fatal: destination path 'go.tools.TMP' already exists and is not an empty directory.
|
|
// To be fixed in golang.org/issue/9407
|
|
if strings.Contains(failLog, "fatal: destination path '") &&
|
|
strings.Contains(failLog, "' already exists and is not an empty directory.") {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func fullHash(h string) string {
|
|
if len(h) == 40 {
|
|
return h
|
|
}
|
|
if h != "" {
|
|
for _, f := range failures() {
|
|
if strings.HasPrefix(f.Hash, h) {
|
|
return f.Hash
|
|
}
|
|
}
|
|
}
|
|
log.Fatalf("invalid hash %q; failed to finds its full hash. Not a recent failure?", h)
|
|
panic("unreachable")
|
|
}
|
|
|
|
type client struct {
|
|
coordinator protos.CoordinatorClient
|
|
wiped int // wiped is how many build results have been wiped.
|
|
}
|
|
|
|
// grpcWipe wipes a git hash failure for the provided builder and hash.
|
|
// Only the main Go repo is currently supported.
|
|
// TODO(golang.org/issue/34744) - replace HTTP wipe with this after gRPC API for ClearResults is deployed
|
|
func (c *client) grpcWipe(builder, hash string) {
|
|
md := metadata.New(map[string]string{"coordinator-authorization": "builder " + builderKey(builder)})
|
|
for i := 0; i < 10; i++ {
|
|
ctx, cancel := context.WithTimeout(metadata.NewOutgoingContext(context.Background(), md), time.Minute)
|
|
resp, err := c.coordinator.ClearResults(ctx, &protos.ClearResultsRequest{
|
|
Builder: builder,
|
|
Hash: hash,
|
|
})
|
|
cancel()
|
|
|
|
if err != nil {
|
|
s, _ := status.FromError(err)
|
|
switch s.Code() {
|
|
case codes.Aborted:
|
|
log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash)
|
|
time.Sleep(time.Second)
|
|
case codes.DeadlineExceeded:
|
|
log.Printf("Timeout wiping %v %v: retrying", builder, hash)
|
|
default:
|
|
log.Fatalln(err)
|
|
}
|
|
continue
|
|
}
|
|
log.Printf("cl.ClearResults(%q, %q) = %v: resp: %v", builder, hash, status.Code(err), resp)
|
|
c.wiped++
|
|
return
|
|
}
|
|
}
|
|
|
|
// wipe wipes the git hash failure for the provided failure.
|
|
// Only the main go repo is currently supported.
|
|
func (c *client) wipe(builder, hash string) {
|
|
if *dryRun {
|
|
c.wiped++ // Pretend.
|
|
return
|
|
}
|
|
if *grpcHost != "" {
|
|
// TODO(golang.org/issue/34744) - Remove HTTP logic after gRPC API for ClearResults is deployed
|
|
// to the Coordinator.
|
|
c.grpcWipe(builder, hash)
|
|
return
|
|
}
|
|
vals := url.Values{
|
|
"builder": {builder},
|
|
"hash": {hash},
|
|
"key": {builderKey(builder)},
|
|
}
|
|
for i := 0; i < 10; i++ {
|
|
res, err := http.PostForm(*builderPrefix+"/clear-results?"+vals.Encode(), nil)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
body, err := io.ReadAll(res.Body)
|
|
res.Body.Close()
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
if res.StatusCode != 200 {
|
|
log.Fatalf("Error clearing %v hash %q: %v", builder, hash, res.Status)
|
|
}
|
|
var dashResponse struct {
|
|
Error string
|
|
}
|
|
if err := json.Unmarshal(body, &dashResponse); err != nil {
|
|
log.Fatalf("Bad dashboard response: %v\nBody: %s", err, body)
|
|
}
|
|
|
|
switch e := dashResponse.Error; e {
|
|
case "datastore: concurrent transaction":
|
|
log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash)
|
|
time.Sleep(time.Second)
|
|
continue
|
|
default:
|
|
log.Fatalf("Dashboard error: %v", e)
|
|
case "":
|
|
c.wiped++
|
|
return
|
|
}
|
|
}
|
|
log.Fatalf("Too many datastore transaction issues wiping %v %v", builder, hash)
|
|
}
|
|
|
|
func builderKey(builder string) string {
|
|
if v, ok := builderKeyFromMaster(builder); ok {
|
|
return v
|
|
}
|
|
if *keyFile == "" {
|
|
log.Fatalf("No --key specified for builder %s", builder)
|
|
}
|
|
slurp, err := os.ReadFile(*keyFile)
|
|
if err != nil {
|
|
log.Fatalf("Error reading builder key %s: %v", builder, err)
|
|
}
|
|
return strings.TrimSpace(string(slurp))
|
|
}
|
|
|
|
func builderKeyFromMaster(builder string) (key string, ok bool) {
|
|
masterKey, err := getMasterKeyFromSecretManager()
|
|
if err != nil {
|
|
slurp, err := os.ReadFile(*masterKeyFile)
|
|
if err != nil {
|
|
return "", false
|
|
}
|
|
masterKey = string(bytes.TrimSpace(slurp))
|
|
}
|
|
if *sendMasterKey {
|
|
return masterKey, true
|
|
}
|
|
h := hmac.New(md5.New, []byte(masterKey))
|
|
h.Write([]byte(builder))
|
|
return fmt.Sprintf("%x", h.Sum(nil)), true
|
|
}
|
|
|
|
// getMasterKeyFromSecretManager retrieves the master key
|
|
// from the secret manager service.
|
|
func getMasterKeyFromSecretManager() (string, error) {
|
|
sc, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer sc.Close()
|
|
return sc.Retrieve(context.Background(), secret.NameBuilderMasterKey)
|
|
}
|
|
|
|
var (
|
|
failMu sync.Mutex
|
|
failCache []Failure
|
|
)
|
|
|
|
func failures() (ret []Failure) {
|
|
failMu.Lock()
|
|
ret = failCache
|
|
failMu.Unlock()
|
|
if ret != nil {
|
|
return
|
|
}
|
|
ret = []Failure{} // non-nil
|
|
|
|
res, err := http.Get(*builderPrefix + "/?mode=failures&branch=" + url.QueryEscape(*branch))
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
slurp, err := io.ReadAll(res.Body)
|
|
res.Body.Close()
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
body := string(slurp)
|
|
for _, line := range strings.Split(body, "\n") {
|
|
f := strings.Fields(line)
|
|
if len(f) == 3 {
|
|
ret = append(ret, Failure{
|
|
Hash: f[0],
|
|
Builder: f[1],
|
|
LogURL: f[2],
|
|
})
|
|
}
|
|
}
|
|
|
|
failMu.Lock()
|
|
failCache = ret
|
|
failMu.Unlock()
|
|
return ret
|
|
}
|