buildlet: fix Exec to return ErrTimeout on timeout

The coordinator relies on Exec reporting that the given timeout was exceeded in order to mark a build as failed instead of retrying it. A refactor resulted in Exec no longer doing that, despite what its documentation promises, so fix that. Also add a test since evidence shows that catching a regression can be helpful. For golang/go#42699. Updates golang/go#35707. Change-Id: Iacef90b83e7b81fad88a33baa6489d5157e3528f Reviewed-on: https://go-review.googlesource.com/c/build/+/407555 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org>
2022-05-20 13:43:07 -04:00 · 2022-05-20 13:43:07 -04:00 · 33d38b8f07
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@ -509,6 +509,8 @@ type ExecOpts struct {
 	OnStartExec func()
 }
 // ErrTimeout is a sentinel error that represents that waiting
 // for a command to complete has exceeded the given timeout.
 var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
 // Exec runs cmd on the buildlet.
@ -519,8 +521,8 @@ var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
 // seen to completition. If execErr is non-nil, the remoteErr is
 // meaningless.
 //
-// If the context's deadline is exceeded, the returned execErr is
+// If the context's deadline is exceeded while waiting for the command
-// ErrTimeout.
+// to complete, the returned execErr is ErrTimeout.
 func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr, execErr error) {
 	var mode string
 	if opts.SystemLevel {
@ -553,10 +555,11 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 	// (Atlanta, Paris, Sydney, etc.) the reverse buildlet is:
 	res, err := c.doHeaderTimeout(req, 20*time.Second)
 	if err == errHeaderTimeout {
 		// If we don't see headers after all that time,
 		// consider the buildlet to be unhealthy.
 		c.MarkBroken()
 		return nil, errors.New("buildlet: timeout waiting for exec header response")
-	}
+	} else if err != nil {
 	if err != nil {
 		return nil, err
 	}
 	defer res.Body.Close()
@ -577,7 +580,7 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 			out = ioutil.Discard
 		}
 		if _, err := io.Copy(out, res.Body); err != nil {
-			resc <- errs{execErr: fmt.Errorf("error copying response: %v", err)}
+			resc <- errs{execErr: fmt.Errorf("error copying response: %w", err)}
 			return
 		}
@ -600,10 +603,15 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 	select {
 	case res := <-resc:
 		if res.execErr != nil {
 			// Note: We've historically marked the buildlet as unhealthy after
 			// reaching any kind of execution error, even when it's a remote command
 			// execution timeout (see use of ErrTimeout below).
 			// This is certainly on the safer side of avoiding false positive signal,
 			// but maybe someday we'll want to start to rely on the buildlet to report
 			// such a condition and not mark it as unhealthy.
 			c.MarkBroken()
-			if res.execErr == context.DeadlineExceeded {
+			if errors.Is(res.execErr, context.DeadlineExceeded) {
 				// Historical pre-context value.
 				// TODO: update docs & callers to just use the context value.
 				res.execErr = ErrTimeout
 			}
 		}
--- a/buildlet/buildletclient_test.go
+++ b/buildlet/buildletclient_test.go
@ -7,10 +7,12 @@ package buildlet
 import (
 	"context"
 	"crypto/tls"
 	"encoding/json"
 	"errors"
 	"net"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
 	"strings"
 	"testing"
 )
@ -171,3 +173,55 @@ func createKeyPair(t *testing.T) KeyPair {
 	}
 	return kp
 }
 // Test that Exec returns ErrTimeout upon reaching the context timeout
 // during command execution, as its documentation promises.
 func TestExecTimeoutError(t *testing.T) {
 	mux := http.NewServeMux()
 	mux.HandleFunc("/status", func(w http.ResponseWriter, req *http.Request) {
 		json.NewEncoder(w).Encode(Status{})
 	})
 	mux.HandleFunc("/exec", func(w http.ResponseWriter, req *http.Request) {
 		w.Write([]byte("."))
 		w.(http.Flusher).Flush() // /exec needs to flush headers right away.
 		<-req.Context().Done()   // Simulate that execution hangs, so no more output.
 	})
 	ts := httptest.NewServer(mux)
 	defer ts.Close()
 	u, err := url.Parse(ts.URL)
 	if err != nil {
 		t.Fatalf("unable to parse http server url %s", err)
 	}
 	cl := NewClient(u.Host, NoKeyPair)
 	defer cl.Close()
 	// Use a custom context that reports context.DeadlineExceeded
 	// after Exec starts command execution. (context.WithTimeout
 	// requires us to select an arbitrary duration, which might
 	// not be long enough or will make the test take too long.)
 	ctx := deadlineOnDemandContext{
 		Context: context.Background(),
 		done:    make(chan struct{}),
 	}
 	_, execErr := cl.Exec(ctx, "./bin/test", ExecOpts{
 		OnStartExec: func() { close(ctx.done) },
 	})
 	if execErr != ErrTimeout {
 		t.Errorf("cl.Exec error = %v; want %v", execErr, ErrTimeout)
 	}
 }
 type deadlineOnDemandContext struct {
 	context.Context
 	done chan struct{}
 }
 func (c deadlineOnDemandContext) Done() <-chan struct{} { return c.done }
 func (c deadlineOnDemandContext) Err() error {
 	select {
 	default:
 		return nil
 	case <-c.done:
 		return context.DeadlineExceeded
 	}
 }