buildlet: fix Exec to return ErrTimeout on timeout

The coordinator relies on Exec reporting that the given timeout was exceeded in order to mark a build as failed instead of retrying it. A refactor resulted in Exec no longer doing that, despite what its documentation promises, so fix that. Also add a test since evidence shows that catching a regression can be helpful. For golang/go#42699. Updates golang/go#35707. Change-Id: Iacef90b83e7b81fad88a33baa6489d5157e3528f Reviewed-on: https://go-review.googlesource.com/c/build/+/407555 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org>
2022-05-20 13:43:07 -04:00 · 2022-05-20 13:43:07 -04:00 · 33d38b8f07
--- a/buildlet/buildletclient.go
+++ b/buildlet/buildletclient.go
@ -509,6 +509,8 @@ type ExecOpts struct {
 	OnStartExec func()
 }

+// ErrTimeout is the sentinel error that Exec returns as its execErr when
+// waiting for a command to complete has exceeded the context's deadline.
 var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")

 // Exec runs cmd on the buildlet.
@ -519,8 +521,8 @@ var ErrTimeout = errors.New("buildlet: timeout waiting for command to complete")
 // seen to completion. If execErr is non-nil, the remoteErr is
 // meaningless.
 //
-// If the context's deadline is exceeded, the returned execErr is
-// ErrTimeout.
+// If the context's deadline is exceeded while waiting for the command
+// to complete, the returned execErr is ErrTimeout.
 func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr, execErr error) {
 	var mode string
 	if opts.SystemLevel {
@ -553,10 +555,11 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 	// (Atlanta, Paris, Sydney, etc.) the reverse buildlet is:
 	res, err := c.doHeaderTimeout(req, 20*time.Second)
 	if err == errHeaderTimeout {
+		// If we don't see headers after all that time,
+		// consider the buildlet to be unhealthy.
 		c.MarkBroken()
 		return nil, errors.New("buildlet: timeout waiting for exec header response")
-	}
-	if err != nil {
+	} else if err != nil {
 		return nil, err
 	}
 	defer res.Body.Close()
@ -577,7 +580,7 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 			out = ioutil.Discard
 		}
 		if _, err := io.Copy(out, res.Body); err != nil {
-			resc <- errs{execErr: fmt.Errorf("error copying response: %v", err)}
+			resc <- errs{execErr: fmt.Errorf("error copying response: %w", err)}
 			return
 		}

@ -600,10 +603,15 @@ func (c *client) Exec(ctx context.Context, cmd string, opts ExecOpts) (remoteErr
 	select {
 	case res := <-resc:
 		if res.execErr != nil {
+			// Note: We've historically marked the buildlet as unhealthy after
+			// reaching any kind of execution error, even when it's a remote command
+			// execution timeout (see use of ErrTimeout below).
+			// This is certainly on the safer side of avoiding false positive signal,
+			// but maybe someday we'll want to start to rely on the buildlet to report
+			// such a condition and not mark it as unhealthy.
+
 			c.MarkBroken()
-			if res.execErr == context.DeadlineExceeded {
-				// Historical pre-context value.
-				// TODO: update docs & callers to just use the context value.
+			if errors.Is(res.execErr, context.DeadlineExceeded) {
 				res.execErr = ErrTimeout
 			}
 		}
--- a/buildlet/buildletclient_test.go
+++ b/buildlet/buildletclient_test.go
@ -7,10 +7,12 @@ package buildlet
 import (
 	"context"
 	"crypto/tls"
+	"encoding/json"
 	"errors"
 	"net"
 	"net/http"
 	"net/http/httptest"
+	"net/url"
 	"strings"
 	"testing"
 )
@ -171,3 +173,55 @@ func createKeyPair(t *testing.T) KeyPair {
 	}
 	return kp
 }
+
+// TestExecTimeoutError checks that Exec returns ErrTimeout when the context
+// deadline is exceeded during command execution, as its documentation promises.
+func TestExecTimeoutError(t *testing.T) {
+	mux := http.NewServeMux()
+	mux.HandleFunc("/status", func(w http.ResponseWriter, req *http.Request) {
+		json.NewEncoder(w).Encode(Status{}) // Empty status; NOTE(review): presumably for client health checks — confirm.
+	})
+	mux.HandleFunc("/exec", func(w http.ResponseWriter, req *http.Request) {
+		w.Write([]byte("."))
+		w.(http.Flusher).Flush() // Flush headers right away; Exec treats a missing header response as a timeout.
+		<-req.Context().Done()   // Block until the request is canceled, simulating a command that hangs with no more output.
+	})
+	ts := httptest.NewServer(mux)
+	defer ts.Close()
+	u, err := url.Parse(ts.URL)
+	if err != nil {
+		t.Fatalf("unable to parse http server url %s", err)
+	}
+	cl := NewClient(u.Host, NoKeyPair)
+	defer cl.Close()
+
+	// Use a custom context that starts reporting context.DeadlineExceeded
+	// only after Exec has started command execution. (context.WithTimeout
+	// would force us to pick an arbitrary duration, which might be too
+	// short to reach execution or long enough to slow the test down.)
+	ctx := deadlineOnDemandContext{
+		Context: context.Background(),
+		done:    make(chan struct{}),
+	}
+	_, execErr := cl.Exec(ctx, "./bin/test", ExecOpts{
+		OnStartExec: func() { close(ctx.done) }, // Expire ctx once execution has started.
+	})
+	if execErr != ErrTimeout {
+		t.Errorf("cl.Exec error = %v; want %v", execErr, ErrTimeout)
+	}
+}
+
+type deadlineOnDemandContext struct { // Test-only Context that expires when done is closed rather than on a timer.
+	context.Context
+	done chan struct{} // Closed to make Done fire and Err report context.DeadlineExceeded.
+}
+
+func (c deadlineOnDemandContext) Done() <-chan struct{} { return c.done } // Overrides the embedded Context's Done.
+func (c deadlineOnDemandContext) Err() error { // Overrides the embedded Context's Err.
+	select {
+	default:
+		return nil // done is still open, so the context has not expired.
+	case <-c.done:
+		return context.DeadlineExceeded // done was closed: report an exceeded deadline.
+	}
+}