internal/relui/sign: improve communication timing visibility

I expect successful communication with a signing client to be on the
order of a few seconds, maybe 10 or 20. Some signing status requests
are stalling for 10 minutes.

In such cases, it's better to give up earlier and try again, so cap
individual status requests inside AwaitCondition to a minute at most,
and prefer to try again if that doesn't work out. This should prevent
the watchdog timer from reaching 10 minutes and stopping the task.

Also improve logging to print more useful timing information, and
fix a case where UpdateSigningStatus was always returning nil error.

For golang/go#53632.

Change-Id: Idb340fb6396b31570acd980adf33a52877d60012
Reviewed-on: https://go-review.googlesource.com/c/build/+/444495
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Dmitri Shuralyov 2022-10-20 13:12:06 -04:00 коммит произвёл Gopher Robot
Родитель e987ab40b4
Коммит 14a42abee7
2 изменённых файлов: 26 добавлений и 9 удалений

Просмотреть файл

@ -10,6 +10,7 @@ import (
"io"
"log"
"sync"
"time"
"github.com/google/uuid"
"golang.org/x/build/internal/access"
@ -49,10 +50,14 @@ func NewServer() *SigningServer {
// and receive status updates on signing requests. There is no specific order which the requests or responses
// need to occur in. The connection returns an error once the context is canceled or an error is encountered.
func (rs *SigningServer) UpdateSigningStatus(stream protos.ReleaseService_UpdateSigningStatusServer) error {
_, err := access.IAPFromContext(stream.Context())
iap, err := access.IAPFromContext(stream.Context())
if err != nil {
return status.Errorf(codes.Unauthenticated, "request does not contain the required authentication")
}
log.Printf("SigningServer: a client connected (iap = %+v)\n", iap)
defer log.Println("SigningServer: a client disconnected")
g, ctx := errgroup.WithContext(stream.Context())
g.Go(func() error {
for {
@ -79,7 +84,7 @@ func (rs *SigningServer) UpdateSigningStatus(stream protos.ReleaseService_Update
rs.callAndDeregister(resp.GetMessageId(), &signResponse{status: resp})
}
})
if err := g.Wait(); err == nil {
if err := g.Wait(); err != nil {
log.Printf("SigningServer: UpdateSigningStatus=%s", err)
return err
}
@ -88,7 +93,16 @@ func (rs *SigningServer) UpdateSigningStatus(stream protos.ReleaseService_Update
// do sends a signing request and returns the corresponding signing response.
// It blocks until a response is received or the context times out or is canceled.
func (rs *SigningServer) do(ctx context.Context, req *protos.SigningRequest) (*protos.SigningStatus, error) {
func (rs *SigningServer) do(ctx context.Context, req *protos.SigningRequest) (resp *protos.SigningStatus, err error) {
t := time.Now()
defer func() {
if err == nil {
log.Printf("SigningServer: successfully round-tripped message=%q in %v:\n req = %v\n resp = %v\n", req.GetMessageId(), time.Since(t), req, resp)
} else {
log.Printf("SigningServer: communication error %v for message=%q after %v:\n req = %v\n", err, req.GetMessageId(), time.Since(t), req)
}
}()
// Register where to send the response for this message ID.
respCh := make(chan *signResponse, 1) // Room for one response.
rs.register(req.GetMessageId(), func(r *signResponse) { respCh <- r })
@ -96,7 +110,6 @@ func (rs *SigningServer) do(ctx context.Context, req *protos.SigningRequest) (*p
// Send the request.
select {
case rs.requests <- req:
log.Printf("SigningServer: sent signing request with message=%q", req.GetMessageId())
case <-ctx.Done():
rs.deregister(req.GetMessageId())
return nil, ctx.Err()

Просмотреть файл

@ -814,10 +814,13 @@ func (b *BuildReleaseTasks) signArtifacts(ctx *wf.TaskContext, bt sign.BuildType
if err != nil {
return nil, err
}
outURLs, jobError := task.AwaitCondition(ctx, 30*time.Second, func() (out []string, done bool, _ error) {
status, desc, out, err := b.SignService.ArtifactSigningStatus(ctx, jobID)
outURLs, jobError := task.AwaitCondition(ctx, time.Minute, func() (out []string, done bool, _ error) {
statusContext, cancel := context.WithTimeout(ctx, time.Minute)
defer cancel()
t := time.Now()
status, desc, out, err := b.SignService.ArtifactSigningStatus(statusContext, jobID)
if err != nil {
ctx.Printf("ArtifactSigningStatus ran into a retryable communication error: %v\n", err)
ctx.Printf("ArtifactSigningStatus ran into a retryable communication error after %v: %v\n", time.Since(t), err)
return nil, false, nil
}
switch status {
@ -838,11 +841,12 @@ func (b *BuildReleaseTasks) signArtifacts(ctx *wf.TaskContext, bt sign.BuildType
if jobError != nil {
// If ctx is canceled, also cancel the signing request.
if ctx.Err() != nil {
cancelContext, cancel := context.WithTimeout(context.Background(), 30*time.Second)
cancelContext, cancel := context.WithTimeout(context.Background(), time.Minute)
defer cancel()
t := time.Now()
err := b.SignService.CancelSigning(cancelContext, jobID)
if err != nil {
ctx.Printf("CancelSigning error: %v", err)
ctx.Printf("CancelSigning error after %v: %v\n", time.Since(t), err)
}
}