vitess-gh/go/vt/vtgate/shard_conn_flaky_test.go

826 строки
31 KiB
Go

// Copyright 2012, Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package vtgate
import (
"fmt"
"reflect"
"strings"
"testing"
"time"
"github.com/youtube/vitess/go/stats"
"golang.org/x/net/context"
"github.com/youtube/vitess/go/vt/tabletserver/querytypes"
"github.com/youtube/vitess/go/vt/vterrors"
topodatapb "github.com/youtube/vitess/go/vt/proto/topodata"
vtrpcpb "github.com/youtube/vitess/go/vt/proto/vtrpc"
)
// This file uses the sandbox_test framework.
// Shared settings that the tests in this file pass to NewShardConn.
var (
// retryCount is the retry budget handed to NewShardConn;
// testShardConnGeneric expects retryCount+1 endpoint lookups before a
// topo failure is reported.
retryCount = 3
// retryDelay is handed to NewShardConn together with retryCount
// (presumably the pause between retries; TestShardConnReconnect checks
// elapsed time against its own local override).
retryDelay = 1 * time.Millisecond
// connTimeoutTotal is the overall connect timeout; TestShardConnTimeout
// checks that a timed-out Execute lasts at least this long.
connTimeoutTotal = 20 * time.Millisecond
// connTimeoutPerConn is the per-connection connect timeout;
// TestShardConnTimeout checks it is honored when two conns are mapped.
connTimeoutPerConn = 10 * time.Millisecond
// connLife is the connection lifetime handed to NewShardConn; the
// ShardConnLife tests pass 10ms instead of this value.
connLife = 24 * time.Hour
// connectTimings is the stats collector handed to every NewShardConn call,
// labelled by Keyspace, ShardName and DbType.
connectTimings = stats.NewMultiTimings("", []string{"Keyspace", "ShardName", "DbType"})
)
func TestShardConnExecute(t *testing.T) {
testShardConnGeneric(t, "TestShardConnExecute", false, func() error {
sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", "TestShardConnExecute", "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
_, err := sdc.Execute(context.Background(), "query", nil, 0)
return err
})
testShardConnTransact(t, "TestShardConnExecute", func() error {
sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", "TestShardConnExecute", "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
_, err := sdc.Execute(context.Background(), "query", nil, 1)
return err
})
}
// TestShardConnExecuteBatch runs the shared generic and transactional
// scenarios against ShardConn.ExecuteBatch.
func TestShardConnExecuteBatch(t *testing.T) {
	const name = "TestShardConnExecuteBatch"

	// The retry settings come from the package-level retryDelay/retryCount
	// (rather than repeating their literal values) because
	// testShardConnGeneric derives its expectations from retryCount.
	testShardConnGeneric(t, name, false, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		queries := []querytypes.BoundQuery{{"query", nil}}
		// Final argument 0: the batch runs outside of a transaction.
		_, err := sdc.ExecuteBatch(context.Background(), queries, false, 0)
		return err
	})

	testShardConnTransact(t, name, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		queries := []querytypes.BoundQuery{{"query", nil}}
		// Final argument 1: the batch runs as part of a transaction.
		_, err := sdc.ExecuteBatch(context.Background(), queries, false, 1)
		return err
	})
}
// TestShardConnExecuteStream runs the shared generic (streaming flavour) and
// transactional scenarios against ShardConn.StreamExecute.
func TestShardConnExecuteStream(t *testing.T) {
	const name = "TestShardConnExecuteStream"

	// The retry settings come from the package-level retryDelay/retryCount
	// (rather than repeating their literal values) because
	// testShardConnGeneric derives its expectations from retryCount.
	testShardConnGeneric(t, name, true, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		// The stream's error is only available through the returned func.
		_, errfunc := sdc.StreamExecute(context.Background(), "query", nil, 0)
		return errfunc()
	})

	testShardConnTransact(t, name, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		// Final argument 1: the stream is issued as part of a transaction.
		_, errfunc := sdc.StreamExecute(context.Background(), "query", nil, 1)
		return errfunc()
	})
}
// TestShardConnBegin runs the shared generic scenarios against
// ShardConn.Begin.
func TestShardConnBegin(t *testing.T) {
	const name = "TestShardConnBegin"

	// The retry settings come from the package-level retryDelay/retryCount
	// (rather than repeating their literal values) because
	// testShardConnGeneric derives its expectations from retryCount.
	testShardConnGeneric(t, name, false, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		_, err := sdc.Begin(context.Background())
		return err
	})
}
// TestShardConnCommit runs the shared transactional scenarios against
// ShardConn.Commit.
func TestShardConnCommit(t *testing.T) {
	const name = "TestShardConnCommit"

	// Uses the package-level retryDelay/retryCount, like
	// TestShardConnExecute, instead of repeating their literal values.
	testShardConnTransact(t, name, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		return sdc.Commit(context.Background(), 1)
	})
}
// TestShardConnRollback runs the shared transactional scenarios against
// ShardConn.Rollback.
func TestShardConnRollback(t *testing.T) {
	const name = "TestShardConnRollback"

	// Uses the package-level retryDelay/retryCount, like
	// TestShardConnExecute, instead of repeating their literal values.
	testShardConnTransact(t, name, func() error {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		return sdc.Rollback(context.Background(), 1)
	})
}
// verifyShardConnError reports a test failure for each of the following that
// does not hold: err is non-nil with message wantErr, err has the dynamic
// type *ShardConnError, and the error code recovered from err by
// vterrors.RecoverVtErrorCode equals wantCode. All three checks always run.
func verifyShardConnError(t *testing.T, err error, wantErr string, wantCode vtrpcpb.ErrorCode) {
	messageMatches := err != nil && err.Error() == wantErr
	if !messageMatches {
		t.Errorf("wanted error: %s, got error: %v", wantErr, err)
	}

	_, isShardConnErr := err.(*ShardConnError)
	if !isShardConnErr {
		t.Errorf("wanted error type *ShardConnError, got error type: %v", reflect.TypeOf(err))
	}

	if gotCode := vterrors.RecoverVtErrorCode(err); gotCode != wantCode {
		t.Errorf("wanted error code: %s, got: %v", wantCode, gotCode)
	}
}
func testShardConnGeneric(t *testing.T, name string, streaming bool, f func() error) {
// Topo failure
s := createSandbox(name)
s.EndPointMustFail = retryCount + 1
want := fmt.Sprintf("shard, host: %v.0.replica, <nil>, endpoints fetch error: topo error", name)
err := f()
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_INTERNAL_ERROR)
if epCount := s.EndPointCounter.Get(); epCount != int64(retryCount+1) {
t.Errorf("want %v, got %v", (retryCount + 1), epCount)
}
// Connect failure
s.Reset()
sbc := &sandboxConn{}
s.MapTestConn("0", sbc)
s.DialMustFail = 4
err = f()
want = fmt.Sprintf("shard, host: %v.0.replica, %+v, conn error %+v", name, nil, sbc.EndPoint())
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_UNKNOWN_ERROR)
// Ensure we dialed 4 times before failing.
if s.DialCounter != 4 {
t.Errorf("want 4, got %v", s.DialCounter)
}
// no valid endpoints as the only connection is marked down
// **** It tests the behavior when retryCount is odd.
// When the retryCount is even, the error should be "retry: err". ****
s.Reset()
sbc = &sandboxConn{mustFailRetry: retryCount + 1}
s.MapTestConn("0", sbc)
err = f()
want = fmt.Sprintf("shard, host: %v.0.replica, %+v, no valid endpoint", name, nil)
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_INTERNAL_ERROR)
// Ensure we dialed 2 times before failing.
if s.DialCounter != 2 {
t.Errorf("want 2, got %v", s.DialCounter)
}
// Ensure we executed 2 times before failing.
if execCount := sbc.ExecCount.Get(); execCount != 2 {
t.Errorf("want 2, got %v", execCount)
}
// retry error (one failure)
s.Reset()
sbc = &sandboxConn{mustFailRetry: 1}
s.MapTestConn("0", sbc)
err = f()
if err != nil {
t.Errorf("want nil, got %v", err)
}
// Ensure we dialed twice (second one succeeded)
if s.DialCounter != 2 {
t.Errorf("want 2, got %v", s.DialCounter)
}
// Ensure we executed twice (second one succeeded)
if execCount := sbc.ExecCount.Get(); execCount != 2 {
t.Errorf("want 2, got %v", execCount)
}
// fatal error (one failure)
s.Reset()
sbc = &sandboxConn{mustFailFatal: 1}
s.MapTestConn("0", sbc)
err = f()
// streaming queries don't retry on fatal
if streaming {
want = fmt.Sprintf("shard, host: %v.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , fatal: err", name)
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_INTERNAL_ERROR)
} else {
if err != nil {
t.Errorf("want nil, got %v", err)
}
}
wantCounter := 2
if streaming {
wantCounter = 1
}
// Ensure we dialed twice (second one succeeded)
if s.DialCounter != wantCounter {
t.Errorf("want %v, got %v", wantCounter, s.DialCounter)
}
// Ensure we executed twice (second one succeeded)
if execCount := sbc.ExecCount.Get(); execCount != int64(wantCounter) {
t.Errorf("want %v, got %v", wantCounter, execCount)
}
// server error
s.Reset()
sbc = &sandboxConn{mustFailServer: 1}
s.MapTestConn("0", sbc)
err = f()
want = fmt.Sprintf("shard, host: %v.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , error: err", name)
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_BAD_INPUT)
// Ensure we did not redial.
if s.DialCounter != 1 {
t.Errorf("want 1, got %v", s.DialCounter)
}
// Ensure we did not re-execute.
if execCount := sbc.ExecCount.Get(); execCount != 1 {
t.Errorf("want 1, got %v", execCount)
}
// conn error (one failure)
// no retry on OperationalError
s.Reset()
sbc = &sandboxConn{mustFailConn: 1}
s.MapTestConn("0", sbc)
err = f()
want = fmt.Sprintf("shard, host: %v.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , error: conn", name)
verifyShardConnError(t, err, want, vtrpcpb.ErrorCode_UNKNOWN_ERROR)
if err == nil || err.Error() != want {
t.Errorf("want %v, got %v", want, err)
}
// Ensure we did not redail.
if s.DialCounter != 1 {
t.Errorf("want 1, got %v", s.DialCounter)
}
// Ensure we did not re-execute.
if execCount := sbc.ExecCount.Get(); execCount != 1 {
t.Errorf("want 1, got %v", execCount)
}
// no failures
s.Reset()
sbc = &sandboxConn{}
s.MapTestConn("0", sbc)
err = f()
if err != nil {
t.Errorf("want nil, got %v", err)
}
if s.DialCounter != 1 {
t.Errorf("want 1, got %v", s.DialCounter)
}
if execCount := sbc.ExecCount.Get(); execCount != 1 {
t.Errorf("want 1, got %v", execCount)
}
}
// testShardConnTransact checks that a call made inside a transaction is never
// retried: for a retryable error and for a conn error alike, f must return
// the first error unchanged and the connection must be executed exactly once.
// name is the sandbox to create; f issues the in-transaction call.
func testShardConnTransact(t *testing.T, name string, f func() error) {
	sb := createSandbox(name)

	// expectSingleFailure maps conn as the only connection of shard "0",
	// invokes f and verifies both the error text (wantFormat expanded with
	// name) and that conn was executed only once.
	expectSingleFailure := func(conn *sandboxConn, wantFormat string) {
		sb.MapTestConn("0", conn)
		got := f()
		expected := fmt.Sprintf(wantFormat, name)
		if got == nil || got.Error() != expected {
			t.Errorf("want %s, got %v", expected, got)
		}
		// Should not retry if we're in transaction
		if executions := conn.ExecCount.Get(); executions != 1 {
			t.Errorf("want 1, got %v", executions)
		}
	}

	// retry error
	expectSingleFailure(&sandboxConn{mustFailRetry: 3}, "shard, host: %v.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , retry: err")

	// conn error
	sb.Reset()
	expectSingleFailure(&sandboxConn{mustFailConn: 3}, "shard, host: %v.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , error: conn")
}
// TestShardConnBeginOther checks that a tx_pool_full failure on Begin is
// returned to the caller without redialing or re-executing.
func TestShardConnBeginOther(t *testing.T) {
	// tx_pool_full
	s := createSandbox("TestShardConnBeginOther")
	sbc := &sandboxConn{mustFailTxPool: 1}
	s.MapTestConn("0", sbc)
	// The expected message is constant, so no fmt.Sprintf is needed.
	want := "shard, host: TestShardConnBeginOther.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , tx_pool_full: err"
	sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", "TestShardConnBeginOther", "0", topodatapb.TabletType_REPLICA, 10*time.Millisecond, 3, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
	_, err := sdc.Begin(context.Background())
	if err == nil || err.Error() != want {
		t.Errorf("want %v, got %v", want, err)
	}
	// There should have been no redial.
	if s.DialCounter != 1 {
		t.Errorf("want 1, got %v", s.DialCounter)
	}
	// Account for 1 call to Begin.
	if execCount := sbc.ExecCount.Get(); execCount != 1 {
		t.Errorf("want 1, got %v", execCount)
	}
}
// TestShardConnStreamingRetry checks how StreamExecute reacts to failures: a
// retryable error is retried on a fresh dial and succeeds, while a fatal
// error is returned after a single attempt.
func TestShardConnStreamingRetry(t *testing.T) {
	sb := createSandbox("TestShardConnStreamingRetry")

	// stream opens a new ShardConn, starts one streaming query outside of a
	// transaction and returns the stream's final error.
	stream := func() error {
		conn := NewShardConn(context.Background(), new(sandboxTopo), "aa", "TestShardConnStreamingRetry", "0", topodatapb.TabletType_REPLICA, 10*time.Millisecond, 3, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		_, streamErr := conn.StreamExecute(context.Background(), "query", nil, 0)
		return streamErr()
	}

	// ERR_RETRY
	retryConn := &sandboxConn{mustFailRetry: 1}
	sb.MapTestConn("0", retryConn)
	if got := stream(); got != nil {
		t.Errorf("want nil, got %v", got)
	}
	if sb.DialCounter != 2 {
		t.Errorf("want 2, got %v", sb.DialCounter)
	}
	if executions := retryConn.ExecCount.Get(); executions != 2 {
		t.Errorf("want 2, got %v", executions)
	}

	// ERR_FATAL
	sb.Reset()
	fatalConn := &sandboxConn{mustFailFatal: 1}
	sb.MapTestConn("0", fatalConn)
	got := stream()
	expected := "shard, host: TestShardConnStreamingRetry.0.replica, host:\"0\" port_map:<key:\"vt\" value:1 > , fatal: err"
	if got == nil || got.Error() != expected {
		t.Errorf("want %v, got %v", expected, got)
	}
	if sb.DialCounter != 1 {
		t.Errorf("want 1, got %v", sb.DialCounter)
	}
	if executions := fatalConn.ExecCount.Get(); executions != 1 {
		t.Errorf("want 1, got %v", executions)
	}
}
// TestShardConnTimeout checks how long Execute takes, and whether it fails
// with a timeout error, when the sandbox makes dials time out under different
// combinations of total and per-connection connect timeouts.
func TestShardConnTimeout(t *testing.T) {
	s := createSandbox("TestShardConnTimeout")

	// timedExecute builds a ShardConn with the given per-connection timeout
	// (the total timeout is always connTimeoutTotal), runs one query and
	// returns the time spent in Execute together with its error.
	timedExecute := func(perConnTimeout time.Duration) (time.Duration, error) {
		sdc := NewShardConn(context.Background(), new(sandboxTopo), "aa", "TestShardConnTimeout", "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, perConnTimeout, connLife, connectTimings)
		startTime := time.Now()
		_, err := sdc.Execute(context.Background(), "query", nil, 0)
		return time.Since(startTime), err
	}

	// case 1: one endpoint, per conn timeout becomes total timeout
	s.DialMustTimeout = 1
	s.MapTestConn("0", &sandboxConn{})
	execDuration, err := timedExecute(connTimeoutPerConn)
	if execDuration < connTimeoutTotal {
		t.Errorf("timeout too fast, want > %v, got %v", connTimeoutTotal, execDuration)
	}
	if execDuration > connTimeoutTotal*2 {
		t.Errorf("timeout too slow, want < %v, got %v", connTimeoutTotal*2, execDuration)
	}
	if err == nil || !strings.Contains(err.Error(), "timeout") {
		t.Errorf("want timeout in error, got %v", err)
	}

	// case 2: per conn timeout is longer than total timeout (by error)
	s.Reset()
	s.DialMustTimeout = 1
	s.MapTestConn("0", &sandboxConn{})
	execDuration, err = timedExecute(connTimeoutTotal * 3)
	if execDuration < connTimeoutTotal {
		t.Errorf("timeout too fast, want > %v, got %v", connTimeoutTotal, execDuration)
	}
	if execDuration > connTimeoutTotal*2 {
		t.Errorf("timeout too slow, want < %v, got %v", connTimeoutTotal*2, execDuration)
	}
	if err == nil || !strings.Contains(err.Error(), "timeout") {
		t.Errorf("want timeout in error, got %v", err)
	}

	// case 3: per conn timeout is honored if there are more than one endpoint
	// with one endpoint times out
	s.Reset()
	s.DialMustTimeout = 1
	s.MapTestConn("0", &sandboxConn{})
	s.MapTestConn("0", &sandboxConn{})
	execDuration, err = timedExecute(connTimeoutPerConn)
	if execDuration < connTimeoutPerConn {
		t.Errorf("timeout too fast, want > %v, got %v", connTimeoutPerConn, execDuration)
	}
	if execDuration > connTimeoutTotal {
		t.Errorf("timeout too slow, want < %v, got %v", connTimeoutTotal, execDuration)
	}
	if err != nil {
		t.Errorf("want nil, got %v", err)
	}

	// case 4: per conn timeout is honored if there are more than one endpoint
	// with two endpoints time out
	s.Reset()
	s.DialMustTimeout = 2
	s.MapTestConn("0", &sandboxConn{})
	s.MapTestConn("0", &sandboxConn{})
	execDuration, err = timedExecute(connTimeoutPerConn)
	if execDuration < connTimeoutTotal {
		t.Errorf("timeout too fast, want > %v, got %v", connTimeoutTotal, execDuration)
	}
	if execDuration > connTimeoutTotal*2 {
		t.Errorf("timeout too slow, want < %v, got %v", connTimeoutTotal*2, execDuration)
	}
	if err == nil || !strings.Contains(err.Error(), "timeout") {
		t.Errorf("want timeout in error, got %v", err)
	}
}
// TestShardConnReconnect walks through endpoint resolution and reconnect
// scenarios. Each case measures how long a single Execute takes (to tell an
// immediate retry from one that waited retryDelay) and checks how often the
// endpoints were resolved and the sandbox connections executed.
func TestShardConnReconnect(t *testing.T) {
	// Local overrides of the package-level retry settings.
	retryDelay := 50 * time.Millisecond
	retryCount := 5
	sb := createSandbox("TestShardConnReconnect")

	// timedExecute builds a fresh ShardConn on topo, runs one query outside
	// of a transaction and returns the time spent in Execute and its error.
	timedExecute := func(topo *sandboxTopo) (time.Duration, error) {
		conn := NewShardConn(context.Background(), topo, "aa", "TestShardConnReconnect", "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, connLife, connectTimings)
		begin := time.Now()
		_, execErr := conn.Execute(context.Background(), "query", nil, 0)
		return time.Since(begin), execErr
	}
	// mapConns registers the given connections, in order, for shard "0".
	mapConns := func(conns ...*sandboxConn) {
		for _, c := range conns {
			sb.MapTestConn("0", c)
		}
	}

	// case 1: resolved 0 endpoint, return error
	elapsed, err := timedExecute(new(sandboxTopo))
	if lower := retryDelay * time.Duration(retryCount); elapsed < lower {
		t.Errorf("retry too fast, want %v, got %v", lower, elapsed)
	}
	if upper := retryDelay * time.Duration(retryCount+1); elapsed > upper {
		t.Errorf("retry too slow, want %v, got %v", upper, elapsed)
	}
	if err == nil {
		t.Errorf("want error, got nil")
	}
	if resolved := sb.EndPointCounter.Get(); resolved != int64(retryCount+1) {
		t.Errorf("want %v, got %v", retryCount+1, resolved)
	}

	// case 2.1: resolve 1 endpoint and connect failed -> resolve and retry without spamming
	sb.Reset()
	sb.DialMustFail = 1
	mapConns(&sandboxConn{})
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed < retryDelay {
		t.Errorf("want no spam delay %v, got %v", retryDelay, elapsed)
	}
	if elapsed > retryDelay*2 {
		t.Errorf("want instant resolve %v, got %v", retryDelay, elapsed)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 2 {
		t.Errorf("want 2, got %v", resolved)
	}

	// case 2.2: resolve 1 endpoint and execute failed with retryable error -> resolve and retry without spamming
	sb.Reset()
	mapConns(&sandboxConn{mustFailRetry: 1})
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed < retryDelay {
		t.Errorf("want no spam delay %v, got %v", retryDelay, elapsed)
	}
	if elapsed > retryDelay*2 {
		t.Errorf("want instant resolve %v, got %v", retryDelay, elapsed)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 3 {
		t.Errorf("want 3, got %v", resolved)
	}

	// case 2.3: resolve 1 endpoint and execute failed with OperationalError -> no retry
	sb.Reset()
	mapConns(&sandboxConn{mustFailConn: 1})
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed > retryDelay {
		t.Errorf("want instant fail %v, got %v", retryDelay, elapsed)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 1 {
		t.Errorf("want 1, got %v", resolved)
	}

	// case 3.1: resolve 3 endpoints, failed connection to 1st one -> resolve and connect to 2nd one
	sb.Reset()
	sb.DialMustFail = 1
	connA := &sandboxConn{}
	connB := &sandboxConn{}
	connC := &sandboxConn{}
	mapConns(connA, connB, connC)
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed >= retryDelay {
		t.Errorf("want no delay, got %v", elapsed)
	}
	if total := connA.ExecCount.Get() + connB.ExecCount.Get() + connC.ExecCount.Get(); total != 1 {
		t.Errorf("want 1, got %v", total)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 1 {
		t.Errorf("want 1, got %v", resolved)
	}

	// case 3.2: resolve 3 endpoints, failed execution on 1st one -> resolve and execute on 2nd one
	sb.Reset()
	useCount := 0
	// failFirstUse makes whichever connection is used first fail once with
	// a retryable error; later uses are only counted.
	failFirstUse := func(conn *sandboxConn) {
		if useCount == 0 {
			conn.mustFailRetry = 1
		}
		useCount++
	}
	connA = &sandboxConn{onConnUse: failFirstUse}
	connB = &sandboxConn{onConnUse: failFirstUse}
	connC = &sandboxConn{onConnUse: failFirstUse}
	mapConns(connA, connB, connC)
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed >= retryDelay {
		t.Errorf("want no delay, got %v", elapsed)
	}
	{
		nA, nB, nC := connA.ExecCount.Get(), connB.ExecCount.Get(), connC.ExecCount.Get()
		if total := nA + nB + nC; total != 2 {
			t.Errorf("want 2, got %v", total)
		}
		if nA > 1 || nB > 1 || nC > 1 {
			t.Errorf("want no more than 1, got %v,%v,%v", nA, nB, nC)
		}
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 2 {
		t.Errorf("want 2, got %v", resolved)
	}

	// case 4: resolve 3 endpoints, failed connection to 1st, failed execution on 2nd -> resolve and execute on 3rd one
	sb.Reset()
	sb.DialMustFail = 1
	useCount = 0
	connA = &sandboxConn{onConnUse: failFirstUse}
	connB = &sandboxConn{onConnUse: failFirstUse}
	connC = &sandboxConn{onConnUse: failFirstUse}
	mapConns(connA, connB, connC)
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed >= retryDelay {
		t.Errorf("want no delay, got %v", elapsed)
	}
	{
		nA, nB, nC := connA.ExecCount.Get(), connB.ExecCount.Get(), connC.ExecCount.Get()
		if total := nA + nB + nC; total != 2 {
			t.Errorf("want 2, got %v", total)
		}
		if nA > 1 || nB > 1 || nC > 1 {
			t.Errorf("want no more than 1, got %v,%v,%v", nA, nB, nC)
		}
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 2 {
		t.Errorf("want 2, got %v", resolved)
	}

	// case 5: always resolve the same 3 endpoints, all 3 execution failed -> resolve and use the first one
	sb.Reset()
	var firstUsed *sandboxConn
	useCount = 0
	// onUse is replaced per case; each connection keeps the callback it was
	// built with.
	onUse := func(conn *sandboxConn) {
		if useCount == 0 {
			firstUsed = conn
		}
		useCount++
	}
	connA = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connB = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connC = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	mapConns(connA, connB, connC)
	elapsed, _ = timedExecute(new(sandboxTopo))
	if elapsed < retryDelay {
		t.Errorf("want no spam delay %v, got %v", retryDelay, elapsed)
	}
	if elapsed > retryDelay*2 {
		t.Errorf("want instant resolve %v, got %v", retryDelay, elapsed)
	}
	for _, conn := range []*sandboxConn{connA, connB, connC} {
		// Only the connection used first is expected to run a second time.
		wantExecs := int64(1)
		if conn == firstUsed {
			wantExecs = 2
		}
		if gotExecs := conn.ExecCount.Get(); gotExecs != wantExecs {
			t.Errorf("want %v, got %v", wantExecs, gotExecs)
		}
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 5 {
		t.Errorf("want 5, got %v", resolved)
	}

	// case 6: resolve 3 endpoints with 1st execution failed, resolve to a new set without the failed one -> try a random one
	sb.Reset()
	firstUsed = nil
	onUse = func(conn *sandboxConn) {
		if firstUsed == nil {
			firstUsed = conn
			conn.mustFailRetry = 1
		}
	}
	connA = &sandboxConn{onConnUse: onUse}
	connB = &sandboxConn{onConnUse: onUse}
	connC = &sandboxConn{onConnUse: onUse}
	connD := &sandboxConn{}
	mapConns(connA, connB, connC)
	resolveCount := 0
	// On the second resolution, swap the failed connection for connD.
	onResolve := func(st *sandboxTopo) {
		if resolveCount == 1 {
			sb.MapTestConn("0", connD)
			sb.DeleteTestConn("0", firstUsed)
		}
		resolveCount++
	}
	elapsed, _ = timedExecute(&sandboxTopo{callbackGetEndPoints: onResolve})
	if elapsed >= retryDelay {
		t.Errorf("want no delay, got %v", elapsed)
	}
	if gotExecs := firstUsed.ExecCount.Get(); gotExecs != 1 {
		t.Errorf("want 1, got %v", gotExecs)
	}
	var remainingExecs int64
	for _, conn := range sb.TestConns["0"] {
		remainingExecs += conn.(*sandboxConn).ExecCount.Get()
	}
	if remainingExecs != 1 {
		t.Errorf("want 1, got %v", remainingExecs)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 2 {
		t.Errorf("want 2, got %v", resolved)
	}

	// case 7: resolve 3 bad endpoints with execution failed
	// upon resolve, 2nd bad endpoint changed address (once only) but still fails on execution
	// -> should only use the 1st endpoint after all other endpoints are tried out
	sb.Reset()
	var secondUsed *sandboxConn
	useCount = 0
	onUse = func(conn *sandboxConn) {
		switch useCount {
		case 0:
			firstUsed = conn
		case 1:
			secondUsed = conn
		}
		useCount++
	}
	connA = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connB = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connC = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connD = &sandboxConn{mustFailRetry: 1}
	mapConns(connA, connB, connC)
	resolveCount = 0
	// On the third resolution, swap the second-used connection for connD.
	onResolve = func(st *sandboxTopo) {
		if resolveCount == 2 {
			sb.MapTestConn("0", connD)
			sb.DeleteTestConn("0", secondUsed)
		}
		resolveCount++
	}
	elapsed, _ = timedExecute(&sandboxTopo{callbackGetEndPoints: onResolve})
	if elapsed < retryDelay {
		t.Errorf("want no spam delay %v, got %v", retryDelay, elapsed)
	}
	if elapsed > retryDelay*2 {
		t.Errorf("want instant resolve %v, got %v", retryDelay, elapsed)
	}
	if gotExecs := secondUsed.ExecCount.Get(); gotExecs != 1 {
		t.Errorf("want 1, got %v", gotExecs)
	}
	if gotExecs := firstUsed.ExecCount.Get(); gotExecs != 2 {
		t.Errorf("want 2, got %v", gotExecs)
	}
	for _, conn := range sb.TestConns["0"] {
		if gotExecs := conn.(*sandboxConn).ExecCount.Get(); conn != firstUsed && gotExecs != 1 {
			t.Errorf("want 1, got %v", gotExecs)
		}
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 6 {
		t.Errorf("want 6, got %v", resolved)
	}

	// case 8: resolve 3 bad endpoints with execution failed,
	// after resolve, all endpoints are valid on new addresses
	// -> random use an endpoint without delay
	sb.Reset()
	firstUsed = nil
	onUse = func(conn *sandboxConn) {
		if firstUsed == nil {
			firstUsed = conn
		}
	}
	connA = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connB = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connC = &sandboxConn{mustFailRetry: 1, onConnUse: onUse}
	connD = &sandboxConn{}
	connE := &sandboxConn{}
	connF := &sandboxConn{}
	mapConns(connA, connB, connC)
	resolveCount = 0
	// On the second resolution, replace all failing connections with
	// healthy ones.
	onResolve = func(st *sandboxTopo) {
		if resolveCount == 1 {
			mapConns(connD, connE, connF)
			sb.DeleteTestConn("0", connA)
			sb.DeleteTestConn("0", connB)
			sb.DeleteTestConn("0", connC)
		}
		resolveCount++
	}
	elapsed, _ = timedExecute(&sandboxTopo{callbackGetEndPoints: onResolve})
	if elapsed >= retryDelay {
		t.Errorf("want no delay, got %v", elapsed)
	}
	if gotExecs := firstUsed.ExecCount.Get(); gotExecs != 1 {
		t.Errorf("want 1, got %v", gotExecs)
	}
	for _, conn := range []*sandboxConn{connA, connB, connC} {
		if gotExecs := conn.ExecCount.Get(); conn != firstUsed && gotExecs != 0 {
			t.Errorf("want 0, got %v", gotExecs)
		}
	}
	if total := connD.ExecCount.Get() + connE.ExecCount.Get() + connF.ExecCount.Get(); total != 1 {
		t.Errorf("want 1, got %v", total)
	}
	if resolved := sb.EndPointCounter.Get(); resolved != 2 {
		t.Errorf("want 2, got %v", resolved)
	}
}
// TestReplicaShardConnLife checks that a REPLICA ShardConn created with a
// 10ms connection life dials again when a query is issued 20ms after the
// first one.
func TestReplicaShardConnLife(t *testing.T) {
	// auto-reconnect for non-master
	const name = "TestReplicaShardConnLife"
	retryDelay := 10 * time.Millisecond
	retryCount := 5

	sb := createSandbox(name)
	sb.MapTestConn("0", &sandboxConn{})
	conn := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_REPLICA, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, 10*time.Millisecond, connectTimings)

	// First query: a single dial.
	conn.Execute(context.Background(), "query", nil, 0)
	if dials := sb.DialCounter; dials != 1 {
		t.Errorf("DialCounter: %d, want 1", dials)
	}

	// Wait longer than the connection life; the next query must redial.
	time.Sleep(20 * time.Millisecond)
	conn.Execute(context.Background(), "query", nil, 0)
	if dials := sb.DialCounter; dials != 2 {
		t.Errorf("DialCounter: %d, want 2", dials)
	}

	conn.Close()
}
// TestMasterShardConnLife checks that a MASTER ShardConn created with a 10ms
// connection life does not dial again when a query is issued 20ms after the
// first one.
func TestMasterShardConnLife(t *testing.T) {
	// Do not auto-reconnect for master
	const name = "TestMasterShardConnLife"
	retryDelay := 10 * time.Millisecond
	retryCount := 5

	sb := createSandbox(name)
	sb.MapTestConn("0", &sandboxConn{})
	conn := NewShardConn(context.Background(), new(sandboxTopo), "aa", name, "0", topodatapb.TabletType_MASTER, retryDelay, retryCount, connTimeoutTotal, connTimeoutPerConn, 10*time.Millisecond, connectTimings)

	// First query: a single dial.
	conn.Execute(context.Background(), "query", nil, 0)
	if dials := sb.DialCounter; dials != 1 {
		t.Errorf("DialCounter: %d, want 1", dials)
	}

	// Even after the connection life has passed, the dial count stays at 1.
	time.Sleep(20 * time.Millisecond)
	conn.Execute(context.Background(), "query", nil, 0)
	if dials := sb.DialCounter; dials != 1 {
		t.Errorf("DialCounter: %d, want 1", dials)
	}

	conn.Close()
}