From 7b5eb86f58fc8ce4b25d18266de565cc2e9f0ddf Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Mon, 26 Sep 2022 21:33:51 +0530 Subject: [PATCH] Flakiness Fix: Tests for GracefulPrimaryTakeover (#11355) * feat: add some helpful logging to graceful primary takeover call Signed-off-by: Manan Gupta * test: retry API calls if they return cannot deduce cluster primary Signed-off-by: Manan Gupta Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 20 ++++++++--------- go/test/endtoend/vtorc/utils/utils.go | 22 +++++++++++++++++++ go/vt/vtorc/logic/topology_recovery.go | 2 ++ 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index 7a98a234a4..f859a557d3 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -72,8 +72,8 @@ func TestGracefulPrimaryTakeover(t *testing.T) { // we try to set the same end timestamp on the recovery of first 2 failures which fails the unique constraint time.Sleep(1 * time.Second) - status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort)) - assert.Equal(t, 200, status) + status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort)) + assert.Equal(t, 200, status, resp) // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) @@ -108,8 +108,8 @@ func TestGracefulPrimaryTakeoverNoTarget(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second) - status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort)) - assert.Equal(t, 200, status) + status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort)) + assert.Equal(t, 200, status, resp) // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) @@ -147,15 +147,15 @@ func TestGracefulPrimaryTakeoverAuto(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) - status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort)) - assert.Equal(t, 200, status) + status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort)) + assert.Equal(t, 200, status, resp) // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second) - status, _ = utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort)) - assert.Equal(t, 200, status) + status, resp = utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort)) + assert.Equal(t, 200, status, resp) // check that the primary gets promoted back utils.CheckPrimaryTablet(t, clusterInfo, primary, true) @@ -189,8 +189,8 @@ func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) { // newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second) - status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort)) - assert.Equal(t, 500, status) + status, response := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort)) + assert.Equal(t, 500, status, response) assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure") // check that the cross-cell replica doesn't get promoted and the previous primary is still the primary diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index c2a6beaed9..b51fbca7b0 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -730,6 +730,28 @@ func MakeAPICall(t *testing.T, url string) (status int, response string) { return res.StatusCode, body } +// MakeAPICallRetry is used to make an API call and retry if we see a 500 - Cannot deduce cluster primary output. This happens when we haven't +// finished refreshing information after a ClusterHasNoPrimary recovery and call GracefulPrimaryTakeover. This leads to us seeing no Primary tablet +// in the database of VTOrc. This is ephemeral though, because we will refresh the new-primary's information as part of the ClusterHasNoPrimary recovery flow. +func MakeAPICallRetry(t *testing.T, url string) (status int, response string) { + t.Helper() + timeout := time.After(10 * time.Second) + for { + select { + case <-timeout: + t.Fatal("timed out waiting for api to work") + return + default: + status, response = MakeAPICall(t, url) + if status == 500 && strings.Contains(response, "Cannot deduce cluster primary") { + time.Sleep(1 * time.Second) + break + } + return status, response + } + } +} + // SetupNewClusterSemiSync is used to setup a new cluster with semi-sync set. // It creates a cluster with 4 tablets, one of which is a Replica func SetupNewClusterSemiSync(t *testing.T) *VTOrcClusterInfo { diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index bdc558f28a..d44eed4f9f 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -1313,6 +1313,7 @@ func ForcePrimaryTakeover(clusterName string, destination *inst.Instance) (topol // It will point old primary at the newly promoted primary at the correct coordinates. // All of this is accomplished via PlannedReparentShard operation. It is an idempotent operation, look at its documentation for more detail func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey) (topologyRecovery *TopologyRecovery, err error) { + log.Infof("GracefulPrimaryTakeover for shard %v", clusterName) clusterPrimaries, err := inst.ReadClusterPrimary(clusterName) if err != nil { return nil, fmt.Errorf("Cannot deduce cluster primary for %+v; error: %+v", clusterName, err) @@ -1321,6 +1322,7 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey return nil, fmt.Errorf("Cannot deduce cluster primary for %+v. Found %+v potential primarys", clusterName, len(clusterPrimaries)) } clusterPrimary := clusterPrimaries[0] + log.Infof("GracefulPrimaryTakeover for shard %v, current primary - %v", clusterName, clusterPrimary.InstanceAlias) analysisEntry, err := forceAnalysisEntry(clusterName, inst.GraceFulPrimaryTakeover, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key) if err != nil {