зеркало из https://github.com/github/vitess-gh.git
Flakiness Fix: Tests for GracefulPrimaryTakeover (#11355)
* feat: add some helpful logging to graceful primary takeover call Signed-off-by: Manan Gupta <manan@planetscale.com> * test: retry API calls if they return cannot deduce cluster primary Signed-off-by: Manan Gupta <manan@planetscale.com> Signed-off-by: Manan Gupta <manan@planetscale.com>
This commit is contained in:
Родитель
5394719517
Коммит
7b5eb86f58
|
@ -72,8 +72,8 @@ func TestGracefulPrimaryTakeover(t *testing.T) {
|
|||
// we try to set the same end timestamp on the recovery of first 2 failures which fails the unique constraint
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status)
|
||||
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status, resp)
|
||||
|
||||
// check that the replica gets promoted
|
||||
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
|
||||
|
@ -108,8 +108,8 @@ func TestGracefulPrimaryTakeoverNoTarget(t *testing.T) {
|
|||
// check that the replication is setup correctly before we failover
|
||||
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second)
|
||||
|
||||
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort))
|
||||
assert.Equal(t, 200, status)
|
||||
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort))
|
||||
assert.Equal(t, 200, status, resp)
|
||||
|
||||
// check that the replica gets promoted
|
||||
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
|
||||
|
@ -147,15 +147,15 @@ func TestGracefulPrimaryTakeoverAuto(t *testing.T) {
|
|||
// check that the replication is setup correctly before we failover
|
||||
utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)
|
||||
|
||||
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status)
|
||||
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status, resp)
|
||||
|
||||
// check that the replica gets promoted
|
||||
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
|
||||
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second)
|
||||
|
||||
status, _ = utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status)
|
||||
status, resp = utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort))
|
||||
assert.Equal(t, 200, status, resp)
|
||||
|
||||
// check that the primary gets promoted back
|
||||
utils.CheckPrimaryTablet(t, clusterInfo, primary, true)
|
||||
|
@ -189,8 +189,8 @@ func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) {
|
|||
// newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too
|
||||
utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second)
|
||||
|
||||
status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort))
|
||||
assert.Equal(t, 500, status)
|
||||
status, response := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort))
|
||||
assert.Equal(t, 500, status, response)
|
||||
assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure")
|
||||
|
||||
// check that the cross-cell replica doesn't get promoted and the previous primary is still the primary
|
||||
|
|
|
@ -730,6 +730,28 @@ func MakeAPICall(t *testing.T, url string) (status int, response string) {
|
|||
return res.StatusCode, body
|
||||
}
|
||||
|
||||
// MakeAPICallRetry is used to make an API call and retry if we see a 500 - Cannot deduce cluster primary output. This happens when we haven't
|
||||
// finished refreshing information after a ClusterHasNoPrimary recovery and call GracefulPrimaryTakeover. This leads to us seeing no Primary tablet
|
||||
// in the database of VTOrc. This is ephemeral though, because we will refresh the new-primary's information as part of the ClusterHasNoPrimary recovery flow.
|
||||
func MakeAPICallRetry(t *testing.T, url string) (status int, response string) {
|
||||
t.Helper()
|
||||
timeout := time.After(10 * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-timeout:
|
||||
t.Fatal("timed out waiting for api to work")
|
||||
return
|
||||
default:
|
||||
status, response = MakeAPICall(t, url)
|
||||
if status == 500 && strings.Contains(response, "Cannot deduce cluster primary") {
|
||||
time.Sleep(1 * time.Second)
|
||||
break
|
||||
}
|
||||
return status, response
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SetupNewClusterSemiSync is used to setup a new cluster with semi-sync set.
|
||||
// It creates a cluster with 4 tablets, one of which is a Replica
|
||||
func SetupNewClusterSemiSync(t *testing.T) *VTOrcClusterInfo {
|
||||
|
|
|
@ -1313,6 +1313,7 @@ func ForcePrimaryTakeover(clusterName string, destination *inst.Instance) (topol
|
|||
// It will point old primary at the newly promoted primary at the correct coordinates.
|
||||
// All of this is accomplished via PlannedReparentShard operation. It is an idempotent operation, look at its documentation for more detail
|
||||
func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey) (topologyRecovery *TopologyRecovery, err error) {
|
||||
log.Infof("GracefulPrimaryTakeover for shard %v", clusterName)
|
||||
clusterPrimaries, err := inst.ReadClusterPrimary(clusterName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Cannot deduce cluster primary for %+v; error: %+v", clusterName, err)
|
||||
|
@ -1321,6 +1322,7 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey
|
|||
return nil, fmt.Errorf("Cannot deduce cluster primary for %+v. Found %+v potential primarys", clusterName, len(clusterPrimaries))
|
||||
}
|
||||
clusterPrimary := clusterPrimaries[0]
|
||||
log.Infof("GracefulPrimaryTakeover for shard %v, current primary - %v", clusterName, clusterPrimary.InstanceAlias)
|
||||
|
||||
analysisEntry, err := forceAnalysisEntry(clusterName, inst.GraceFulPrimaryTakeover, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key)
|
||||
if err != nil {
|
||||
|
|
Загрузка…
Ссылка в новой задаче