Flakiness Fix: Tests for GracefulPrimaryTakeover (#11355)

* feat: add some helpful logging to graceful primary takeover call

Signed-off-by: Manan Gupta <manan@planetscale.com>

* test: retry API calls if they return cannot deduce cluster primary

Signed-off-by: Manan Gupta <manan@planetscale.com>

Signed-off-by: Manan Gupta <manan@planetscale.com>
This commit is contained in:
Manan Gupta 2022-09-26 21:33:51 +05:30 committed by GitHub
Parent 5394719517
Commit 7b5eb86f58
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 10 deletions

View file

@@ -72,8 +72,8 @@ func TestGracefulPrimaryTakeover(t *testing.T) {
// we try to set the same end timestamp on the recovery of first 2 failures which fails the unique constraint
time.Sleep(1 * time.Second)
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort))
assert.Equal(t, 200, status)
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort, replica.MySQLPort))
assert.Equal(t, 200, status, resp)
// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
@@ -108,8 +108,8 @@ func TestGracefulPrimaryTakeoverNoTarget(t *testing.T) {
// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second)
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort))
assert.Equal(t, 200, status)
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, curPrimary.MySQLPort))
assert.Equal(t, 200, status, resp)
// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
@@ -147,15 +147,15 @@ func TestGracefulPrimaryTakeoverAuto(t *testing.T) {
// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)
status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort))
assert.Equal(t, 200, status)
status, resp := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, replica.MySQLPort))
assert.Equal(t, 200, status, resp)
// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second)
status, _ = utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort))
assert.Equal(t, 200, status)
status, resp = utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover-auto/localhost/%d/", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, replica.MySQLPort))
assert.Equal(t, 200, status, resp)
// check that the primary gets promoted back
utils.CheckPrimaryTablet(t, clusterInfo, primary, true)
@@ -189,8 +189,8 @@ func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) {
// newly started tablet does not replicate from anyone yet, we will allow vtorc to fix this too
utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second)
status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort))
assert.Equal(t, 500, status)
status, response := utils.MakeAPICallRetry(t, fmt.Sprintf("http://localhost:%d/api/graceful-primary-takeover/localhost/%d/localhost/%d", clusterInfo.ClusterInstance.VTOrcProcesses[0].WebPort, primary.MySQLPort, crossCellReplica1.MySQLPort))
assert.Equal(t, 500, status, response)
assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure")
// check that the cross-cell replica doesn't get promoted and the previous primary is still the primary

View file

@@ -730,6 +730,28 @@ func MakeAPICall(t *testing.T, url string) (status int, response string) {
return res.StatusCode, body
}
// MakeAPICallRetry calls MakeAPICall and keeps retrying, for up to 10 seconds, as long as the
// call returns a 500 with a "Cannot deduce cluster primary" message. That response shows up when
// GracefulPrimaryTakeover is invoked before VTOrc has finished refreshing its database after a
// ClusterHasNoPrimary recovery, so no primary tablet is visible yet. The condition is ephemeral:
// the recovery flow refreshes the new primary's information shortly afterwards, hence the retry.
func MakeAPICallRetry(t *testing.T, url string) (status int, response string) {
	t.Helper()
	timeout := time.After(10 * time.Second)
	for {
		// Give up (and fail the test) once the overall deadline has passed.
		select {
		case <-timeout:
			t.Fatal("timed out waiting for api to work")
			return
		default:
		}
		status, response = MakeAPICall(t, url)
		// Only the transient "Cannot deduce cluster primary" 500 is retried;
		// every other result (success or failure) is returned to the caller.
		if status != 500 || !strings.Contains(response, "Cannot deduce cluster primary") {
			return status, response
		}
		time.Sleep(1 * time.Second)
	}
}
// SetupNewClusterSemiSync is used to setup a new cluster with semi-sync set.
// It creates a cluster with 4 tablets, one of which is a Replica
func SetupNewClusterSemiSync(t *testing.T) *VTOrcClusterInfo {

View file

@@ -1313,6 +1313,7 @@ func ForcePrimaryTakeover(clusterName string, destination *inst.Instance) (topol
// It will point old primary at the newly promoted primary at the correct coordinates.
// All of this is accomplished via PlannedReparentShard operation. It is an idempotent operation, look at its documentation for more detail
func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey) (topologyRecovery *TopologyRecovery, err error) {
log.Infof("GracefulPrimaryTakeover for shard %v", clusterName)
clusterPrimaries, err := inst.ReadClusterPrimary(clusterName)
if err != nil {
return nil, fmt.Errorf("Cannot deduce cluster primary for %+v; error: %+v", clusterName, err)
@@ -1321,6 +1322,7 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey
return nil, fmt.Errorf("Cannot deduce cluster primary for %+v. Found %+v potential primarys", clusterName, len(clusterPrimaries))
}
clusterPrimary := clusterPrimaries[0]
log.Infof("GracefulPrimaryTakeover for shard %v, current primary - %v", clusterName, clusterPrimary.InstanceAlias)
analysisEntry, err := forceAnalysisEntry(clusterName, inst.GraceFulPrimaryTakeover, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key)
if err != nil {