From a65a7f6fa7191a05a86da80d79a909052d85b0d6 Mon Sep 17 00:00:00 2001 From: deepthi Date: Fri, 30 Aug 2019 17:59:34 -0700 Subject: [PATCH 1/8] along with old master also update any other tablet that thinks it is master to REPLICA Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 51 +++++++++++++++---- .../testlib/reparent_external_test.go | 2 +- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index 7b066ed1e1..d53c88740d 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -143,11 +143,12 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context var oldMasterTablet *topodatapb.Tablet oldMasterAlias := si.MasterAlias - // Update the tablet records concurrently. + // Update the new and old master tablet records concurrently. event.DispatchUpdate(ev, "updating old and new master tablet records") log.Infof("finalizeTabletExternallyReparented: updating tablet records") wg.Add(1) go func() { + log.Infof("finalizeTabletExternallyReparented: updating tablet record for new master: %v", agent.TabletAlias) defer wg.Done() // Update our own record to master. _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, @@ -160,9 +161,14 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context } }() + tablet := agent.Tablet() + tabletMap, err := agent.TopoServer.GetTabletMapForShard(ctx, tablet.Keyspace, tablet.Shard) + // make the channel buffer big enough that it doesn't block senders + tablets := make(chan topodatapb.Tablet, len(tabletMap)) if !topoproto.TabletAliasIsZero(oldMasterAlias) { wg.Add(1) go func() { + log.Infof("finalizeTabletExternallyReparented: updating tablet record for old master: %v", oldMasterAlias) defer wg.Done() // Forcibly demote the old master in topology, since we can't rely on the @@ -180,15 +186,38 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // We now know more about the old master, so add it to event data. ev.OldMaster = *oldMasterTablet + tablets <- *oldMasterTablet }() } - tablet := agent.Tablet() + // update any other tablets claiming to be MASTER also to REPLICA + for alias, tabletInfo := range tabletMap { + if alias != topoproto.TabletAliasString(agent.TabletAlias) && alias != topoproto.TabletAliasString(oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { + log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) + wg.Add(1) + go func() { + defer wg.Done() + var err error + tab, err := agent.TopoServer.UpdateTabletFields(ctx, tabletInfo.Tablet.Alias, + func(tablet *topodatapb.Tablet) error { + tablet.Type = topodatapb.TabletType_REPLICA + return nil + }) + if err != nil { + errs.RecordError(err) + return + } + tablets <- *tab + }() + } + } // Wait for the tablet records to be updated. At that point, any rebuild will // see the new master, so we're ready to mark the reparent as done in the // global shard record. wg.Wait() + // we waited for all goroutines to complete, so now close the channel + close(tablets) if errs.HasErrors() { return errs.Error() } @@ -221,22 +250,24 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context errs.RecordError(err) } }() - if !topoproto.TabletAliasIsZero(oldMasterAlias) { + wg.Wait() + + for tab := range tablets { + log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) wg.Add(1) - go func() { + go func(tablet *topodatapb.Tablet) { defer wg.Done() - // Tell the old master to re-read its tablet record and change its state. + // Tell the old master(s) to re-read its tablet record and change its state. // We don't need to put error into errs if this fails, but we need to wait - // for it to make sure that old master tablet is not stuck in the MASTER + // for it to make sure that an old master tablet is not stuck in the MASTER // state. tmc := tmclient.NewTabletManagerClient() - if err := tmc.RefreshState(ctx, oldMasterTablet); err != nil { - log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(oldMasterTablet.Alias), err) + if err := tmc.RefreshState(ctx, tablet); err != nil { + log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tablet.Alias), err) } - }() + }(&tab) } - wg.Wait() if errs.HasErrors() { return errs.Error() diff --git a/go/vt/wrangler/testlib/reparent_external_test.go b/go/vt/wrangler/testlib/reparent_external_test.go index 153bb8d0a7..79176b82cb 100644 --- a/go/vt/wrangler/testlib/reparent_external_test.go +++ b/go/vt/wrangler/testlib/reparent_external_test.go @@ -300,7 +300,7 @@ func TestTabletExternallyReparentedFailedOldMaster(t *testing.T) { t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) } if tablet.Type != topodatapb.TabletType_REPLICA { - t.Fatalf("old master should be spare but is: %v", tablet.Type) + t.Fatalf("old master should be replica but is: %v", tablet.Type) } } From 7bd6618620ccfc6ec45a71ba09f95ad66310afb5 Mon Sep 17 00:00:00 2001 From: deepthi Date: Mon, 9 Sep 2019 20:10:19 -0700 Subject: [PATCH 2/8] address review comments Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index d53c88740d..5b5a172c58 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -148,8 +148,8 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context log.Infof("finalizeTabletExternallyReparented: updating tablet records") wg.Add(1) go func() { - log.Infof("finalizeTabletExternallyReparented: updating tablet record for new master: %v", agent.TabletAlias) defer wg.Done() + log.Infof("finalizeTabletExternallyReparented: updating tablet record for new master: %v", agent.TabletAlias) // Update our own record to master. _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, func(tablet *topodatapb.Tablet) error { @@ -163,13 +163,17 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context tablet := agent.Tablet() tabletMap, err := agent.TopoServer.GetTabletMapForShard(ctx, tablet.Keyspace, tablet.Shard) + if err != nil { + log.Errorf("ignoring error %v from GetTabletMapForShard so that we can process any partial results", err) + } + // make the channel buffer big enough that it doesn't block senders - tablets := make(chan topodatapb.Tablet, len(tabletMap)) + tabletsToRefresh := make(chan topodatapb.Tablet, len(tabletMap)+1) if !topoproto.TabletAliasIsZero(oldMasterAlias) { wg.Add(1) go func() { - log.Infof("finalizeTabletExternallyReparented: updating tablet record for old master: %v", oldMasterAlias) defer wg.Done() + log.Infof("finalizeTabletExternallyReparented: updating tablet record for old master: %v", oldMasterAlias) // Forcibly demote the old master in topology, since we can't rely on the // old master to be up to change its own record. @@ -186,19 +190,20 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // We now know more about the old master, so add it to event data. ev.OldMaster = *oldMasterTablet - tablets <- *oldMasterTablet + tabletsToRefresh <- *oldMasterTablet }() } // update any other tablets claiming to be MASTER also to REPLICA - for alias, tabletInfo := range tabletMap { - if alias != topoproto.TabletAliasString(agent.TabletAlias) && alias != topoproto.TabletAliasString(oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { + for _, tabletInfo := range tabletMap { + alias := tabletInfo.Tablet.Alias + if !topoproto.TabletAliasEqual(alias, agent.TabletAlias) && !topoproto.TabletAliasEqual(alias, oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) wg.Add(1) - go func() { + go func(alias *topodatapb.TabletAlias) { defer wg.Done() var err error - tab, err := agent.TopoServer.UpdateTabletFields(ctx, tabletInfo.Tablet.Alias, + tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, func(tablet *topodatapb.Tablet) error { tablet.Type = topodatapb.TabletType_REPLICA return nil @@ -207,8 +212,8 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context errs.RecordError(err) return } - tablets <- *tab - }() + tabletsToRefresh <- *tab + }(alias) } } @@ -217,7 +222,7 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // global shard record. wg.Wait() // we waited for all goroutines to complete, so now close the channel - close(tablets) + close(tabletsToRefresh) if errs.HasErrors() { return errs.Error() } @@ -250,9 +255,8 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context errs.RecordError(err) } }() - wg.Wait() - for tab := range tablets { + for tab := range tabletsToRefresh { log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) wg.Add(1) go func(tablet *topodatapb.Tablet) { From 2b718e9d93f4a7100ee3566afa1e7c5000f8498c Mon Sep 17 00:00:00 2001 From: deepthi Date: Tue, 10 Sep 2019 10:57:27 -0700 Subject: [PATCH 3/8] check tablet is really master before changing type, add unit test Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 20 ++++-- .../testlib/reparent_external_test.go | 70 ++++++++++++++++++- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index 5b5a172c58..80988aabdb 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -180,7 +180,9 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context var err error oldMasterTablet, err = agent.TopoServer.UpdateTabletFields(ctx, oldMasterAlias, func(tablet *topodatapb.Tablet) error { - tablet.Type = topodatapb.TabletType_REPLICA + if tablet.Type == topodatapb.TabletType_MASTER { + tablet.Type = topodatapb.TabletType_REPLICA + } return nil }) if err != nil { @@ -189,8 +191,11 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context } // We now know more about the old master, so add it to event data. - ev.OldMaster = *oldMasterTablet - tabletsToRefresh <- *oldMasterTablet + // oldMasterTablet will be nil if no update was needed + if oldMasterTablet != nil { + ev.OldMaster = *oldMasterTablet + tabletsToRefresh <- *oldMasterTablet + } }() } @@ -205,14 +210,19 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context var err error tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, func(tablet *topodatapb.Tablet) error { - tablet.Type = topodatapb.TabletType_REPLICA + if tablet.Type == topodatapb.TabletType_MASTER { + tablet.Type = topodatapb.TabletType_REPLICA + } return nil }) if err != nil { errs.RecordError(err) return } - tabletsToRefresh <- *tab + // tab will be nil if no update was needed + if tab != nil { + tabletsToRefresh <- *tab + } }(alias) } } diff --git a/go/vt/wrangler/testlib/reparent_external_test.go b/go/vt/wrangler/testlib/reparent_external_test.go index 79176b82cb..509b911c97 100644 --- a/go/vt/wrangler/testlib/reparent_external_test.go +++ b/go/vt/wrangler/testlib/reparent_external_test.go @@ -253,7 +253,7 @@ func TestTabletExternallyReparentedContinueOnUnexpectedMaster(t *testing.T) { } func TestTabletExternallyReparentedFailedOldMaster(t *testing.T) { - // The 'RefreshState' clal on the old master will timeout on + // The 'RefreshState' call on the old master will timeout on // this value, so it has to be smaller than the 10s of the // wait for the 'finished' state of waitForExternalReparent. tabletmanager.SetReparentFlags(2 * time.Second /* finalizeTimeout */) @@ -304,6 +304,74 @@ func TestTabletExternallyReparentedFailedOldMaster(t *testing.T) { } } +func TestTabletExternallyReparentedImpostorMaster(t *testing.T) { + tabletmanager.SetReparentFlags(time.Minute /* finalizeTimeout */) + + ctx := context.Background() + ts := memorytopo.NewServer("cell1", "cell2") + wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) + + // Create an old master, a new master, and a bad slave. + badSlave := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_MASTER, nil) + // do this after badSlave so that the shard record has the expected master + oldMaster := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_MASTER, nil, ForceInitTablet()) + newMaster := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_REPLICA, nil) + + // check the old master is really master + tablet, err := ts.GetTablet(ctx, oldMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("old master should be MASTER but is: %v", tablet.Type) + } + + // On the elected master, we will respond to + // TabletActionSlaveWasPromoted. + newMaster.StartActionLoop(t, wr) + defer newMaster.StopActionLoop(t) + + // On the old master, we will only respond to + // TabletActionSlaveWasRestarted. + oldMaster.StartActionLoop(t, wr) + defer oldMaster.StopActionLoop(t) + + // On the bad slave, we will respond to + // TabletActionSlaveWasRestarted. + badSlave.StartActionLoop(t, wr) + defer badSlave.StopActionLoop(t) + + // The reparent should work as expected here + tmc := tmclient.NewTabletManagerClient() + ti, err := ts.GetTablet(ctx, newMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet failed: %v", err) + } + waitID := makeWaitID() + if err := tmc.TabletExternallyReparented(context.Background(), ti.Tablet, waitID); err != nil { + t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) + } + waitForExternalReparent(t, "TestTabletExternallyReparentedImpostorMaster: good case", waitID) + + // check the old master was converted to replica + tablet, err = ts.GetTablet(ctx, oldMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_REPLICA { + t.Fatalf("old master should be replica but is: %v", tablet.Type) + } + + // check the impostor master was converted to replica + tablet, err = ts.GetTablet(ctx, badSlave.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", badSlave.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_REPLICA { + t.Fatalf("bad slave should be replica but is: %v", tablet.Type) + } +} + var ( externalReparents = make(map[string]chan struct{}) externalReparentsMutex sync.Mutex From 8236bdb1733de40ab9f971303e3437ffc2cae9eb Mon Sep 17 00:00:00 2001 From: deepthi Date: Tue, 10 Sep 2019 15:52:39 -0700 Subject: [PATCH 4/8] fix race condition, one more unit test Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 6 +- .../testlib/reparent_external_test.go | 101 ++++++++++++++++++ 2 files changed, 104 insertions(+), 3 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index 80988aabdb..83904ab1ca 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -269,7 +269,7 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context for tab := range tabletsToRefresh { log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) wg.Add(1) - go func(tablet *topodatapb.Tablet) { + go func(tablet topodatapb.Tablet) { defer wg.Done() // Tell the old master(s) to re-read its tablet record and change its state. @@ -277,10 +277,10 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // for it to make sure that an old master tablet is not stuck in the MASTER // state. tmc := tmclient.NewTabletManagerClient() - if err := tmc.RefreshState(ctx, tablet); err != nil { + if err := tmc.RefreshState(ctx, &tablet); err != nil { log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tablet.Alias), err) } - }(&tab) + }(tab) } wg.Wait() if errs.HasErrors() { diff --git a/go/vt/wrangler/testlib/reparent_external_test.go b/go/vt/wrangler/testlib/reparent_external_test.go index 509b911c97..11d785f96a 100644 --- a/go/vt/wrangler/testlib/reparent_external_test.go +++ b/go/vt/wrangler/testlib/reparent_external_test.go @@ -326,6 +326,15 @@ func TestTabletExternallyReparentedImpostorMaster(t *testing.T) { t.Fatalf("old master should be MASTER but is: %v", tablet.Type) } + // check the impostor also claims to be master + tablet, err = ts.GetTablet(ctx, badSlave.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", badSlave.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("old master should be MASTER but is: %v", tablet.Type) + } + // On the elected master, we will respond to // TabletActionSlaveWasPromoted. newMaster.StartActionLoop(t, wr) @@ -353,6 +362,98 @@ func TestTabletExternallyReparentedImpostorMaster(t *testing.T) { } waitForExternalReparent(t, "TestTabletExternallyReparentedImpostorMaster: good case", waitID) + // check the new master is really master + tablet, err = ts.GetTablet(ctx, newMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", newMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("new master should be MASTER but is: %v", tablet.Type) + } + + // check the old master was converted to replica + tablet, err = ts.GetTablet(ctx, oldMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_REPLICA { + t.Fatalf("old master should be replica but is: %v", tablet.Type) + } + + // check the impostor master was converted to replica + tablet, err = ts.GetTablet(ctx, badSlave.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", badSlave.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_REPLICA { + t.Fatalf("bad slave should be replica but is: %v", tablet.Type) + } +} + +func TestTabletExternallyReparentedFailedImpostorMaster(t *testing.T) { + tabletmanager.SetReparentFlags(2 * time.Second /* finalizeTimeout */) + + ctx := context.Background() + ts := memorytopo.NewServer("cell1", "cell2") + wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) + + // Create an old master, a new master, and a bad slave. + badSlave := NewFakeTablet(t, wr, "cell1", 2, topodatapb.TabletType_MASTER, nil) + // do this after badSlave so that the shard record has the expected master + oldMaster := NewFakeTablet(t, wr, "cell1", 0, topodatapb.TabletType_MASTER, nil, ForceInitTablet()) + newMaster := NewFakeTablet(t, wr, "cell1", 1, topodatapb.TabletType_REPLICA, nil) + + // check the old master is really master + tablet, err := ts.GetTablet(ctx, oldMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", oldMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("old master should be MASTER but is: %v", tablet.Type) + } + + // check the impostor also claims to be master + tablet, err = ts.GetTablet(ctx, badSlave.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", badSlave.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("old master should be MASTER but is: %v", tablet.Type) + } + + // On the elected master, we will respond to + // TabletActionSlaveWasPromoted. + newMaster.StartActionLoop(t, wr) + defer newMaster.StopActionLoop(t) + + // On the old master, we will only respond to + // TabletActionSlaveWasRestarted. + oldMaster.StartActionLoop(t, wr) + defer oldMaster.StopActionLoop(t) + + // Reparent to a replica, and pretend the impostor master is not responding. + + // The reparent should work as expected here + tmc := tmclient.NewTabletManagerClient() + ti, err := ts.GetTablet(ctx, newMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet failed: %v", err) + } + waitID := makeWaitID() + if err := tmc.TabletExternallyReparented(context.Background(), ti.Tablet, waitID); err != nil { + t.Fatalf("TabletExternallyReparented(replica) failed: %v", err) + } + waitForExternalReparent(t, "TestTabletExternallyReparentedImpostorMaster: good case", waitID) + + // check the new master is really master + tablet, err = ts.GetTablet(ctx, newMaster.Tablet.Alias) + if err != nil { + t.Fatalf("GetTablet(%v) failed: %v", newMaster.Tablet.Alias, err) + } + if tablet.Type != topodatapb.TabletType_MASTER { + t.Fatalf("new master should be MASTER but is: %v", tablet.Type) + } + // check the old master was converted to replica tablet, err = ts.GetTablet(ctx, oldMaster.Tablet.Alias) if err != nil { From ccba3dba83a1de24c40c071407abadcbb15c43f7 Mon Sep 17 00:00:00 2001 From: deepthi Date: Wed, 11 Sep 2019 14:09:50 -0700 Subject: [PATCH 5/8] make TER idempotent so that it can be called again if the first call does not complete all steps Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 221 ++++++++++-------- 1 file changed, 122 insertions(+), 99 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index 83904ab1ca..426f578283 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -81,13 +81,6 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern // timestamp to the current time. agent.setExternallyReparentedTime(startTime) - if topoproto.TabletAliasEqual(si.MasterAlias, tablet.Alias) { - // We may get called on the current master even when nothing has changed. - // If the global shard record is already updated, it means we successfully - // finished a previous reparent to this tablet. - return nil - } - // Create a reusable Reparent event with available info. ev := &events.Reparent{ ShardInfo: *si, @@ -107,14 +100,18 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern // Execute state change to master by force-updating only the local copy of the // tablet record. The actual record in topo will be updated later. - log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER") newTablet := proto.Clone(tablet).(*topodatapb.Tablet) - newTablet.Type = topodatapb.TabletType_MASTER - // This is where updateState will block for gracePeriod, while it gives - // vtgate a chance to stop sending replica queries. - agent.updateState(ctx, newTablet, "fastTabletExternallyReparented") + // We may get called on the current master multiple times in order to fix incomplete external reparents + // update tablet only if it is not currently master + if newTablet.Type != topodatapb.TabletType_MASTER { + log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER") + newTablet.Type = topodatapb.TabletType_MASTER + // This is where updateState will block for gracePeriod, while it gives + // vtgate a chance to stop sending replica queries. + agent.updateState(ctx, newTablet, "fastTabletExternallyReparented") + } // Start the finalize stage with a background context, but connect the trace. bgCtx, cancel := context.WithTimeout(agent.batchCtx, *finalizeReparentTimeout) bgCtx = trace.CopySpan(bgCtx, ctx) @@ -135,8 +132,12 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern } // finalizeTabletExternallyReparented performs slow, synchronized reconciliation -// tasks that ensure topology is self-consistent, and then marks the reparent as -// finished by updating the global shard record. +// tasks that ensure topology is self-consistent +// it first updates new and old master tablet records, then updates +// the global shard record, then refreshes the old master +// after that it attempts to detect and clean up any lingering old masters +// note that an up-to-date shard record does not necessarily mean that +// the reparent completed all the actions successfully func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context, si *topo.ShardInfo, ev *events.Reparent) (err error) { var wg sync.WaitGroup var errs concurrency.AllErrorRecorder @@ -150,10 +151,12 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context go func() { defer wg.Done() log.Infof("finalizeTabletExternallyReparented: updating tablet record for new master: %v", agent.TabletAlias) - // Update our own record to master. + // Update our own record to master if needed _, err := agent.TopoServer.UpdateTabletFields(ctx, agent.TabletAlias, func(tablet *topodatapb.Tablet) error { - tablet.Type = topodatapb.TabletType_MASTER + if tablet.Type != topodatapb.TabletType_MASTER { + tablet.Type = topodatapb.TabletType_MASTER + } return nil }) if err != nil { @@ -161,14 +164,6 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context } }() - tablet := agent.Tablet() - tabletMap, err := agent.TopoServer.GetTabletMapForShard(ctx, tablet.Keyspace, tablet.Shard) - if err != nil { - log.Errorf("ignoring error %v from GetTabletMapForShard so that we can process any partial results", err) - } - - // make the channel buffer big enough that it doesn't block senders - tabletsToRefresh := make(chan topodatapb.Tablet, len(tabletMap)+1) if !topoproto.TabletAliasIsZero(oldMasterAlias) { wg.Add(1) go func() { @@ -194,95 +189,123 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // oldMasterTablet will be nil if no update was needed if oldMasterTablet != nil { ev.OldMaster = *oldMasterTablet - tabletsToRefresh <- *oldMasterTablet } }() } - - // update any other tablets claiming to be MASTER also to REPLICA - for _, tabletInfo := range tabletMap { - alias := tabletInfo.Tablet.Alias - if !topoproto.TabletAliasEqual(alias, agent.TabletAlias) && !topoproto.TabletAliasEqual(alias, oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { - log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) - wg.Add(1) - go func(alias *topodatapb.TabletAlias) { - defer wg.Done() - var err error - tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, - func(tablet *topodatapb.Tablet) error { - if tablet.Type == topodatapb.TabletType_MASTER { - tablet.Type = topodatapb.TabletType_REPLICA - } - return nil - }) - if err != nil { - errs.RecordError(err) - return - } - // tab will be nil if no update was needed - if tab != nil { - tabletsToRefresh <- *tab - } - }(alias) - } - } - // Wait for the tablet records to be updated. At that point, any rebuild will // see the new master, so we're ready to mark the reparent as done in the // global shard record. wg.Wait() - // we waited for all goroutines to complete, so now close the channel - close(tabletsToRefresh) if errs.HasErrors() { return errs.Error() } - wg.Add(1) - go func() { - defer wg.Done() + masterTablet := agent.Tablet() - // Update the master field in the global shard record. We don't use a lock - // here anymore. The lock was only to ensure that the global shard record - // didn't get modified between the time when we read it and the time when we - // write it back. Now we use an update loop pattern to do that instead. - event.DispatchUpdate(ev, "updating global shard record") - log.Infof("finalizeTabletExternallyReparented: updating global shard record if needed") - _, err = agent.TopoServer.UpdateShardFields(ctx, tablet.Keyspace, tablet.Shard, func(currentSi *topo.ShardInfo) error { - if topoproto.TabletAliasEqual(currentSi.MasterAlias, tablet.Alias) { - return topo.NewError(topo.NoUpdateNeeded, tablet.Alias.String()) - } - if !topoproto.TabletAliasEqual(currentSi.MasterAlias, oldMasterAlias) { - log.Warningf("old master alias (%v) not found in the global Shard record i.e. it has changed in the meantime."+ - " We're not overwriting the value with the new master (%v) because the current value is probably newer."+ - " (initial Shard record = %#v, current Shard record = %#v)", - oldMasterAlias, tablet.Alias, si, currentSi) - return topo.NewError(topo.NoUpdateNeeded, oldMasterAlias.String()) - } - currentSi.MasterAlias = tablet.Alias - return nil - }) - if err != nil { - errs.RecordError(err) + // Update the master field in the global shard record. We don't use a lock + // here anymore. The lock was only to ensure that the global shard record + // didn't get modified between the time when we read it and the time when we + // write it back. Now we use an update loop pattern to do that instead. + event.DispatchUpdate(ev, "updating global shard record") + log.Infof("finalizeTabletExternallyReparented: updating global shard record if needed") + _, err = agent.TopoServer.UpdateShardFields(ctx, masterTablet.Keyspace, masterTablet.Shard, func(currentSi *topo.ShardInfo) error { + if topoproto.TabletAliasEqual(currentSi.MasterAlias, masterTablet.Alias) { + // It is correct to return this error here, UpdateShardFields will ignore it + return topo.NewError(topo.NoUpdateNeeded, masterTablet.Alias.String()) } - }() - - for tab := range tabletsToRefresh { - log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) - wg.Add(1) - go func(tablet topodatapb.Tablet) { - defer wg.Done() - - // Tell the old master(s) to re-read its tablet record and change its state. - // We don't need to put error into errs if this fails, but we need to wait - // for it to make sure that an old master tablet is not stuck in the MASTER - // state. - tmc := tmclient.NewTabletManagerClient() - if err := tmc.RefreshState(ctx, &tablet); err != nil { - log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tablet.Alias), err) - } - }(tab) + if !topoproto.TabletAliasEqual(currentSi.MasterAlias, oldMasterAlias) { + log.Warningf("old master alias (%v) not found in the global Shard record i.e. it has changed in the meantime."+ + " We're not overwriting the value with the new master (%v) because the current value is probably newer."+ + " (initial Shard record = %#v, current Shard record = %#v)", + oldMasterAlias, masterTablet.Alias, si, currentSi) + // It is correct to return this error here, UpdateShardFields will ignore it + return topo.NewError(topo.NoUpdateNeeded, oldMasterAlias.String()) + } + currentSi.MasterAlias = masterTablet.Alias + return nil + }) + if err != nil { + errs.RecordError(err) + } + + if errs.HasErrors() { + return errs.Error() + } + + if !topoproto.TabletAliasIsZero(oldMasterAlias) { + // Tell the old master to re-read its tablet record and change its state. + // We don't need to put error into errs if this fails, but we need to wait + // for it to make sure that old master tablet is not stuck in the MASTER + // state. + tmc := tmclient.NewTabletManagerClient() + if err := tmc.RefreshState(ctx, oldMasterTablet); err != nil { + log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(oldMasterTablet.Alias), err) + } + } + + tabletMap, err := agent.TopoServer.GetTabletMapForShard(ctx, masterTablet.Keyspace, masterTablet.Shard) + if err != nil { + log.Errorf("ignoring error %v from GetTabletMapForShard so that we can process any partial results", err) + } + + if len(tabletMap) > 0 { + // make the channel buffer big enough that it doesn't block senders + tabletsToRefresh := make(chan topodatapb.Tablet, len(tabletMap)) + // update any other tablets claiming to be MASTER also to REPLICA + for _, tabletInfo := range tabletMap { + alias := tabletInfo.Tablet.Alias + if !topoproto.TabletAliasEqual(alias, agent.TabletAlias) && !topoproto.TabletAliasEqual(alias, oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { + log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) + wg.Add(1) + go func(alias *topodatapb.TabletAlias) { + defer wg.Done() + var err error + tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, + func(tablet *topodatapb.Tablet) error { + if tablet.Type == topodatapb.TabletType_MASTER { + tablet.Type = topodatapb.TabletType_REPLICA + } + return nil + }) + if err != nil { + errs.RecordError(err) + return + } + // tab will be nil if no update was needed + if tab != nil { + tabletsToRefresh <- *tab + } + }(alias) + } + } + // Wait for the tablet records to be updated. At that point, any rebuild will + // see the new master, so we're ready to mark the reparent as done in the + // global shard record. + wg.Wait() + // we waited for all goroutines to complete, so now close the channel + close(tabletsToRefresh) + if errs.HasErrors() { + return errs.Error() + } + + for tab := range tabletsToRefresh { + log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) + wg.Add(1) + go func(tablet topodatapb.Tablet) { + defer wg.Done() + + // Tell the old master(s) to re-read its tablet record and change its state. + // We don't need to put error into errs if this fails, but we need to wait + // for it to make sure that an old master tablet is not stuck in the MASTER + // state. + tmc := tmclient.NewTabletManagerClient() + if err := tmc.RefreshState(ctx, &tablet); err != nil { + log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tablet.Alias), err) + } + }(tab) + } + wg.Wait() } - wg.Wait() if errs.HasErrors() { return errs.Error() } From 55157408f6afcb5a87d5496c899c7c6139125ee8 Mon Sep 17 00:00:00 2001 From: deepthi Date: Wed, 11 Sep 2019 21:23:45 -0700 Subject: [PATCH 6/8] add comments explaining why some synchronization has been removed Signed-off-by: deepthi --- go/vt/vttablet/tabletmanager/rpc_external_reparent.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index 426f578283..fb8b48e3fa 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -206,6 +206,11 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // here anymore. The lock was only to ensure that the global shard record // didn't get modified between the time when we read it and the time when we // write it back. Now we use an update loop pattern to do that instead. + + // We also used to do this in parallel with RefreshState on the old master + // we don't do that any more. we want to update the shard record first + // and only then attempt to refresh the old master because it is possible + // that the old master is unreachable event.DispatchUpdate(ev, "updating global shard record") log.Infof("finalizeTabletExternallyReparented: updating global shard record if needed") _, err = agent.TopoServer.UpdateShardFields(ctx, masterTablet.Keyspace, masterTablet.Shard, func(currentSi *topo.ShardInfo) error { From 6b2d0d14435e6a33b902be2fcfe9687d1582d0f7 Mon Sep 17 00:00:00 2001 From: deepthi Date: Thu, 12 Sep 2019 09:03:40 -0700 Subject: [PATCH 7/8] fix/add descriptive comments, avoid unnecessary topo calls, merge 2 loops into 1 and remove use of channel that is no longer needed Signed-off-by: deepthi --- .../tabletmanager/rpc_external_reparent.go | 186 ++++++++---------- 1 file changed, 82 insertions(+), 104 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index fb8b48e3fa..eb44b1d3d4 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -91,19 +91,14 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern }, ExternalID: externalID, } - defer func() { - if err != nil { - event.DispatchUpdate(ev, "failed: "+err.Error()) - } - }() event.DispatchUpdate(ev, "starting external from tablet (fast)") // Execute state change to master by force-updating only the local copy of the // tablet record. The actual record in topo will be updated later. newTablet := proto.Clone(tablet).(*topodatapb.Tablet) - // We may get called on the current master multiple times in order to fix incomplete external reparents - // update tablet only if it is not currently master + // We may get called on the current master multiple times in order to fix incomplete external reparents. + // We update the tablet here only if it is not currently master if newTablet.Type != topodatapb.TabletType_MASTER { log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER") newTablet.Type = topodatapb.TabletType_MASTER @@ -132,11 +127,11 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern } // finalizeTabletExternallyReparented performs slow, synchronized reconciliation -// tasks that ensure topology is self-consistent -// it first updates new and old master tablet records, then updates -// the global shard record, then refreshes the old master -// after that it attempts to detect and clean up any lingering old masters -// note that an up-to-date shard record does not necessarily mean that +// tasks that ensure topology is self-consistent. +// It first updates new and old master tablet records, then updates +// the global shard record, then refreshes the old master. +// After that it attempts to detect and clean up any lingering old masters. +// Note that an up-to-date shard record does not necessarily mean that // the reparent completed all the actions successfully func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context, si *topo.ShardInfo, ev *events.Reparent) (err error) { var wg sync.WaitGroup @@ -156,8 +151,10 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context func(tablet *topodatapb.Tablet) error { if tablet.Type != topodatapb.TabletType_MASTER { tablet.Type = topodatapb.TabletType_MASTER + return nil } - return nil + // returning NoUpdateNeeded avoids unnecessary calls to UpdateTablet + return topo.NewError(topo.NoUpdateNeeded, agent.TabletAlias.String()) }) if err != nil { errs.RecordError(err) @@ -177,8 +174,10 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context func(tablet *topodatapb.Tablet) error { if tablet.Type == topodatapb.TabletType_MASTER { tablet.Type = topodatapb.TabletType_REPLICA + return nil } - return nil + // returning NoUpdateNeeded avoids unnecessary calls to UpdateTablet + return topo.NewError(topo.NoUpdateNeeded, oldMasterAlias.String()) }) if err != nil { errs.RecordError(err) @@ -202,115 +201,94 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context masterTablet := agent.Tablet() - // Update the master field in the global shard record. We don't use a lock - // here anymore. The lock was only to ensure that the global shard record - // didn't get modified between the time when we read it and the time when we - // write it back. Now we use an update loop pattern to do that instead. - - // We also used to do this in parallel with RefreshState on the old master - // we don't do that any more. we want to update the shard record first - // and only then attempt to refresh the old master because it is possible - // that the old master is unreachable event.DispatchUpdate(ev, "updating global shard record") log.Infof("finalizeTabletExternallyReparented: updating global shard record if needed") - _, err = agent.TopoServer.UpdateShardFields(ctx, masterTablet.Keyspace, masterTablet.Shard, func(currentSi *topo.ShardInfo) error { - if topoproto.TabletAliasEqual(currentSi.MasterAlias, masterTablet.Alias) { - // It is correct to return this error here, UpdateShardFields will ignore it - return topo.NewError(topo.NoUpdateNeeded, masterTablet.Alias.String()) + wg.Add(1) + go func() { + defer wg.Done() + // Update the master field in the global shard record. We don't use a lock + // here anymore. The lock was only to ensure that the global shard record + // didn't get modified between the time when we read it and the time when we + // write it back. Now we use an update loop pattern to do that instead. + _, err = agent.TopoServer.UpdateShardFields(ctx, masterTablet.Keyspace, masterTablet.Shard, func(currentSi *topo.ShardInfo) error { + if topoproto.TabletAliasEqual(currentSi.MasterAlias, masterTablet.Alias) { + // returning NoUpdateNeeded avoids unnecessary calls to UpdateTablet + return topo.NewError(topo.NoUpdateNeeded, masterTablet.Alias.String()) + } + if !topoproto.TabletAliasEqual(currentSi.MasterAlias, oldMasterAlias) { + log.Warningf("old master alias (%v) not found in the global Shard record i.e. it has changed in the meantime."+ + " We're not overwriting the value with the new master (%v) because the current value is probably newer."+ + " (initial Shard record = %#v, current Shard record = %#v)", + oldMasterAlias, masterTablet.Alias, si, currentSi) + // returning NoUpdateNeeded avoids unnecessary calls to UpdateTablet + return topo.NewError(topo.NoUpdateNeeded, oldMasterAlias.String()) + } + currentSi.MasterAlias = masterTablet.Alias + return nil + }) + if err != nil { + errs.RecordError(err) } - if !topoproto.TabletAliasEqual(currentSi.MasterAlias, oldMasterAlias) { - log.Warningf("old master alias (%v) not found in the global Shard record i.e. it has changed in the meantime."+ - " We're not overwriting the value with the new master (%v) because the current value is probably newer."+ - " (initial Shard record = %#v, current Shard record = %#v)", - oldMasterAlias, masterTablet.Alias, si, currentSi) - // It is correct to return this error here, UpdateShardFields will ignore it - return topo.NewError(topo.NoUpdateNeeded, oldMasterAlias.String()) - } - currentSi.MasterAlias = masterTablet.Alias - return nil - }) - if err != nil { - errs.RecordError(err) + }() + + if !topoproto.TabletAliasIsZero(oldMasterAlias) && oldMasterTablet != nil { + wg.Add(1) + go func() { + defer wg.Done() + // Tell the old master to re-read its tablet record and change its state. + // We don't need to put error into errs if this fails, but we need to wait + // for it to make sure that old master tablet is not stuck in the MASTER + // state. + tmc := tmclient.NewTabletManagerClient() + if err := tmc.RefreshState(ctx, oldMasterTablet); err != nil { + log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(oldMasterTablet.Alias), err) + } + }() } + wg.Wait() if errs.HasErrors() { return errs.Error() } - if !topoproto.TabletAliasIsZero(oldMasterAlias) { - // Tell the old master to re-read its tablet record and change its state. - // We don't need to put error into errs if this fails, but we need to wait - // for it to make sure that old master tablet is not stuck in the MASTER - // state. - tmc := tmclient.NewTabletManagerClient() - if err := tmc.RefreshState(ctx, oldMasterTablet); err != nil { - log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(oldMasterTablet.Alias), err) - } - } - + // Look for any other tablets claiming to be master and fix them up on a best-effort basis tabletMap, err := agent.TopoServer.GetTabletMapForShard(ctx, masterTablet.Keyspace, masterTablet.Shard) if err != nil { log.Errorf("ignoring error %v from GetTabletMapForShard so that we can process any partial results", err) } - if len(tabletMap) > 0 { - // make the channel buffer big enough that it doesn't block senders - tabletsToRefresh := make(chan topodatapb.Tablet, len(tabletMap)) - // update any other tablets claiming to be MASTER also to REPLICA - for _, tabletInfo := range tabletMap { - alias := tabletInfo.Tablet.Alias - if !topoproto.TabletAliasEqual(alias, agent.TabletAlias) && !topoproto.TabletAliasEqual(alias, oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { - log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) - wg.Add(1) - go func(alias *topodatapb.TabletAlias) { - defer wg.Done() - var err error - tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, - func(tablet *topodatapb.Tablet) error { - if tablet.Type == topodatapb.TabletType_MASTER { - tablet.Type = topodatapb.TabletType_REPLICA - } - return nil - }) - if err != nil { - errs.RecordError(err) - return - } - // tab will be nil if no update was needed - if tab != nil { - tabletsToRefresh <- *tab - } - }(alias) - } - } - // Wait for the tablet records to be updated. At that point, any rebuild will - // see the new master, so we're ready to mark the reparent as done in the - // global shard record. - wg.Wait() - // we waited for all goroutines to complete, so now close the channel - close(tabletsToRefresh) - if errs.HasErrors() { - return errs.Error() - } - - for tab := range tabletsToRefresh { - log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) + for _, tabletInfo := range tabletMap { + alias := tabletInfo.Tablet.Alias + if !topoproto.TabletAliasEqual(alias, agent.TabletAlias) && !topoproto.TabletAliasEqual(alias, oldMasterAlias) && tabletInfo.Tablet.Type == topodatapb.TabletType_MASTER { + log.Infof("finalizeTabletExternallyReparented: updating tablet record for another old master: %v", alias) wg.Add(1) - go func(tablet topodatapb.Tablet) { + go func(alias *topodatapb.TabletAlias) { defer wg.Done() - - // Tell the old master(s) to re-read its tablet record and change its state. - // We don't need to put error into errs if this fails, but we need to wait - // for it to make sure that an old master tablet is not stuck in the MASTER - // state. - tmc := tmclient.NewTabletManagerClient() - if err := tmc.RefreshState(ctx, &tablet); err != nil { - log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tablet.Alias), err) + var err error + tab, err := agent.TopoServer.UpdateTabletFields(ctx, alias, + func(tablet *topodatapb.Tablet) error { + if tablet.Type == topodatapb.TabletType_MASTER { + tablet.Type = topodatapb.TabletType_REPLICA + return nil + } + return topo.NewError(topo.NoUpdateNeeded, alias.String()) + }) + if err != nil { + errs.RecordError(err) + return } - }(tab) + // tab will be nil if no update was needed + if tab != nil { + log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) + tmc := tmclient.NewTabletManagerClient() + if err := tmc.RefreshState(ctx, tab); err != nil { + log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tab.Alias), err) + } + } + }(alias) } - wg.Wait() } + wg.Wait() if errs.HasErrors() { return errs.Error() } From 90d3b8a8d7f58db5dad86e3ca9fbcbc04a7f66e6 Mon Sep 17 00:00:00 2001 From: deepthi Date: Fri, 13 Sep 2019 14:24:23 -0700 Subject: [PATCH 8/8] defer event update, only make a copy of tablet if we are changing it, use one tmc for all calls to RefreshState Signed-off-by: deepthi --- go/vt/vtcombo/tablet_map.go | 14 ++++++++------ .../vttablet/tabletmanager/healthcheck_test.go | 5 ++++- .../tabletmanager/rpc_external_reparent.go | 18 +++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/go/vt/vtcombo/tablet_map.go b/go/vt/vtcombo/tablet_map.go index a4cd2baab5..fac8ab155a 100644 --- a/go/vt/vtcombo/tablet_map.go +++ b/go/vt/vtcombo/tablet_map.go @@ -111,6 +111,14 @@ func InitTabletMap(ts *topo.Server, tpb *vttestpb.VTTestTopology, mysqld mysqlct ctx := context.Background() + // Register the tablet manager client factory for tablet manager + // Do this before any tablets are created so that they respect the protocol, + // otherwise it defaults to grpc + tmclient.RegisterTabletManagerClientFactory("internal", func() tmclient.TabletManagerClient { + return &internalTabletManagerClient{} + }) + *tmclient.TabletManagerProtocol = "internal" + // iterate through the keyspaces wr := wrangler.New(logutil.NewConsoleLogger(), ts, nil) var uid uint32 = 1 @@ -246,12 +254,6 @@ func InitTabletMap(ts *topo.Server, tpb *vttestpb.VTTestTopology, mysqld mysqlct tabletconn.RegisterDialer("internal", dialer) *tabletconn.TabletProtocol = "internal" - // Register the tablet manager client factory for tablet manager - tmclient.RegisterTabletManagerClientFactory("internal", func() tmclient.TabletManagerClient { - return &internalTabletManagerClient{} - }) - *tmclient.TabletManagerProtocol = "internal" - // run healthcheck on all vttablets tmc := tmclient.NewTabletManagerClient() for _, tablet := range tabletMap { diff --git a/go/vt/vttablet/tabletmanager/healthcheck_test.go b/go/vt/vttablet/tabletmanager/healthcheck_test.go index 5f59d0e617..4cdc8e29d8 100644 --- a/go/vt/vttablet/tabletmanager/healthcheck_test.go +++ b/go/vt/vttablet/tabletmanager/healthcheck_test.go @@ -38,6 +38,9 @@ import ( "vitess.io/vitess/go/vt/vttablet/tabletservermock" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + + // needed so that grpc client is registered + _ "vitess.io/vitess/go/vt/vttablet/grpctmclient" ) func TestHealthRecordDeduplication(t *testing.T) { @@ -717,7 +720,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) { // Run TER to turn us into a proper master, wait for it to finish. agent.HealthReporter.(*fakeHealthCheck).reportReplicationDelay = 19 * time.Second if err := agent.TabletExternallyReparented(ctx, "unused_id"); err != nil { - t.Fatal(err) + t.Fatalf("TabletExternallyReparented failed: %v", err) } <-agent.finalizeReparentCtx.Done() ti, err := agent.TopoServer.GetTablet(ctx, tabletAlias) diff --git a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go index eb44b1d3d4..848ba234f0 100644 --- a/go/vt/vttablet/tabletmanager/rpc_external_reparent.go +++ b/go/vt/vttablet/tabletmanager/rpc_external_reparent.go @@ -91,16 +91,20 @@ func (agent *ActionAgent) TabletExternallyReparented(ctx context.Context, extern }, ExternalID: externalID, } + defer func() { + if err != nil { + event.DispatchUpdate(ev, "failed: "+err.Error()) + } + }() event.DispatchUpdate(ev, "starting external from tablet (fast)") - // Execute state change to master by force-updating only the local copy of the - // tablet record. The actual record in topo will be updated later. - newTablet := proto.Clone(tablet).(*topodatapb.Tablet) - // We may get called on the current master multiple times in order to fix incomplete external reparents. // We update the tablet here only if it is not currently master - if newTablet.Type != topodatapb.TabletType_MASTER { + if tablet.Type != topodatapb.TabletType_MASTER { log.Infof("fastTabletExternallyReparented: executing change callback for state change to MASTER") + // Execute state change to master by force-updating only the local copy of the + // tablet record. The actual record in topo will be updated later. + newTablet := proto.Clone(tablet).(*topodatapb.Tablet) newTablet.Type = topodatapb.TabletType_MASTER // This is where updateState will block for gracePeriod, while it gives @@ -231,6 +235,8 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context } }() + tmc := tmclient.NewTabletManagerClient() + defer tmc.Close() if !topoproto.TabletAliasIsZero(oldMasterAlias) && oldMasterTablet != nil { wg.Add(1) go func() { @@ -239,7 +245,6 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // We don't need to put error into errs if this fails, but we need to wait // for it to make sure that old master tablet is not stuck in the MASTER // state. - tmc := tmclient.NewTabletManagerClient() if err := tmc.RefreshState(ctx, oldMasterTablet); err != nil { log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(oldMasterTablet.Alias), err) } @@ -280,7 +285,6 @@ func (agent *ActionAgent) finalizeTabletExternallyReparented(ctx context.Context // tab will be nil if no update was needed if tab != nil { log.Infof("finalizeTabletExternallyReparented: Refresh state for tablet: %v", topoproto.TabletAliasString(tab.Alias)) - tmc := tmclient.NewTabletManagerClient() if err := tmc.RefreshState(ctx, tab); err != nil { log.Warningf("Error calling RefreshState on old master %v: %v", topoproto.TabletAliasString(tab.Alias), err) }