Merge pull request #673 from youtube/replication

Replication
Alain Jobart 2015-05-06 17:09:05 -07:00
Parent e0e84b3bb8 1e312716dc
Commit fd74fecf3f
5 changed files with 27 additions and 4 deletions

View file

@ -80,11 +80,12 @@ The actions performed are:
wait for the entry in the test table. (if a slave wasn't
replicating, we don't change its state and don't start replication
after reparent)
- additionally, on the old master, we start replication, so it catches up.
The old master is left as 'spare' in this scenario. If health checking
is enabled on that tablet (using target\_tablet\_type parameter for
vttablet), the server will most likely rejoin the cluster as a
replica.
replica on the next health check.
### Emergency Reparent: vtctl EmergencyReparentShard
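For reference, here is a minimal, self-contained sketch of the per-tablet step the planned-reparent paragraph above describes. The tabletClient interface and the string-typed aliases are illustrative stand-ins, not the real TabletManagerClient API; the actual change is in the go/vt/wrangler diff further down.

package reparentsketch

import (
	"context"
	"fmt"
)

// tabletClient stands in for the two TabletManagerClient calls relevant here.
type tabletClient interface {
	SetMaster(ctx context.Context, tabletAlias, masterAlias string) error
	StartSlave(ctx context.Context, tabletAlias string) error
}

// reparentOne repoints one tablet at the new master. For the old master it
// additionally starts replication, so it catches up and can later rejoin the
// cluster as a replica via health checking.
func reparentOne(ctx context.Context, tc tabletClient, tabletAlias, masterAlias string, isOldMaster bool) error {
	if err := tc.SetMaster(ctx, tabletAlias, masterAlias); err != nil {
		return fmt.Errorf("tablet %v SetMaster failed: %v", tabletAlias, err)
	}
	if isOldMaster {
		if err := tc.StartSlave(ctx, tabletAlias); err != nil {
			return fmt.Errorf("old master %v StartSlave failed: %v", tabletAlias, err)
		}
	}
	return nil
}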

View file

@ -71,7 +71,9 @@ type FakeMysqlDaemon struct {
// return an error.
MysqlPort int
// Replicating is updated when calling StopSlave
// Replicating is updated when calling StartSlave / StopSlave
// (it is not used at all when calling SlaveStatus, it is the
// test owner's responsibility to have these two match)
Replicating bool
// CurrentSlaveStatus is returned by SlaveStatus
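Below is a minimal sketch of the contract this comment describes, using illustrative names rather than the real FakeMysqlDaemon API: StartSlave and StopSlave flip Replicating, while SlaveStatus simply returns whatever status the test configured, so the test has to keep the two consistent itself.

package fakereplication

type slaveStatus struct {
	SlaveIORunning  bool
	SlaveSQLRunning bool
}

type fakeDaemon struct {
	Replicating bool         // flipped by StartSlave / StopSlave
	Current     *slaveStatus // returned verbatim by SlaveStatus
}

func (d *fakeDaemon) StartSlave() error { d.Replicating = true; return nil }

func (d *fakeDaemon) StopSlave() error { d.Replicating = false; return nil }

// SlaveStatus does not consult Replicating at all, mirroring the comment above.
func (d *fakeDaemon) SlaveStatus() (*slaveStatus, error) { return d.Current, nil }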

View file

@ -403,6 +403,14 @@ func (wr *Wrangler) plannedReparentShardLocked(ctx context.Context, ev *events.R
wr.logger.Infof("setting new master on slave %v", alias)
if err := wr.TabletManagerClient().SetMaster(ctx, tabletInfo, masterElectTabletAlias, now); err != nil {
rec.RecordError(fmt.Errorf("Tablet %v SetMaster failed: %v", alias, err))
return
}
// also restart replication on old master
if alias == oldMasterTabletInfo.Alias {
if err := wr.TabletManagerClient().StartSlave(ctx, tabletInfo); err != nil {
rec.RecordError(fmt.Errorf("old master %v StartSlave failed: %v", alias, err))
}
}
}(alias, tabletInfo)
}
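The loop above repoints every slave in its own goroutine and records failures instead of returning them to the caller, so one bad tablet does not block the others; the return after a failed SetMaster only prevents that goroutine from going on to StartSlave. A self-contained sketch of this fan-out-and-collect pattern, with errRecorder as an illustrative stand-in for the error recorder (rec) used by the wrangler:

package fanout

import "sync"

// errRecorder collects errors from concurrent goroutines.
type errRecorder struct {
	mu   sync.Mutex
	errs []error
}

func (r *errRecorder) RecordError(err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.errs = append(r.errs, err)
}

// repointAll runs repoint for every alias concurrently and returns all
// recorded errors once every goroutine has finished.
func repointAll(aliases []string, repoint func(alias string) error) []error {
	rec := &errRecorder{}
	var wg sync.WaitGroup
	for _, alias := range aliases {
		wg.Add(1)
		go func(alias string) {
			defer wg.Done()
			if err := repoint(alias); err != nil {
				rec.RecordError(err)
			}
		}(alias)
	}
	wg.Wait()
	return rec.errs
}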

View file

@ -33,6 +33,7 @@ func TestPlannedReparentShard(t *testing.T) {
// new master
newMaster.FakeMysqlDaemon.ReadOnly = true
newMaster.FakeMysqlDaemon.Replicating = true
newMaster.FakeMysqlDaemon.WaitMasterPosition = myproto.ReplicationPosition{
GTIDSet: myproto.MariadbGTID{
Domain: 7,
@ -57,6 +58,7 @@ func TestPlannedReparentShard(t *testing.T) {
// old master
oldMaster.FakeMysqlDaemon.ReadOnly = false
oldMaster.FakeMysqlDaemon.Replicating = false
oldMaster.FakeMysqlDaemon.DemoteMasterPosition = newMaster.FakeMysqlDaemon.WaitMasterPosition
oldMaster.FakeMysqlDaemon.SetMasterCommandsInput = fmt.Sprintf("%v:%v", newMaster.Tablet.Hostname, newMaster.Tablet.Portmap["mysql"])
oldMaster.FakeMysqlDaemon.SetMasterCommandsResult = []string{"set master cmd 1"}
@ -69,6 +71,7 @@ func TestPlannedReparentShard(t *testing.T) {
// good slave 1 is replicating
goodSlave1.FakeMysqlDaemon.ReadOnly = true
goodSlave1.FakeMysqlDaemon.Replicating = true
goodSlave1.FakeMysqlDaemon.CurrentSlaveStatus = &myproto.ReplicationStatus{
SlaveIORunning: true,
SlaveSQLRunning: true,
@ -85,6 +88,7 @@ func TestPlannedReparentShard(t *testing.T) {
// good slave 2 is not replicating
goodSlave2.FakeMysqlDaemon.ReadOnly = true
goodSlave2.FakeMysqlDaemon.Replicating = false
goodSlave2.FakeMysqlDaemon.SetMasterCommandsInput = fmt.Sprintf("%v:%v", newMaster.Tablet.Hostname, newMaster.Tablet.Portmap["mysql"])
goodSlave2.FakeMysqlDaemon.SetMasterCommandsResult = []string{"set master cmd 1"}
goodSlave2.StartActionLoop(t, wr)
@ -127,4 +131,12 @@ func TestPlannedReparentShard(t *testing.T) {
t.Errorf("oldMaster...QueryServiceEnabled set")
}
// verify the old master was told to start replicating (and not
// the slave that wasn't replicating in the first place)
if !oldMaster.FakeMysqlDaemon.Replicating {
t.Errorf("oldMaster.FakeMysqlDaemon.Replicating not set")
}
if goodSlave2.FakeMysqlDaemon.Replicating {
t.Errorf("goodSlave2.FakeMysqlDaemon.Replicating set")
}
}

View file

@ -243,7 +243,7 @@ class TestSchema(unittest.TestCase):
auto_log=True)
# check all expected hosts have the change
self._check_tables(shard_0_master, 1) # was stuck a long time ago as scrap
self._check_tables(shard_0_master, 2) # was stuck a long time ago as scrap
self._check_tables(shard_0_replica1, 3) # current master
self._check_tables(shard_0_replica2, 3)
self._check_tables(shard_0_rdonly, 3)
@ -262,7 +262,7 @@ class TestSchema(unittest.TestCase):
# check all expected hosts have the change:
# - master won't have it as it's a complex change
# - backup won't have it as IsReplicatingType is false
self._check_tables(shard_0_master, 1) # was stuck a long time ago as scrap
self._check_tables(shard_0_master, 2) # was stuck a long time ago as scrap
self._check_tables(shard_0_replica1, 3) # current master
self._check_tables(shard_0_replica2, 4)
self._check_tables(shard_0_rdonly, 4)