Mirror of https://github.com/github/vitess-gh.git
Added worker.py to cover worker integration tests
This commit is contained in:
Parent: a110de54ab
Commit: 383e2c52e4
Makefile (6 changed lines)

@@ -100,7 +100,8 @@ medium_integration_test_files = \
   reparent.py \
   vtdb_test.py \
   vtgate_utils_test.py \
-  rowcache_invalidator.py
+  rowcache_invalidator.py \
+  worker.py

 large_integration_test_files = \
   vtgatev2_test.py \

@@ -122,7 +123,8 @@ worker_integration_test_files = \
   vertical_split.py \
   vertical_split_vtgate.py \
   initial_sharding.py \
-  initial_sharding_bytes.py
+  initial_sharding_bytes.py \
+  worker.py

 .ONESHELL:
 SHELL = /bin/bash
@@ -9,7 +9,7 @@ Let’s assume that you’ve already got a keyspace up and running, with a single

 The first thing that we need to do is add a column to the soon-to-be-sharded keyspace which will be used as the "sharding key". This column will tell Vitess which shard a particular row of data should go to. You can add the column by running an alter on the unsharded keyspace - probably by running something like:

-`vtctl ApplySchemaKeyspace -simple -sql="alter table <table name> add keyspace_id" test_keyspace`
+`vtctl ApplySchema -sql="alter table <table name> add keyspace_id" test_keyspace`

 for each table in the keyspace. Once the column is added everywhere, each row needs to be backfilled with the appropriate keyspace ID.
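As an illustration of that backfill step, here is a minimal sketch (the `user` table and `user_id` sharding column are hypothetical; the actual row-to-keyspace-ID mapping is application-specific) that populates the new column directly in MySQL:

```python
import struct
import hashlib
import MySQLdb

def keyspace_id(user_id):
  # One common choice for a KIT_UINT64 keyspace: a 64-bit hash of the
  # sharding column. Using the id directly also works if it is already
  # uniformly distributed. (Hypothetical helper, not part of this commit.)
  return struct.unpack('>Q', hashlib.md5(str(user_id)).digest()[:8])[0]

conn = MySQLdb.connect(db='vt_test_keyspace')
cursor = conn.cursor()
cursor.execute('select user_id from user where keyspace_id is null')
for (user_id,) in cursor.fetchall():
  cursor.execute('update user set keyspace_id = %s where user_id = %s',
                 (keyspace_id(user_id), user_id))
conn.commit()
cursor.close()
conn.close()
```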
@@ -151,9 +151,6 @@ func formatTableStatuses(tableStatuses []*tableStatus, startTime time.Time) ([]s

 var errExtract = regexp.MustCompile(`\(errno (\d+)\)`)

-// The amount of time we should wait before retrying ExecuteFetch calls
-var executeFetchRetryTime = (30 * time.Second)
-
 // executeFetchWithRetries will attempt to run ExecuteFetch for a single command, with a reasonably small timeout.
 // It will keep retrying the ExecuteFetch (for a finite but longer duration) if it fails due to a timeout or a
 // retriable application error.
@@ -190,6 +187,9 @@ func executeFetchWithRetries(ctx context.Context, wr *wrangler.Wrangler, ti *top
+	case errNo == "1290":
+		wr.Logger().Warningf("ExecuteFetch failed on %v; will reresolve and retry because it's due to a MySQL read-only error: %v", ti, err)
+		statsRetryCounters.Add("ReadOnly", 1)
 	case errNo == "2002" || errNo == "2006":
 		wr.Logger().Warningf("ExecuteFetch failed on %v; will reresolve and retry because it's due to a MySQL connection error: %v", ti, err)
 		statsRetryCounters.Add("ConnectionError", 1)
 	case errNo == "1062":
 		if !isRetry {
 			return ti, fmt.Errorf("ExecuteFetch failed on %v on the first attempt; not retrying as this is not a recoverable error: %v", ti, err)
@@ -200,7 +200,7 @@ func executeFetchWithRetries(ctx context.Context, wr *wrangler.Wrangler, ti *top
 		// Unknown error
 		return ti, err
 	}
-	t := time.NewTimer(executeFetchRetryTime)
+	t := time.NewTimer(*executeFetchRetryTime)
 	// don't leak memory if the timer isn't triggered
 	defer t.Stop()
@@ -384,7 +384,7 @@ func (scw *SplitCloneWorker) ResolveDestinationMasters() error {
 	defer scw.resolveMu.Unlock()

 	// If the last resolution was fresh enough, return it.
-	if time.Now().Sub(scw.resolveTime) < resolveTTL {
+	if time.Now().Sub(scw.resolveTime) < *resolveTTL {
 		return nil
 	}
@@ -329,7 +329,7 @@ func testSplitClone(t *testing.T, strategy string) {
 	rightRdonly.FakeMysqlDaemon.DbAppConnectionFactory = DestinationsFactory(t, 30)

 	// Only wait 1 ms between retries, so that the test passes faster
-	executeFetchRetryTime = (1 * time.Millisecond)
+	*executeFetchRetryTime = (1 * time.Millisecond)

 	wrk.Run()
 	status := wrk.StatusAsText()
@@ -346,7 +346,7 @@ func (vscw *VerticalSplitCloneWorker) ResolveDestinationMasters() error {
 	defer vscw.resolveMu.Unlock()

 	// If the last resolution was fresh enough, return it.
-	if time.Now().Sub(vscw.resolveTime) < resolveTTL {
+	if time.Now().Sub(vscw.resolveTime) < *resolveTTL {
 		return nil
 	}
@@ -314,7 +314,7 @@ func testVerticalSplitClone(t *testing.T, strategy string) {
 	destRdonly.FakeMysqlDaemon.DbAppConnectionFactory = VerticalDestinationsFactory(t, 30)

 	// Only wait 1 ms between retries, so that the test passes faster
-	executeFetchRetryTime = (1 * time.Millisecond)
+	*executeFetchRetryTime = (1 * time.Millisecond)

 	wrk.Run()
 	status := wrk.StatusAsText()
@@ -9,6 +9,7 @@ functions for long running actions. 'vtworker' binary will use these.
 package worker

 import (
+	"flag"
 	"html/template"
 	"time"
@@ -49,11 +50,10 @@ type Resolver interface {
 	GetDestinationMaster(shardName string) (*topo.TabletInfo, error)
 }

-// Resolvers should attempt to keep the previous topo resolution cached for at
-// least this long.
-const resolveTTL = 15 * time.Second
-
 var (
+	resolveTTL            = flag.Duration("resolve_ttl", 15*time.Second, "Amount of time that a topo resolution can be cached for")
+	executeFetchRetryTime = flag.Duration("executefetch_retry_time", 30*time.Second, "Amount of time we should wait before retrying ExecuteFetch calls")
+
 	statsState = stats.NewString("WorkerState")
 	// the number of times that the worker attempts to reresolve the masters
 	statsDestinationAttemptedResolves = stats.NewInt("WorkerDestinationAttemptedResolves")
@@ -75,6 +75,9 @@
 		},
 		{
 			"File": "resharding.py"
+		},
+		{
+			"File": "worker.py"
 		}
 	]
 }
@@ -240,7 +240,7 @@ class Tablet(object):
     rows = self.mquery('', 'show databases')
     for row in rows:
       dbname = row[0]
-      if dbname in ['information_schema', '_vt', 'mysql']:
+      if dbname in ['information_schema', 'mysql']:
         continue
       self.drop_db(dbname)
test/utils.py (101 changed lines)

@@ -20,6 +20,7 @@ import environment

 from vtctl import vtctl_client
 from mysql_flavor import set_mysql_flavor
+from mysql_flavor import mysql_flavor
 from protocols_flavor import set_protocols_flavor, protocols_flavor
 from topo_flavor.server import set_topo_server_flavor
@@ -79,7 +80,13 @@ def set_options(opts):

 # main executes the test classes contained in the passed module, or
 # __main__ if empty.
-def main(mod=None):
+def main(mod=None, test_options=None):
+  """The replacement main method, which parses args and runs tests.
+
+  Args:
+    test_options - a function which adds OptionParser options that are specific
+      to a test file.
+  """
   if mod == None:
     mod = sys.modules['__main__']

@@ -87,6 +94,8 @@ def main(mod=None):

   parser = optparse.OptionParser(usage="usage: %prog [options] [test_names]")
   add_options(parser)
+  if test_options:
+    test_options(parser)
   (options, args) = parser.parse_args()

   if options.verbose == 0:
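The new worker.py at the bottom of this commit shows the intended use of the test_options hook: the test file registers its own flags and then hands the hook to utils.main().

```python
def add_test_options(parser):
  parser.add_option('--num_insert_rows', type="int", default=3000,
                    help="The number of rows, per shard, that we should insert before resharding for this test.")

if __name__ == '__main__':
  utils.main(test_options=add_test_options)
```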
@@ -256,12 +265,6 @@ def wait_procs(proc_list, raise_on_error=True):
       if raise_on_error:
         raise CalledProcessError(proc.returncode, ' '.join(proc.args))

-def run_procs(cmds, raise_on_error=True):
-  procs = []
-  for cmd in cmds:
-    procs.append(run_bg(cmd))
-  wait_procs(procs, raise_on_error=raise_on_error)
-
 def validate_topology(ping_tablets=False):
   if ping_tablets:
     run_vtctl(['Validate', '-ping-tablets'])
@@ -325,6 +328,40 @@ def wait_for_vars(name, port, var=None):
     break
   timeout = wait_step('waiting for /debug/vars of %s' % name, timeout)

+def poll_for_vars(name, port, condition_msg, timeout=60.0, condition_fn=None):
+  """Polls for debug variables to exist, or match specific conditions, within a timeout.
+
+  This function polls in a tight loop, with no sleeps. This is useful for
+  variables that are expected to be short-lived (e.g., a 'Done' state
+  immediately before a process exits).
+
+  Args:
+    name - the name of the process that we're trying to poll vars from.
+    port - the port number that we should poll for variables.
+    condition_msg - string describing the conditions that we're polling for,
+      used for error messaging.
+    timeout - number of seconds that we should attempt to poll for.
+    condition_fn - a function that takes the debug vars dict as input, and
+      returns a truthy value if it matches the success conditions.
+
+  Raises:
+    TestError, if the conditions aren't met within the given timeout
+
+  Returns:
+    dict of debug variables
+  """
+  start_time = time.time()
+  while True:
+    if (time.time() - start_time) >= timeout:
+      raise TestError('Timed out polling for vars from %s; condition "%s" not met' % (name, condition_msg))
+    _vars = get_vars(port)
+    if _vars is None:
+      continue
+    if condition_fn is None:
+      return _vars
+    elif condition_fn(_vars):
+      return _vars
+
 def apply_vschema(vschema):
   fname = os.path.join(environment.tmproot, "vschema.json")
   with open(fname, "w") as f:
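For reference, the new worker.py uses this helper to block until the vtworker reaches a given state (worker_port comes from run_vtworker_bg, shown later in this diff):

```python
worker_vars = utils.poll_for_vars('vtworker', worker_port,
                       'WorkerState == cleaning up',
                       condition_fn=lambda v: v.get('WorkerState') == 'cleaning up')
```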
@@ -345,6 +382,23 @@ def wait_for_tablet_type(tablet_alias, expected_type, timeout=10):
       timeout
   )

+def wait_for_replication_pos(tablet_a, tablet_b, timeout=60.0):
+  """Waits for tablet B to catch up to the replication position of tablet A.
+
+  If the replication position does not catch up within timeout seconds, it will
+  raise a TestError.
+  """
+  replication_pos_a = mysql_flavor().master_position(tablet_a)
+  while True:
+    replication_pos_b = mysql_flavor().master_position(tablet_b)
+    if mysql_flavor().position_at_least(replication_pos_b, replication_pos_a):
+      break
+    timeout = wait_step(
+      "%s's replication position to catch up %s's; currently at: %s, waiting to catch up to: %s" % (
+        tablet_b.tablet_alias, tablet_a.tablet_alias, replication_pos_b, replication_pos_a),
+      timeout
+    )
+
 # vtgate helpers, assuming it always restarts on the same port
 def vtgate_start(vtport=None, cell='test_nj', retry_delay=1, retry_count=2,
                  topo_impl=None, tablet_bson_encrypted=False, cache_ttl='1s',
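The new worker.py uses this helper to make sure the source rdonly tablet has caught up to the master before starting the copy:

```python
utils.wait_for_replication_pos(shard_master, shard_rdonly1)
```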
@@ -463,10 +517,37 @@ def run_vtctl_json(clargs):

 # vtworker helpers
 def run_vtworker(clargs, log_level='', auto_log=False, expect_fail=False, **kwargs):
   """Runs a vtworker process, returning the stdout and stderr"""
+  cmd, _ = _get_vtworker_cmd(clargs, log_level, auto_log)
+  if expect_fail:
+    return run_fail(cmd, **kwargs)
+  return run(cmd, **kwargs)
+
+def run_vtworker_bg(clargs, log_level='', auto_log=False, **kwargs):
+  """Starts a background vtworker process.
+
+  Returns:
+    proc - process returned by subprocess.Popen
+    port - int with the port number that the vtworker is running with
+  """
+  cmd, port = _get_vtworker_cmd(clargs, log_level, auto_log)
+  return run_bg(cmd, **kwargs), port
+
+def _get_vtworker_cmd(clargs, log_level='', auto_log=False):
+  """Assembles the command that is needed to run a vtworker.
+
+  Returns:
+    cmd - list of cmd arguments, can be passed to any `run`-like functions
+    port - int with the port number that the vtworker is running with
+  """
+  port = environment.reserve_ports(1)
   args = environment.binary_args('vtworker') + [
     '-log_dir', environment.vtlogroot,
     '-min_healthy_rdonly_endpoints', '1',
-    '-port', str(environment.reserve_ports(1))]
+    '-port', str(port),
+    '-resolve_ttl', '2s',
+    '-executefetch_retry_time', '1s',
+    ]
   args.extend(environment.topo_server().flags())
   args.extend(protocols_flavor().tablet_manager_protocol_flags())
@@ -481,9 +562,7 @@ def run_vtworker(clargs, log_level='', auto_log=False, expect_fail=False, **kwar
     args.append('--stderrthreshold=%s' % log_level)

   cmd = args + clargs
-  if expect_fail:
-    return run_fail(cmd, **kwargs)
-  return run(cmd, **kwargs)
+  return cmd, port

 # vtclient2 helpers
 # driver is one of:
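Together these let a test start a vtworker in the background and poll its /debug/vars while it runs, as the new worker.py does (abbreviated from its SplitClone invocation below):

```python
worker_proc, worker_port = utils.run_vtworker_bg(['--cell', 'test_nj',
                                'SplitClone',
                                '--strategy=-populate_blp_checkpoint',
                                'test_keyspace/0'],
                               auto_log=True)
```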
worker.py (new file, 438 lines)

@@ -0,0 +1,438 @@
#!/usr/bin/env python
#
# Copyright 2013, Google Inc. All rights reserved.
# Use of this source code is governed by a BSD-style license that can
# be found in the LICENSE file.
"""
Tests the robustness and resiliency of vtworkers.
"""

import logging
import unittest
from collections import namedtuple

from vtdb import keyrange_constants

import environment
import utils
import tablet


KEYSPACE_ID_TYPE = keyrange_constants.KIT_UINT64


class ShardTablets(namedtuple('ShardTablets', 'master replicas rdonlys')):
  """ShardTablets is a container for all the tablet.Tablets of a shard.

  `master` should be a single Tablet, while `replicas` and `rdonlys` should be
  lists of Tablets of the appropriate types.
  """

  @property
  def all_tablets(self):
    """Returns a list of all the tablets of the shard.

    Does not guarantee any ordering on the returned tablets.
    """
    return [self.master] + self.replicas + self.rdonlys

  @property
  def replica(self):
    """Returns the first replica Tablet instance for the shard, or None."""
    if self.replicas:
      return self.replicas[0]
    else:
      return None

  @property
  def rdonly(self):
    """Returns the first rdonly Tablet instance for the shard, or None."""
    if self.rdonlys:
      return self.rdonlys[0]
    else:
      return None

# initial shard, covers everything
shard_master = tablet.Tablet()
shard_replica = tablet.Tablet()
shard_rdonly1 = tablet.Tablet()

# split shards
# range "" - 80
shard_0_master = tablet.Tablet()
shard_0_replica = tablet.Tablet()
shard_0_rdonly1 = tablet.Tablet()
# range 80 - ""
shard_1_master = tablet.Tablet()
shard_1_replica = tablet.Tablet()
shard_1_rdonly1 = tablet.Tablet()

shard_tablets = ShardTablets(shard_master, [shard_replica], [shard_rdonly1])
shard_0_tablets = ShardTablets(shard_0_master, [shard_0_replica], [shard_0_rdonly1])
shard_1_tablets = ShardTablets(shard_1_master, [shard_1_replica], [shard_1_rdonly1])


def init_keyspace():
  """Creates a `test_keyspace` keyspace with a sharding key."""
  utils.run_vtctl(['CreateKeyspace', '-sharding_column_name', 'keyspace_id',
                   '-sharding_column_type', KEYSPACE_ID_TYPE, 'test_keyspace'])


def setUpModule():
  try:
    environment.topo_server().setup()

    setup_procs = [
        shard_master.init_mysql(),
        shard_replica.init_mysql(),
        shard_rdonly1.init_mysql(),
        shard_0_master.init_mysql(),
        shard_0_replica.init_mysql(),
        shard_0_rdonly1.init_mysql(),
        shard_1_master.init_mysql(),
        shard_1_replica.init_mysql(),
        shard_1_rdonly1.init_mysql(),
        ]
    utils.wait_procs(setup_procs)
    init_keyspace()
  except:
    tearDownModule()
    raise


def tearDownModule():
  if utils.options.skip_teardown:
    return

  teardown_procs = [
      shard_master.teardown_mysql(),
      shard_replica.teardown_mysql(),
      shard_rdonly1.teardown_mysql(),
      shard_0_master.teardown_mysql(),
      shard_0_replica.teardown_mysql(),
      shard_0_rdonly1.teardown_mysql(),
      shard_1_master.teardown_mysql(),
      shard_1_replica.teardown_mysql(),
      shard_1_rdonly1.teardown_mysql(),
      ]
  utils.wait_procs(teardown_procs, raise_on_error=False)

  environment.topo_server().teardown()
  utils.kill_sub_processes()
  utils.remove_tmp_files()

  shard_master.remove_tree()
  shard_replica.remove_tree()
  shard_rdonly1.remove_tree()
  shard_0_master.remove_tree()
  shard_0_replica.remove_tree()
  shard_0_rdonly1.remove_tree()
  shard_1_master.remove_tree()
  shard_1_replica.remove_tree()
  shard_1_rdonly1.remove_tree()

class TestBaseSplitCloneResiliency(unittest.TestCase):
  """Tests that the SplitClone worker is resilient to particular failures."""

  def run_shard_tablets(self, shard_name, shard_tablets, create_db=True, create_table=True, wait_state='SERVING'):
    """Handles all the necessary work for initially running a shard's tablets.

    This encompasses the following steps:
      1. InitTablet for the appropriate tablets and types
      2. (optional) Create db
      3. Starting vttablets
      4. Waiting for the appropriate vttablet state
      5. Force reparent to the master tablet
      6. RebuildKeyspaceGraph
      7. (optional) Running initial schema setup

    Args:
      shard_name - the name of the shard to start tablets in
      shard_tablets - an instance of ShardTablets for the given shard
      wait_state - string, the vttablet state that we should wait for
      create_db - boolean, True iff we should create a db on the tablets
      create_table - boolean, True iff we should create a table on the tablets
    """
    shard_tablets.master.init_tablet('master', 'test_keyspace', shard_name)
    for tablet in shard_tablets.replicas:
      tablet.init_tablet('replica', 'test_keyspace', shard_name)
    for tablet in shard_tablets.rdonlys:
      tablet.init_tablet('rdonly', 'test_keyspace', shard_name)

    # Start tablets (and possibly create databases)
    for tablet in shard_tablets.all_tablets:
      if create_db:
        tablet.create_db('vt_test_keyspace')
      tablet.start_vttablet(wait_for_state=None)

    # Wait for tablet state to change after starting all tablets. This allows
    # us to start all tablets at once, instead of sequentially waiting.
    for tablet in shard_tablets.all_tablets:
      tablet.wait_for_vttablet_state(wait_state)

    # Reparent to choose an initial master
    utils.run_vtctl(['InitShardMaster', 'test_keyspace/%s' % shard_name,
                     shard_tablets.master.tablet_alias], auto_log=True)
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)

    create_table_sql = (
        'create table worker_test('
        'id bigint unsigned,'
        'msg varchar(64),'
        'keyspace_id bigint(20) unsigned not null,'
        'primary key (id),'
        'index by_msg (msg)'
        ') Engine=InnoDB'
    )

    if create_table:
      utils.run_vtctl(['ApplySchema',
                       '-sql=' + create_table_sql,
                       'test_keyspace'],
                      auto_log=True)

  def _insert_value(self, tablet, id, msg, keyspace_id):
    """Inserts a value in the MySQL database along with the required routing comments.

    Args:
      tablet - the Tablet instance to insert into
      id - the value of `id` column
      msg - the value of `msg` column
      keyspace_id - the value of `keyspace_id` column
    """
    k = "%u" % keyspace_id
    tablet.mquery('vt_test_keyspace', [
        'begin',
        'insert into worker_test(id, msg, keyspace_id) values(%u, "%s", 0x%x) /* EMD keyspace_id:%s user_id:%u */' % (id, msg, keyspace_id, k, id),
        'commit'
        ], write=True)

  def insert_values(self, tablet, num_values, num_shards, offset=0, keyspace_id_range=2**64):
    """Inserts simple values, one for each potential shard.

    Each row is given a message that contains the shard number, so we can easily
    verify that the source and destination shards have the same data.

    Args:
      tablet - the Tablet instance to insert into
      num_values - the number of values to insert
      num_shards - the number of shards that we expect to have
      offset - amount that we should offset the `id`s by. This is useful for
        inserting values multiple times.
      keyspace_id_range - the number of distinct values that the keyspace id can have
    """
    shard_width = keyspace_id_range / num_shards
    shard_offsets = [i * shard_width for i in xrange(num_shards)]
    for i in xrange(num_values):
      for shard_num in xrange(num_shards):
        self._insert_value(tablet, shard_offsets[shard_num] + offset + i,
                           'msg-shard-%u' % shard_num,
                           shard_offsets[shard_num] + i)
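    # With the defaults used in this test (num_shards=2, keyspace_id_range=2**64),
    # shard_width is 2**63, so shard_offsets is [0, 9223372036854775808]: rows
    # are split at keyspace_id 0x8000000000000000, matching the '-80' and '80-'
    # destination shards.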

  def assert_shard_data_equal(self, shard_num, source_tablet, destination_tablet):
    """Asserts that a shard's data is identical on source and destination tablets.

    Args:
      shard_num - the shard number of the shard that we want to verify the data of
      source_tablet - Tablet instance of the source shard
      destination_tablet - Tablet instance of the destination shard
    """
    select_query = 'select * from worker_test where msg="msg-shard-%s" order by id asc' % shard_num

    source_rows = source_tablet.mquery('vt_test_keyspace', select_query)
    destination_rows = destination_tablet.mquery('vt_test_keyspace', select_query)
    self.assertEqual(source_rows, destination_rows)

  def run_split_diff(self, keyspace_shard, source_tablets, destination_tablets):
    """Runs a vtworker SplitDiff on the given keyspace/shard, and then sets all
    former rdonly slaves back to rdonly.

    Args:
      keyspace_shard - keyspace/shard to run SplitDiff on (string)
      source_tablets - ShardTablets instance for the source shard
      destination_tablets - ShardTablets instance for the destination shard
    """
    logging.debug("Running vtworker SplitDiff for %s" % keyspace_shard)
    stdout, stderr = utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
                                         keyspace_shard], auto_log=True)

    for shard_tablets in (source_tablets, destination_tablets):
      for tablet in shard_tablets.rdonlys:
        utils.run_vtctl(['ChangeSlaveType', tablet.tablet_alias, 'rdonly'],
                        auto_log=True)

  def setUp(self):
    """Creates the necessary shards, starts the tablets, and inserts some data."""
    self.run_shard_tablets('0', shard_tablets)
    # create the split shards
    self.run_shard_tablets('-80', shard_0_tablets, create_db=False,
                           create_table=False, wait_state='NOT_SERVING')
    self.run_shard_tablets('80-', shard_1_tablets, create_db=False,
                           create_table=False, wait_state='NOT_SERVING')

    # Copy the schema to the destination shards
    for keyspace_shard in ('test_keyspace/-80', 'test_keyspace/80-'):
      utils.run_vtctl(['CopySchemaShard',
                       '--exclude_tables', 'unrelated',
                       shard_rdonly1.tablet_alias,
                       keyspace_shard],
                      auto_log=True)

    logging.debug("Start inserting initial data: %s rows", utils.options.num_insert_rows)
    self.insert_values(shard_master, utils.options.num_insert_rows, 2)
    logging.debug("Done inserting initial data, waiting for replication to catch up")
    utils.wait_for_replication_pos(shard_master, shard_rdonly1)
    logging.debug("Replication on source rdonly tablet is caught up")

  def tearDown(self):
    """Tries to do the minimum to reset topology and tablets to their initial states.

    When benchmarked, this seemed to take around 30% of the time of
    (setUpModule + tearDownModule).
    """
    for shard_tablet in [shard_tablets, shard_0_tablets, shard_1_tablets]:
      for tablet in shard_tablet.all_tablets:
        tablet.clean_dbs()
        tablet.scrap(force=True, skip_rebuild=True)
        utils.run_vtctl(['DeleteTablet', tablet.tablet_alias], auto_log=True)
        tablet.kill_vttablet()
    utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
    for shard in ['0', '-80', '80-']:
      utils.run_vtctl(['DeleteShard', 'test_keyspace/%s' % shard], auto_log=True)

  def verify_successful_worker_copy_with_reparent(self, mysql_down=False):
    """Verifies that vtworker can successfully copy data for a SplitClone.

    Order of operations:
    1. Run a background vtworker
    2. Wait until the worker successfully resolves the destination masters.
    3. Reparent the destination tablets
    4. Wait until the vtworker copy is finished
    5. Verify that the worker was forced to reresolve topology and retry writes
      due to the reparent.
    6. Verify that the data was copied successfully to both new shards

    Args:
      mysql_down - boolean, True iff we expect the MySQL instances on the
        destination masters to be down.

    Raises:
      AssertionError if things didn't go as expected.
    """
    worker_proc, worker_port = utils.run_vtworker_bg(['--cell', 'test_nj',
                                    'SplitClone',
                                    '--source_reader_count', '1',
                                    '--destination_pack_count', '1',
                                    '--destination_writer_count', '1',
                                    '--strategy=-populate_blp_checkpoint',
                                    'test_keyspace/0'],
                                   auto_log=True)

    if mysql_down:
      # If MySQL is down, we wait until resolving at least twice (to verify that
      # we do reresolve and retry due to MySQL being down).
      worker_vars = utils.poll_for_vars('vtworker', worker_port,
                             'WorkerDestinationActualResolves >= 2',
                             condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 2)
      self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                          "expected vtworker to retry, but it didn't")
      logging.debug("Worker has resolved at least twice, starting reparent now")

      # Original masters have no running MySQL, so need to force the reparent
      utils.run_vtctl(['EmergencyReparentShard', 'test_keyspace/-80',
                       shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(['EmergencyReparentShard', 'test_keyspace/80-',
                       shard_1_replica.tablet_alias], auto_log=True)

    else:
      utils.poll_for_vars('vtworker', worker_port,
                          'WorkerDestinationActualResolves >= 1',
                          condition_fn=lambda v: v.get('WorkerDestinationActualResolves') >= 1)
      logging.debug("Worker has resolved at least once, starting reparent now")

      utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/-80',
                       shard_0_replica.tablet_alias], auto_log=True)
      utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-',
                       shard_1_replica.tablet_alias], auto_log=True)

    logging.debug("Polling for worker state")
    # There are a couple of race conditions around this, that we need to be careful of:
    # 1. It's possible for the reparent step to take so long that the worker will
    #   actually finish before we get to the polling step. To work around this,
    #   the test takes a parameter to increase the number of rows that the worker
    #   has to copy (with the idea being to slow the worker down).
    # 2. If the worker has a huge number of rows to copy, it's possible for the
    #   polling to timeout before the worker has finished copying the data.
    #
    # You should choose a value for num_insert_rows, such that this test passes
    # for your environment (trial-and-error...)
    worker_vars = utils.poll_for_vars('vtworker', worker_port,
                           'WorkerState == cleaning up',
                           condition_fn=lambda v: v.get('WorkerState') == 'cleaning up')

    # Verify that we were forced to reresolve and retry.
    self.assertGreater(worker_vars['WorkerDestinationActualResolves'], 1)
    self.assertGreater(worker_vars['WorkerDestinationAttemptedResolves'], 1)
    self.assertNotEqual(worker_vars['WorkerRetryCount'], {},
                        "expected vtworker to retry, but it didn't")

    utils.wait_procs([worker_proc])

    utils.run_vtctl(['ChangeSlaveType', shard_rdonly1.tablet_alias, 'rdonly'],
                    auto_log=True)

    # Make sure that everything is caught up to the same replication point
    self.run_split_diff('test_keyspace/-80', shard_tablets, shard_0_tablets)
    self.run_split_diff('test_keyspace/80-', shard_tablets, shard_1_tablets)

    self.assert_shard_data_equal(0, shard_master, shard_0_tablets.replica)
    self.assert_shard_data_equal(1, shard_master, shard_1_tablets.replica)

class TestReparentDuringWorkerCopy(TestBaseSplitCloneResiliency):

  def test_reparent_during_worker_copy(self):
    """This test simulates a destination reparent during a worker SplitClone copy.

    The SplitClone command should be able to gracefully handle the reparent and
    end up with the correct data on the destination.

    Note: this test has a small possibility of flaking, due to the timing issues
    involved. It's possible for the worker to finish the copy step before the
    reparent succeeds, in which case there are assertions that will fail. This
    seems better than having the test silently pass.
    """
    self.verify_successful_worker_copy_with_reparent()


class TestMysqlDownDuringWorkerCopy(TestBaseSplitCloneResiliency):

  def setUp(self):
    """Shuts down MySQL on the destination masters (in addition to the base setup)."""
    logging.debug("Starting base setup for MysqlDownDuringWorkerCopy")
    super(TestMysqlDownDuringWorkerCopy, self).setUp()
    logging.debug("Starting MysqlDownDuringWorkerCopy-specific setup")
    utils.wait_procs([shard_0_master.shutdown_mysql(),
                      shard_1_master.shutdown_mysql()])
    logging.debug("Finished MysqlDownDuringWorkerCopy-specific setup")

  def tearDown(self):
    """Restarts the MySQL processes that were killed during the setup."""
    logging.debug("Starting MysqlDownDuringWorkerCopy-specific tearDown")
    utils.wait_procs([shard_0_master.start_mysql(),
                      shard_1_master.start_mysql()])
    logging.debug("Finished MysqlDownDuringWorkerCopy-specific tearDown")
    super(TestMysqlDownDuringWorkerCopy, self).tearDown()
    logging.debug("Finished base tearDown for MysqlDownDuringWorkerCopy")

  def test_mysql_down_during_worker_copy(self):
    """This test simulates MySQL being down on the destination masters."""
    self.verify_successful_worker_copy_with_reparent(mysql_down=True)


def add_test_options(parser):
  parser.add_option('--num_insert_rows', type="int", default=3000,
                    help="The number of rows, per shard, that we should insert before resharding for this test.")

if __name__ == '__main__':
  utils.main(test_options=add_test_options)