зеркало из https://github.com/github/vitess-gh.git
874 строки
39 KiB
Python
Executable File
874 строки
39 KiB
Python
Executable File
#!/usr/bin/env python
|
|
#
|
|
# Copyright 2013, Google Inc. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can
|
|
# be found in the LICENSE file.
|
|
|
|
"""This test covers a resharding scenario of an already sharded keyspace.
|
|
|
|
We start with shards -80 and 80-. We then split 80- into 80-c0 and c0-.
|
|
|
|
This test is the main resharding test. It not only tests the regular resharding
|
|
workflow for an horizontal split, but also a lot of error cases and side
|
|
effects, like:
|
|
- migrating the traffic one cell at a time.
|
|
- migrating rdonly traffic back and forth.
|
|
- making sure we can't migrate the master until replica and rdonly are migrated.
|
|
- has a background thread to insert data during migration.
|
|
- tests a destination shard master failover while replication is running.
|
|
- tests a filtered replication source replacement while filtered replication
|
|
is running.
|
|
- tests 'vtctl SourceShardAdd' and 'vtctl SourceShardDelete'.
|
|
- makes sure the key range rules are properly enforced on masters.
|
|
"""
|
|
|
|
import threading
|
|
import time
|
|
|
|
import logging
|
|
import unittest
|
|
|
|
from vtproto import topodata_pb2
|
|
|
|
from vtdb import keyrange_constants
|
|
|
|
import base_sharding
|
|
import environment
|
|
import tablet
|
|
import utils
|
|
|
|
# initial shards
|
|
# range '' - 80
|
|
shard_0_master = tablet.Tablet()
|
|
shard_0_replica = tablet.Tablet()
|
|
shard_0_ny_rdonly = tablet.Tablet(cell='ny')
|
|
# range 80 - ''
|
|
shard_1_master = tablet.Tablet()
|
|
shard_1_slave1 = tablet.Tablet()
|
|
shard_1_slave2 = tablet.Tablet()
|
|
shard_1_ny_rdonly = tablet.Tablet(cell='ny')
|
|
shard_1_rdonly1 = tablet.Tablet()
|
|
|
|
# split shards
|
|
# range 80 - c0
|
|
shard_2_master = tablet.Tablet()
|
|
shard_2_replica1 = tablet.Tablet()
|
|
shard_2_replica2 = tablet.Tablet()
|
|
shard_2_rdonly1 = tablet.Tablet()
|
|
# range c0 - ''
|
|
shard_3_master = tablet.Tablet()
|
|
shard_3_replica = tablet.Tablet()
|
|
shard_3_rdonly1 = tablet.Tablet()
|
|
|
|
all_tablets = [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
|
|
shard_1_master, shard_1_slave1, shard_1_slave2,
|
|
shard_1_ny_rdonly, shard_1_rdonly1,
|
|
shard_2_master, shard_2_replica1, shard_2_replica2,
|
|
shard_2_rdonly1,
|
|
shard_3_master, shard_3_replica, shard_3_rdonly1]
|
|
|
|
|
|
def setUpModule():
|
|
try:
|
|
environment.topo_server().setup()
|
|
setup_procs = [t.init_mysql() for t in all_tablets]
|
|
utils.Vtctld().start()
|
|
utils.wait_procs(setup_procs)
|
|
except:
|
|
tearDownModule()
|
|
raise
|
|
|
|
|
|
def tearDownModule():
|
|
utils.required_teardown()
|
|
if utils.options.skip_teardown:
|
|
return
|
|
|
|
teardown_procs = [t.teardown_mysql() for t in all_tablets]
|
|
utils.wait_procs(teardown_procs, raise_on_error=False)
|
|
environment.topo_server().teardown()
|
|
utils.kill_sub_processes()
|
|
utils.remove_tmp_files()
|
|
for t in all_tablets:
|
|
t.remove_tree()
|
|
|
|
|
|
# InsertThread will insert a value into the timestamps table, and then
|
|
# every 1/5s will update its value with the current timestamp
|
|
class InsertThread(threading.Thread):
|
|
|
|
def __init__(self, tablet_obj, thread_name, thread_id, user_id,
|
|
keyspace_id):
|
|
threading.Thread.__init__(self)
|
|
self.tablet = tablet_obj
|
|
self.thread_name = thread_name
|
|
self.thread_id = thread_id
|
|
self.user_id = user_id
|
|
self.keyspace_id = keyspace_id
|
|
self.str_keyspace_id = utils.uint64_to_hex(keyspace_id)
|
|
self.done = False
|
|
|
|
self.tablet.mquery(
|
|
'vt_test_keyspace',
|
|
['begin',
|
|
'insert into timestamps(id, time_milli, custom_ksid_col) '
|
|
'values(%d, %d, 0x%x) '
|
|
'/* vtgate:: keyspace_id:%s */ /* user_id:%d */' %
|
|
(self.thread_id, long(time.time() * 1000), self.keyspace_id,
|
|
self.str_keyspace_id, self.user_id),
|
|
'commit'],
|
|
write=True, user='vt_app')
|
|
self.start()
|
|
|
|
def run(self):
|
|
try:
|
|
while not self.done:
|
|
self.tablet.mquery(
|
|
'vt_test_keyspace',
|
|
['begin',
|
|
'update timestamps set time_milli=%d '
|
|
'where id=%d /* vtgate:: keyspace_id:%s */ /* user_id:%d */' %
|
|
(long(time.time() * 1000), self.thread_id,
|
|
self.str_keyspace_id, self.user_id),
|
|
'commit'],
|
|
write=True, user='vt_app')
|
|
time.sleep(0.2)
|
|
except Exception: # pylint: disable=broad-except
|
|
logging.exception('InsertThread got exception.')
|
|
|
|
|
|
# MonitorLagThread will get values from a database, and compare the timestamp
|
|
# to evaluate lag. Since the qps is really low, and we send binlogs as chunks,
|
|
# the latency is pretty high (a few seconds).
|
|
class MonitorLagThread(threading.Thread):
|
|
|
|
def __init__(self, tablet_obj, thread_name, thread_id):
|
|
threading.Thread.__init__(self)
|
|
self.tablet = tablet_obj
|
|
self.thread_name = thread_name
|
|
self.thread_id = thread_id
|
|
self.done = False
|
|
self.max_lag_ms = 0
|
|
self.lag_sum_ms = 0
|
|
self.sample_count = 0
|
|
self.start()
|
|
|
|
def run(self):
|
|
try:
|
|
while not self.done:
|
|
result = self.tablet.mquery(
|
|
'vt_test_keyspace',
|
|
'select time_milli from timestamps where id=%d' %
|
|
self.thread_id)
|
|
if result:
|
|
lag_ms = long(time.time() * 1000) - long(result[0][0])
|
|
logging.debug('MonitorLagThread(%s) got %d ms',
|
|
self.thread_name, lag_ms)
|
|
self.sample_count += 1
|
|
self.lag_sum_ms += lag_ms
|
|
if lag_ms > self.max_lag_ms:
|
|
self.max_lag_ms = lag_ms
|
|
time.sleep(1.0)
|
|
except Exception: # pylint: disable=broad-except
|
|
logging.exception('MonitorLagThread got exception.')
|
|
|
|
|
|
class TestResharding(unittest.TestCase, base_sharding.BaseShardingTest):
|
|
|
|
# create_schema will create the same schema on the keyspace
|
|
# then insert some values
|
|
def _create_schema(self):
|
|
if base_sharding.keyspace_id_type == keyrange_constants.KIT_BYTES:
|
|
t = 'varbinary(64)'
|
|
else:
|
|
t = 'bigint(20) unsigned'
|
|
# Note that the primary key columns are not defined first on purpose to test
|
|
# that a reordered column list is correctly used everywhere in vtworker.
|
|
create_table_template = '''create table %s(
|
|
custom_ksid_col ''' + t + ''' not null,
|
|
msg varchar(64),
|
|
id bigint not null,
|
|
parent_id bigint not null,
|
|
primary key (parent_id, id),
|
|
index by_msg (msg)
|
|
) Engine=InnoDB'''
|
|
create_view_template = (
|
|
'create view %s'
|
|
'(parent_id, id, msg, custom_ksid_col)'
|
|
'as select parent_id, id, msg, custom_ksid_col '
|
|
'from %s')
|
|
create_timestamp_table = '''create table timestamps(
|
|
id int not null,
|
|
time_milli bigint(20) unsigned not null,
|
|
custom_ksid_col ''' + t + ''' not null,
|
|
primary key (id)
|
|
) Engine=InnoDB'''
|
|
create_unrelated_table = '''create table unrelated(
|
|
name varchar(64),
|
|
primary key (name)
|
|
) Engine=InnoDB'''
|
|
|
|
utils.run_vtctl(['ApplySchema',
|
|
'-sql=' + create_table_template % ('resharding1'),
|
|
'test_keyspace'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ApplySchema',
|
|
'-sql=' + create_table_template % ('resharding2'),
|
|
'test_keyspace'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ApplySchema',
|
|
'-sql=' + create_view_template % ('view1', 'resharding1'),
|
|
'test_keyspace'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ApplySchema',
|
|
'-sql=' + create_timestamp_table,
|
|
'test_keyspace'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ApplySchema',
|
|
'-sql=' + create_unrelated_table,
|
|
'test_keyspace'],
|
|
auto_log=True)
|
|
|
|
def _insert_startup_values(self):
|
|
self._insert_value(shard_0_master, 'resharding1', 1, 'msg1',
|
|
0x1000000000000000)
|
|
self._insert_value(shard_1_master, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._insert_value(shard_1_master, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000)
|
|
|
|
def _check_startup_values(self):
|
|
# check first value is in the right shard
|
|
self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._check_value(shard_2_replica1, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._check_value(shard_2_replica2, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._check_value(shard_2_rdonly1, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000, should_be_here=False)
|
|
self._check_value(shard_3_replica, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000, should_be_here=False)
|
|
self._check_value(shard_3_rdonly1, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000, should_be_here=False)
|
|
|
|
# check second value is in the right shard too
|
|
self._check_value(shard_2_master, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000, should_be_here=False)
|
|
self._check_value(shard_2_replica1, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000, should_be_here=False)
|
|
self._check_value(shard_2_replica2, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000, should_be_here=False)
|
|
self._check_value(shard_2_rdonly1, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000, should_be_here=False)
|
|
self._check_value(shard_3_master, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000)
|
|
self._check_value(shard_3_replica, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000)
|
|
self._check_value(shard_3_rdonly1, 'resharding1', 3, 'msg3',
|
|
0xD000000000000000)
|
|
|
|
def _insert_lots(self, count, base=0):
|
|
for i in xrange(count):
|
|
self._insert_value(shard_1_master, 'resharding1', 10000 + base + i,
|
|
'msg-range1-%d' % i, 0xA000000000000000 + base + i)
|
|
self._insert_value(shard_1_master, 'resharding1', 20000 + base + i,
|
|
'msg-range2-%d' % i, 0xE000000000000000 + base + i)
|
|
|
|
# _check_lots returns how many of the values we have, in percents.
|
|
def _check_lots(self, count, base=0):
|
|
found = 0
|
|
for i in xrange(count):
|
|
if self._is_value_present_and_correct(shard_2_replica2, 'resharding1',
|
|
10000 + base + i, 'msg-range1-%d' %
|
|
i, 0xA000000000000000 + base + i):
|
|
found += 1
|
|
if self._is_value_present_and_correct(shard_3_replica, 'resharding1',
|
|
20000 + base + i, 'msg-range2-%d' %
|
|
i, 0xE000000000000000 + base + i):
|
|
found += 1
|
|
percent = found * 100 / count / 2
|
|
logging.debug('I have %d%% of the data', percent)
|
|
return percent
|
|
|
|
def _check_lots_timeout(self, count, threshold, timeout, base=0):
|
|
while True:
|
|
value = self._check_lots(count, base=base)
|
|
if value >= threshold:
|
|
return value
|
|
timeout = utils.wait_step('waiting for %d%% of the data' % threshold,
|
|
timeout, sleep_time=1)
|
|
|
|
# _check_lots_not_present makes sure no data is in the wrong shard
|
|
def _check_lots_not_present(self, count, base=0):
|
|
for i in xrange(count):
|
|
self._check_value(shard_3_replica, 'resharding1', 10000 + base + i,
|
|
'msg-range1-%d' % i, 0xA000000000000000 + base + i,
|
|
should_be_here=False)
|
|
self._check_value(shard_2_replica2, 'resharding1', 20000 + base + i,
|
|
'msg-range2-%d' % i, 0xE000000000000000 + base + i,
|
|
should_be_here=False)
|
|
|
|
def _test_keyrange_constraints(self):
|
|
with self.assertRaisesRegexp(
|
|
Exception, '.*enforce custom_ksid_col range.*'):
|
|
shard_0_master.execute(
|
|
"insert into resharding1(id, msg, custom_ksid_col) "
|
|
" values(1, 'msg', :custom_ksid_col)",
|
|
bindvars={'custom_ksid_col': 0x9000000000000000},
|
|
)
|
|
with self.assertRaisesRegexp(
|
|
Exception, '.*enforce custom_ksid_col range.*'):
|
|
shard_0_master.execute(
|
|
"update resharding1 set msg = 'msg' where id = 1",
|
|
bindvars={'custom_ksid_col': 0x9000000000000000},
|
|
)
|
|
with self.assertRaisesRegexp(
|
|
Exception, '.*enforce custom_ksid_col range.*'):
|
|
shard_0_master.execute(
|
|
'delete from resharding1 where id = 1',
|
|
bindvars={'custom_ksid_col': 0x9000000000000000},
|
|
)
|
|
|
|
def test_resharding(self):
|
|
# we're going to reparent and swap these two
|
|
global shard_2_master, shard_2_replica1
|
|
|
|
utils.run_vtctl(['CreateKeyspace',
|
|
'--sharding_column_name', 'bad_column',
|
|
'--sharding_column_type', 'bytes',
|
|
'test_keyspace'])
|
|
utils.run_vtctl(['SetKeyspaceShardingInfo', 'test_keyspace',
|
|
'custom_ksid_col', 'uint64'], expect_fail=True)
|
|
utils.run_vtctl(['SetKeyspaceShardingInfo', '-force',
|
|
'test_keyspace',
|
|
'custom_ksid_col', base_sharding.keyspace_id_type])
|
|
|
|
shard_0_master.init_tablet('master', 'test_keyspace', '-80')
|
|
shard_0_replica.init_tablet('replica', 'test_keyspace', '-80')
|
|
shard_0_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '-80')
|
|
shard_1_master.init_tablet('master', 'test_keyspace', '80-')
|
|
shard_1_slave1.init_tablet('replica', 'test_keyspace', '80-')
|
|
shard_1_slave2.init_tablet('replica', 'test_keyspace', '80-')
|
|
shard_1_ny_rdonly.init_tablet('rdonly', 'test_keyspace', '80-')
|
|
shard_1_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-')
|
|
|
|
utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
|
|
ks = utils.run_vtctl_json(['GetSrvKeyspace', 'test_nj', 'test_keyspace'])
|
|
self.assertEqual(ks['sharding_column_name'], 'custom_ksid_col')
|
|
|
|
# we set full_mycnf_args to True as a test in the KIT_BYTES case
|
|
full_mycnf_args = (base_sharding.keyspace_id_type ==
|
|
keyrange_constants.KIT_BYTES)
|
|
|
|
# create databases so vttablet can start behaving somewhat normally
|
|
for t in [shard_0_master, shard_0_replica, shard_0_ny_rdonly,
|
|
shard_1_master, shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
|
|
shard_1_rdonly1]:
|
|
t.create_db('vt_test_keyspace')
|
|
t.start_vttablet(wait_for_state=None, full_mycnf_args=full_mycnf_args)
|
|
|
|
# wait for the tablets (replication is not setup, the slaves won't be
|
|
# healthy)
|
|
shard_0_master.wait_for_vttablet_state('SERVING')
|
|
shard_0_replica.wait_for_vttablet_state('NOT_SERVING')
|
|
shard_0_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
|
|
shard_1_master.wait_for_vttablet_state('SERVING')
|
|
shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
|
|
shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
|
|
shard_1_ny_rdonly.wait_for_vttablet_state('NOT_SERVING')
|
|
shard_1_rdonly1.wait_for_vttablet_state('NOT_SERVING')
|
|
|
|
# reparent to make the tablets work
|
|
utils.run_vtctl(['InitShardMaster', 'test_keyspace/-80',
|
|
shard_0_master.tablet_alias], auto_log=True)
|
|
utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-',
|
|
shard_1_master.tablet_alias], auto_log=True)
|
|
|
|
# check the shards
|
|
shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
|
|
self.assertIn('-80', shards, 'unexpected shards: %s' % str(shards))
|
|
self.assertIn('80-', shards, 'unexpected shards: %s' % str(shards))
|
|
self.assertEqual(len(shards), 2, 'unexpected shards: %s' % str(shards))
|
|
|
|
# create the tables
|
|
self._create_schema()
|
|
self._insert_startup_values()
|
|
self._test_keyrange_constraints()
|
|
|
|
# run a health check on source replicas so they respond to discovery
|
|
# (for binlog players) and on the source rdonlys (for workers)
|
|
for t in [shard_0_replica, shard_1_slave1]:
|
|
utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
|
|
for t in [shard_0_ny_rdonly, shard_1_ny_rdonly, shard_1_rdonly1]:
|
|
utils.run_vtctl(['RunHealthCheck', t.tablet_alias])
|
|
|
|
# create the split shards
|
|
shard_2_master.init_tablet('master', 'test_keyspace', '80-c0')
|
|
shard_2_replica1.init_tablet('replica', 'test_keyspace', '80-c0')
|
|
shard_2_replica2.init_tablet('replica', 'test_keyspace', '80-c0')
|
|
shard_2_rdonly1.init_tablet('rdonly', 'test_keyspace', '80-c0')
|
|
shard_3_master.init_tablet('master', 'test_keyspace', 'c0-')
|
|
shard_3_replica.init_tablet('replica', 'test_keyspace', 'c0-')
|
|
shard_3_rdonly1.init_tablet('rdonly', 'test_keyspace', 'c0-')
|
|
|
|
# start vttablet on the split shards (no db created,
|
|
# so they're all not serving)
|
|
shard_2_master.start_vttablet(wait_for_state=None)
|
|
shard_3_master.start_vttablet(wait_for_state=None)
|
|
for t in [shard_2_replica1, shard_2_replica2, shard_2_rdonly1,
|
|
shard_3_replica, shard_3_rdonly1]:
|
|
t.start_vttablet(wait_for_state=None)
|
|
for t in [shard_2_master, shard_2_replica1, shard_2_replica2,
|
|
shard_2_rdonly1,
|
|
shard_3_master, shard_3_replica, shard_3_rdonly1]:
|
|
t.wait_for_vttablet_state('NOT_SERVING')
|
|
|
|
utils.run_vtctl(['InitShardMaster', 'test_keyspace/80-c0',
|
|
shard_2_master.tablet_alias], auto_log=True)
|
|
utils.run_vtctl(['InitShardMaster', 'test_keyspace/c0-',
|
|
shard_3_master.tablet_alias], auto_log=True)
|
|
|
|
# check the shards
|
|
shards = utils.run_vtctl_json(['FindAllShardsInKeyspace', 'test_keyspace'])
|
|
for s in ['-80', '80-', '80-c0', 'c0-']:
|
|
self.assertIn(s, shards, 'unexpected shards: %s' % str(shards))
|
|
self.assertEqual(len(shards), 4, 'unexpected shards: %s' % str(shards))
|
|
|
|
utils.run_vtctl(['RebuildKeyspaceGraph', 'test_keyspace'],
|
|
auto_log=True)
|
|
utils.check_srv_keyspace(
|
|
'test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
|
|
# disable shard_1_slave2, so we're sure filtered replication will go
|
|
# from shard_1_slave1
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'spare'])
|
|
shard_1_slave2.wait_for_vttablet_state('NOT_SERVING')
|
|
|
|
# we need to create the schema, and the worker will do data copying
|
|
for keyspace_shard in ('test_keyspace/80-c0', 'test_keyspace/c0-'):
|
|
utils.run_vtctl(['CopySchemaShard', '--exclude_tables', 'unrelated',
|
|
shard_1_rdonly1.tablet_alias, keyspace_shard],
|
|
auto_log=True)
|
|
|
|
# Run vtworker as daemon for the following SplitClone commands.
|
|
worker_proc, worker_port, worker_rpc_port = utils.run_vtworker_bg(
|
|
['--cell', 'test_nj', '--command_display_interval', '10ms'],
|
|
auto_log=True)
|
|
|
|
# Copy the data from the source to the destination shards.
|
|
# --max_tps is only specified to enable the throttler and ensure that the
|
|
# code is executed. But the intent here is not to throttle the test, hence
|
|
# the rate limit is set very high.
|
|
#
|
|
# Initial clone (online).
|
|
workerclient_proc = utils.run_vtworker_client_bg(
|
|
['SplitClone',
|
|
'--offline=false',
|
|
'--exclude_tables', 'unrelated',
|
|
'--chunk_count', '10',
|
|
'--min_rows_per_chunk', '1',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'--max_tps', '9999',
|
|
'test_keyspace/80-'],
|
|
worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
|
|
2, 0, 0)
|
|
|
|
# Reset vtworker such that we can run the next command.
|
|
workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
|
|
# Test the correct handling of keyspace_id changes which happen after
|
|
# the first clone.
|
|
# Let row 2 go to shard 3 instead of shard 2.
|
|
shard_1_master.mquery('vt_test_keyspace',
|
|
'update resharding1 set'
|
|
' custom_ksid_col=0xD000000000000000 WHERE id=2',
|
|
write=True)
|
|
workerclient_proc = utils.run_vtworker_client_bg(
|
|
['SplitClone',
|
|
'--offline=false',
|
|
'--exclude_tables', 'unrelated',
|
|
'--chunk_count', '10',
|
|
'--min_rows_per_chunk', '1',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'--max_tps', '9999',
|
|
'test_keyspace/80-'],
|
|
worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
# Row 2 will be deleted from shard 2 and inserted to shard 3.
|
|
self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
|
|
1, 0, 1)
|
|
self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
|
|
0xD000000000000000, should_be_here=False)
|
|
self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
|
|
0xD000000000000000)
|
|
# Reset vtworker such that we can run the next command.
|
|
workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
|
|
# Move row 2 back to shard 2 from shard 3 by changing the keyspace_id again.
|
|
shard_1_master.mquery('vt_test_keyspace',
|
|
'update resharding1 set'
|
|
' custom_ksid_col=0x9000000000000000 WHERE id=2',
|
|
write=True)
|
|
workerclient_proc = utils.run_vtworker_client_bg(
|
|
['SplitClone',
|
|
'--offline=false',
|
|
'--exclude_tables', 'unrelated',
|
|
'--chunk_count', '10',
|
|
'--min_rows_per_chunk', '1',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'--max_tps', '9999',
|
|
'test_keyspace/80-'],
|
|
worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
# Row 2 will be deleted from shard 3 and inserted to shard 2.
|
|
self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
|
|
1, 0, 1)
|
|
self._check_value(shard_2_master, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000)
|
|
self._check_value(shard_3_master, 'resharding1', 2, 'msg2',
|
|
0x9000000000000000, should_be_here=False)
|
|
# Reset vtworker such that we can run the next command.
|
|
workerclient_proc = utils.run_vtworker_client_bg(['Reset'], worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
|
|
# Modify the destination shard. SplitClone will revert the changes.
|
|
# Delete row 2 (provokes an insert).
|
|
shard_2_master.mquery('vt_test_keyspace',
|
|
'delete from resharding1 where id=2', write=True)
|
|
# Update row 3 (provokes an update).
|
|
shard_3_master.mquery('vt_test_keyspace',
|
|
"update resharding1 set msg='msg-not-3' where id=3",
|
|
write=True)
|
|
# Insert row 4 and 5 (provokes a delete).
|
|
self._insert_value(shard_3_master, 'resharding1', 4, 'msg4',
|
|
0xD000000000000000)
|
|
self._insert_value(shard_3_master, 'resharding1', 5, 'msg5',
|
|
0xD000000000000000)
|
|
|
|
workerclient_proc = utils.run_vtworker_client_bg(
|
|
['SplitClone',
|
|
'--exclude_tables', 'unrelated',
|
|
'--chunk_count', '10',
|
|
'--min_rows_per_chunk', '1',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'--max_tps', '9999',
|
|
'test_keyspace/80-'],
|
|
worker_rpc_port)
|
|
utils.wait_procs([workerclient_proc])
|
|
# Change tablet, which was taken offline, back to rdonly.
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias,
|
|
'rdonly'], auto_log=True)
|
|
self.verify_reconciliation_counters(worker_port, 'Online', 'resharding1',
|
|
1, 1, 2)
|
|
self.verify_reconciliation_counters(worker_port, 'Offline', 'resharding1',
|
|
0, 0, 0)
|
|
# Terminate worker daemon because it is no longer needed.
|
|
utils.kill_sub_process(worker_proc, soft=True)
|
|
|
|
# TODO(alainjobart): experiment with the dontStartBinlogPlayer option
|
|
|
|
# check the startup values are in the right place
|
|
self._check_startup_values()
|
|
|
|
# check the schema too
|
|
utils.run_vtctl(['ValidateSchemaKeyspace', '--exclude_tables=unrelated',
|
|
'test_keyspace'], auto_log=True)
|
|
|
|
# check the binlog players are running and exporting vars
|
|
self.check_destination_master(shard_2_master, ['test_keyspace/80-'])
|
|
self.check_destination_master(shard_3_master, ['test_keyspace/80-'])
|
|
|
|
# check that binlog server exported the stats vars
|
|
self.check_binlog_server_vars(shard_1_slave1, horizontal=True)
|
|
|
|
# Check that the throttler was enabled.
|
|
self.check_throttler_service(shard_2_master.rpc_endpoint(),
|
|
['BinlogPlayer/0'], 9999)
|
|
self.check_throttler_service(shard_3_master.rpc_endpoint(),
|
|
['BinlogPlayer/0'], 9999)
|
|
|
|
# testing filtered replication: insert a bunch of data on shard 1,
|
|
# check we get most of it after a few seconds, wait for binlog server
|
|
# timeout, check we get all of it.
|
|
logging.debug('Inserting lots of data on source shard')
|
|
self._insert_lots(1000)
|
|
logging.debug('Checking 80 percent of data is sent quickly')
|
|
v = self._check_lots_timeout(1000, 80, 5)
|
|
if v != 100:
|
|
# small optimization: only do this check if we don't have all the data
|
|
# already anyway.
|
|
logging.debug('Checking all data goes through eventually')
|
|
self._check_lots_timeout(1000, 100, 20)
|
|
logging.debug('Checking no data was sent the wrong way')
|
|
self._check_lots_not_present(1000)
|
|
self.check_binlog_player_vars(shard_2_master, ['test_keyspace/80-'],
|
|
seconds_behind_master_max=30)
|
|
self.check_binlog_player_vars(shard_3_master, ['test_keyspace/80-'],
|
|
seconds_behind_master_max=30)
|
|
self.check_binlog_server_vars(shard_1_slave1, horizontal=True,
|
|
min_statements=1000, min_transactions=1000)
|
|
|
|
# use vtworker to compare the data (after health-checking the destination
|
|
# rdonly tablets so discovery works)
|
|
utils.run_vtctl(['RunHealthCheck', shard_3_rdonly1.tablet_alias])
|
|
logging.debug('Running vtworker SplitDiff')
|
|
utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
|
|
'--exclude_tables', 'unrelated',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'test_keyspace/c0-'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
|
|
auto_log=True)
|
|
|
|
utils.pause('Good time to test vtworker for diffs')
|
|
|
|
# get status for destination master tablets, make sure we have it all
|
|
self.check_running_binlog_player(shard_2_master, 4000, 2000)
|
|
self.check_running_binlog_player(shard_3_master, 4000, 2000)
|
|
|
|
# start a thread to insert data into shard_1 in the background
|
|
# with current time, and monitor the delay
|
|
insert_thread_1 = InsertThread(shard_1_master, 'insert_low', 1, 10000,
|
|
0x9000000000000000)
|
|
insert_thread_2 = InsertThread(shard_1_master, 'insert_high', 2, 10001,
|
|
0xD000000000000000)
|
|
monitor_thread_1 = MonitorLagThread(shard_2_replica2, 'insert_low', 1)
|
|
monitor_thread_2 = MonitorLagThread(shard_3_replica, 'insert_high', 2)
|
|
|
|
# tests a failover switching serving to a different replica
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_slave2.tablet_alias, 'replica'])
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_slave1.tablet_alias, 'spare'])
|
|
shard_1_slave2.wait_for_vttablet_state('SERVING')
|
|
shard_1_slave1.wait_for_vttablet_state('NOT_SERVING')
|
|
utils.run_vtctl(['RunHealthCheck', shard_1_slave2.tablet_alias])
|
|
|
|
# test data goes through again
|
|
logging.debug('Inserting lots of data on source shard')
|
|
self._insert_lots(1000, base=1000)
|
|
logging.debug('Checking 80 percent of data was sent quickly')
|
|
self._check_lots_timeout(1000, 80, 5, base=1000)
|
|
self.check_binlog_server_vars(shard_1_slave2, horizontal=True,
|
|
min_statements=800, min_transactions=800)
|
|
|
|
# check we can't migrate the master just yet
|
|
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
|
|
expect_fail=True)
|
|
|
|
# check query service is off on master 2 and master 3, as filtered
|
|
# replication is enabled. Even health check that is enabled on
|
|
# master 3 should not interfere (we run it to be sure).
|
|
utils.run_vtctl(['RunHealthCheck', shard_3_master.tablet_alias],
|
|
auto_log=True)
|
|
for master in [shard_2_master, shard_3_master]:
|
|
utils.check_tablet_query_service(self, master, False, False)
|
|
stream_health = utils.run_vtctl_json(['VtTabletStreamHealth',
|
|
'-count', '1',
|
|
master.tablet_alias])
|
|
logging.debug('Got health: %s', str(stream_health))
|
|
self.assertIn('realtime_stats', stream_health)
|
|
self.assertNotIn('serving', stream_health)
|
|
|
|
# check the destination master 3 is healthy, even though its query
|
|
# service is not running (if not healthy this would exception out)
|
|
shard_3_master.get_healthz()
|
|
|
|
# now serve rdonly from the split shards, in test_nj only
|
|
utils.run_vtctl(['MigrateServedTypes', '--cells=test_nj',
|
|
'test_keyspace/80-', 'rdonly'], auto_log=True)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_srv_keyspace('test_ny', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
|
|
utils.check_tablet_query_service(self, shard_1_ny_rdonly, True, False)
|
|
utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)
|
|
|
|
# now serve rdonly from the split shards, everywhere
|
|
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'rdonly'],
|
|
auto_log=True)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_srv_keyspace('test_ny', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_tablet_query_service(self, shard_0_ny_rdonly, True, False)
|
|
utils.check_tablet_query_service(self, shard_1_ny_rdonly, False, True)
|
|
utils.check_tablet_query_service(self, shard_1_rdonly1, False, True)
|
|
|
|
# then serve replica from the split shards
|
|
destination_shards = ['test_keyspace/80-c0', 'test_keyspace/c0-']
|
|
|
|
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
|
|
auto_log=True)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-c0 c0-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_tablet_query_service(self, shard_1_slave2, False, True)
|
|
|
|
# move replica back and forth
|
|
utils.run_vtctl(
|
|
['MigrateServedTypes', '-reverse', 'test_keyspace/80-', 'replica'],
|
|
auto_log=True)
|
|
# After a backwards migration, queryservice should be enabled on
|
|
# source and disabled on destinations
|
|
utils.check_tablet_query_service(self, shard_1_slave2, True, False)
|
|
# Destination tablets would have query service disabled for other
|
|
# reasons than the migration, so check the shard record instead of
|
|
# the tablets directly.
|
|
utils.check_shard_query_services(self, destination_shards,
|
|
topodata_pb2.REPLICA, False)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
|
|
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'replica'],
|
|
auto_log=True)
|
|
# After a forwards migration, queryservice should be disabled on
|
|
# source and enabled on destinations
|
|
utils.check_tablet_query_service(self, shard_1_slave2, False, True)
|
|
# Destination tablets would have query service disabled for other
|
|
# reasons than the migration, so check the shard record instead of
|
|
# the tablets directly
|
|
utils.check_shard_query_services(self, destination_shards,
|
|
topodata_pb2.REPLICA, True)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-c0 c0-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
|
|
# reparent shard_2 to shard_2_replica1, then insert more data and
|
|
# see it flow through still
|
|
utils.run_vtctl(['PlannedReparentShard', 'test_keyspace/80-c0',
|
|
shard_2_replica1.tablet_alias])
|
|
|
|
# update our test variables to point at the new master
|
|
shard_2_master, shard_2_replica1 = shard_2_replica1, shard_2_master
|
|
|
|
logging.debug('Inserting lots of data on source shard after reparenting')
|
|
self._insert_lots(3000, base=2000)
|
|
logging.debug('Checking 80 percent of data was sent fairly quickly')
|
|
self._check_lots_timeout(3000, 80, 10, base=2000)
|
|
|
|
# use vtworker to compare the data again
|
|
logging.debug('Running vtworker SplitDiff')
|
|
utils.run_vtworker(['-cell', 'test_nj', 'SplitDiff',
|
|
'--exclude_tables', 'unrelated',
|
|
'--min_healthy_rdonly_tablets', '1',
|
|
'test_keyspace/c0-'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ChangeSlaveType', shard_1_rdonly1.tablet_alias, 'rdonly'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['ChangeSlaveType', shard_3_rdonly1.tablet_alias, 'rdonly'],
|
|
auto_log=True)
|
|
|
|
# going to migrate the master now, check the delays
|
|
monitor_thread_1.done = True
|
|
monitor_thread_2.done = True
|
|
insert_thread_1.done = True
|
|
insert_thread_2.done = True
|
|
logging.debug('DELAY 1: %s max_lag=%d ms avg_lag=%d ms',
|
|
monitor_thread_1.thread_name,
|
|
monitor_thread_1.max_lag_ms,
|
|
monitor_thread_1.lag_sum_ms / monitor_thread_1.sample_count)
|
|
logging.debug('DELAY 2: %s max_lag=%d ms avg_lag=%d ms',
|
|
monitor_thread_2.thread_name,
|
|
monitor_thread_2.max_lag_ms,
|
|
monitor_thread_2.lag_sum_ms / monitor_thread_2.sample_count)
|
|
|
|
# mock with the SourceShard records to test 'vtctl SourceShardDelete'
|
|
# and 'vtctl SourceShardAdd'
|
|
utils.run_vtctl(['SourceShardDelete', 'test_keyspace/c0-', '0'],
|
|
auto_log=True)
|
|
utils.run_vtctl(['SourceShardAdd', '--key_range=80-',
|
|
'test_keyspace/c0-', '0', 'test_keyspace/80-'],
|
|
auto_log=True)
|
|
|
|
# then serve master from the split shards, make sure the source master's
|
|
# query service is now turned off
|
|
utils.run_vtctl(['MigrateServedTypes', 'test_keyspace/80-', 'master'],
|
|
auto_log=True)
|
|
utils.check_srv_keyspace('test_nj', 'test_keyspace',
|
|
'Partitions(master): -80 80-c0 c0-\n'
|
|
'Partitions(rdonly): -80 80-c0 c0-\n'
|
|
'Partitions(replica): -80 80-c0 c0-\n',
|
|
keyspace_id_type=base_sharding.keyspace_id_type,
|
|
sharding_column_name='custom_ksid_col')
|
|
utils.check_tablet_query_service(self, shard_1_master, False, True)
|
|
|
|
# check the binlog players are gone now
|
|
self.check_no_binlog_player(shard_2_master)
|
|
self.check_no_binlog_player(shard_3_master)
|
|
|
|
# delete the original tablets in the original shard
|
|
tablet.kill_tablets([shard_1_master, shard_1_slave1, shard_1_slave2,
|
|
shard_1_ny_rdonly, shard_1_rdonly1])
|
|
for t in [shard_1_slave1, shard_1_slave2, shard_1_ny_rdonly,
|
|
shard_1_rdonly1]:
|
|
utils.run_vtctl(['DeleteTablet', t.tablet_alias], auto_log=True)
|
|
utils.run_vtctl(['DeleteTablet', '-allow_master',
|
|
shard_1_master.tablet_alias], auto_log=True)
|
|
|
|
# rebuild the serving graph, all mentions of the old shards shoud be gone
|
|
utils.run_vtctl(
|
|
['RebuildKeyspaceGraph', 'test_keyspace'], auto_log=True)
|
|
|
|
# test RemoveShardCell
|
|
utils.run_vtctl(
|
|
['RemoveShardCell', 'test_keyspace/-80', 'test_nj'], auto_log=True,
|
|
expect_fail=True)
|
|
utils.run_vtctl(
|
|
['RemoveShardCell', 'test_keyspace/80-', 'test_nj'], auto_log=True)
|
|
utils.run_vtctl(
|
|
['RemoveShardCell', 'test_keyspace/80-', 'test_ny'], auto_log=True)
|
|
shard = utils.run_vtctl_json(['GetShard', 'test_keyspace/80-'])
|
|
self.assertNotIn('cells', shard)
|
|
|
|
# delete the original shard
|
|
utils.run_vtctl(['DeleteShard', 'test_keyspace/80-'], auto_log=True)
|
|
|
|
# kill everything
|
|
tablet.kill_tablets([shard_0_master, shard_0_replica, shard_0_ny_rdonly,
|
|
shard_2_master, shard_2_replica1, shard_2_replica2,
|
|
shard_2_rdonly1,
|
|
shard_3_master, shard_3_replica, shard_3_rdonly1])
|
|
|
|
if __name__ == '__main__':
|
|
utils.main()
|