Better doc for tablet type, better unit tests.

This commit is contained in:
Alain Jobart 2016-05-09 10:28:37 -07:00
Родитель 6d815ad12d
Коммит 25bc282ba0
5 изменённых файлов: 126 добавлений и 67 удалений

Просмотреть файл

@ -71,16 +71,36 @@ func (KeyspaceIdType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0,
type TabletType int32
const (
TabletType_UNKNOWN TabletType = 0
TabletType_MASTER TabletType = 1
TabletType_REPLICA TabletType = 2
TabletType_RDONLY TabletType = 3
TabletType_BATCH TabletType = 3
TabletType_SPARE TabletType = 4
// UNKNOWN is not a valid value.
TabletType_UNKNOWN TabletType = 0
// MASTER is the master server for the shard. Only MASTER allows DMLs.
TabletType_MASTER TabletType = 1
// REPLICA is a slave type. It is used to serve live traffic.
// A REPLICA can be promoted to MASTER. A demoted MASTER will go to REPLICA.
TabletType_REPLICA TabletType = 2
// RDONLY (old name) / BATCH (new name) is used to serve traffic for
// long-running jobs. It is a separate type from REPLICA so
// long-running queries don't affect web-like traffic.
TabletType_RDONLY TabletType = 3
TabletType_BATCH TabletType = 3
// SPARE is a type of server that cannot serve queries, but is available
// in case an extra server is needed.
TabletType_SPARE TabletType = 4
// EXPERIMENTAL is like SPARE, except it can serve queries. This
// type can be used for usages not planned by Vitess, like online
// export to another storage engine.
TabletType_EXPERIMENTAL TabletType = 5
TabletType_BACKUP TabletType = 6
TabletType_RESTORE TabletType = 7
TabletType_WORKER TabletType = 8
// BACKUP is the type a server goes to when taking a backup. No queries
// can be served in BACKUP mode.
TabletType_BACKUP TabletType = 6
// RESTORE is the type a server uses when restoring a backup, at
// startup time. No queries can be served in RESTORE mode.
TabletType_RESTORE TabletType = 7
// WORKER is the type a server goes into when used by a vtworker
// process to perform an offline action. It is a serving type (as
// the vtworker processes may need queries to run). In this state,
// this tablet is dedicated to the vtworker process that uses it.
TabletType_WORKER TabletType = 8
)
var TabletType_name = map[int32]string{

Просмотреть файл

@ -4,9 +4,16 @@
package tabletmanager
// This file handles the health check. It is enabled by passing a
// target_tablet_type command line parameter. The tablet will then go
// to the target tablet type if healthy, and to 'spare' if not.
// This file handles the health check. It is always enabled in production
// vttablets (but not in vtcombo, and not in unit tests by default).
// If we are unhealthy, we'll stop the query service. In any case,
// we report our replication delay so vtgate's discovery can use this tablet
// or not.
//
// Note: we used to go to SPARE when unhealthy, and back to the target
// tablet type when healthy. Now that we use the discovery module,
// health is handled by clients subscribing to the health stream, so
// we don't need to do that any more.
import (
"flag"
@ -139,13 +146,9 @@ func (agent *ActionAgent) initHealthCheck() {
}
// runHealthCheck takes the action mutex, runs the health check,
// and if we need to change our state, do it.
// If we are the master, we don't change our type, healthy or not.
// If we are not the master, we change to spare if not healthy,
// or to the passed in targetTabletType if healthy.
//
// Note we only update the topo record if we need to, that is if our type or
// health details changed.
// and if we need to change our state, do it. We never change our type,
// just the health we report (so we do not change the topo server at all).
// We do not interact with topo server, we use cached values for everything.
//
// This will not change the BinlogPlayerMap, but if it is not empty,
// we will think we should not be running the query service.
@ -302,8 +305,7 @@ func (agent *ActionAgent) runHealthCheckProtected() {
// terminateHealthChecks is called when we enter lame duck mode.
// We will clean up our state, and set query service to lame duck mode.
// We only do something if we are in targetTabletType state, and then
// we just go to spare.
// We only do something if we are in a serving state, and not a master.
func (agent *ActionAgent) terminateHealthChecks() {
agent.actionMutex.Lock()
defer agent.actionMutex.Unlock()
@ -322,24 +324,16 @@ func (agent *ActionAgent) terminateHealthChecks() {
// Go lameduck for gracePeriod.
// We've already checked above that we're not MASTER.
// Enter new lameduck mode for gracePeriod, then shut down queryservice.
// New lameduck mode means keep accepting queries, but advertise unhealthy.
// After we return from this synchronous OnTermSync hook, servenv may decide
// to wait even longer, for the rest of the time specified by its own
// "-lameduck-period" flag. During that extra period, queryservice will be
// in old lameduck mode, meaning stay alive but reject new queries.
// Enter new lameduck mode for gracePeriod, then shut down
// queryservice. New lameduck mode means keep accepting
// queries, but advertise unhealthy. After we return from
// this synchronous OnTermSync hook, servenv may decide to
// wait even longer, for the rest of the time specified by its
// own "-lameduck-period" flag. During that extra period,
// queryservice will be in old lameduck mode, meaning stay
// alive but reject new queries.
agent.enterLameduck("terminating healthchecks")
agent.broadcastHealth()
time.Sleep(*gracePeriod)
agent.disallowQueries(tablet.Type, "terminating healthchecks")
}
// updateServingGraph refreshes the tablet's endpoints through
// topotools.UpdateTabletEndpoints, but only when targetTabletType is
// reported as part of the serving graph by topo.IsInServingGraph.
// For any other type it does nothing and returns nil. A failure of the
// endpoint update is returned with an "UpdateTabletEndpoints failed" prefix.
func (agent *ActionAgent) updateServingGraph(tablet *topodatapb.Tablet, targetTabletType topodatapb.TabletType) error {
	// Types outside the serving graph have no endpoints to maintain.
	if !topo.IsInServingGraph(targetTabletType) {
		return nil
	}

	err := topotools.UpdateTabletEndpoints(agent.batchCtx, agent.TopoServer, tablet)
	if err != nil {
		return fmt.Errorf("UpdateTabletEndpoints failed: %v", err)
	}
	return nil
}

Просмотреть файл

@ -163,7 +163,7 @@ func TestHealthCheckControlsQueryService(t *testing.T) {
/// Consume the first health broadcast triggered by ActionAgent.Start():
// (REPLICA, NOT_SERVING) goes to (REPLICA, SERVING). And we
// should be serving.
if _, err := expectBroadcastData(agent.QueryServiceControl, 0); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "healthcheck not run yet", 0); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, topodatapb.TabletType_REPLICA); err != nil {
@ -203,7 +203,7 @@ func TestHealthCheckControlsQueryService(t *testing.T) {
if agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType != topodatapb.TabletType_REPLICA {
t.Errorf("invalid tabletserver target: %v", agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 12); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 12); err != nil {
t.Fatal(err)
}
@ -231,7 +231,7 @@ func TestHealthCheckControlsQueryService(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, targetTabletType)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 13); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "tablet is unhealthy", 13); err != nil {
t.Fatal(err)
}
@ -315,7 +315,7 @@ func TestQueryServiceStopped(t *testing.T) {
/// Consume the first health broadcast triggered by ActionAgent.Start():
// (REPLICA, NOT_SERVING) goes to (REPLICA, SERVING). And we
// should be serving.
if _, err := expectBroadcastData(agent.QueryServiceControl, 0); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "healthcheck not run yet", 0); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, topodatapb.TabletType_REPLICA); err != nil {
@ -353,7 +353,7 @@ func TestQueryServiceStopped(t *testing.T) {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, want)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 14); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 14); err != nil {
t.Fatal(err)
}
@ -390,11 +390,7 @@ func TestQueryServiceStopped(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != want {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, want)
}
if bd, err := expectBroadcastData(agent.QueryServiceControl, 15); err == nil {
if bd.RealtimeStats.HealthError != "test cannot start query service" {
t.Errorf("unexpected HealthError: %v", *bd)
}
} else {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "test cannot start query service", 15); err != nil {
t.Fatal(err)
}
// NOTE: No more broadcasts or state changes since SetServingTypeError is set
@ -418,7 +414,7 @@ func TestTabletControl(t *testing.T) {
/// Consume the first health broadcast triggered by ActionAgent.Start():
// (REPLICA, NOT_SERVING) goes to (REPLICA, SERVING). And we
// should be serving.
if _, err := expectBroadcastData(agent.QueryServiceControl, 0); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "healthcheck not run yet", 0); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, topodatapb.TabletType_REPLICA); err != nil {
@ -448,7 +444,7 @@ func TestTabletControl(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, targetTabletType)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 16); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 16); err != nil {
t.Fatal(err)
}
@ -485,7 +481,7 @@ func TestTabletControl(t *testing.T) {
// Consume the health broadcast which was triggered due to the QueryService
// state change from SERVING to NOT_SERVING.
if _, err := expectBroadcastData(agent.QueryServiceControl, 16); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "", 16); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, false, targetTabletType); err != nil {
@ -515,12 +511,12 @@ func TestTabletControl(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, targetTabletType)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 17); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "", 17); err != nil {
t.Fatal(err)
}
// NOTE: No state change here since nothing has changed.
// go unhealthy, check we go to spare and QS is not running
// go unhealthy, check we go to error state and QS is not running
agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy")
agent.HealthReporter.(*fakeHealthCheck).reportReplicationDelay = 18 * time.Second
before = time.Now()
@ -541,7 +537,7 @@ func TestTabletControl(t *testing.T) {
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 18); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "tablet is unhealthy", 18); err != nil {
t.Fatal(err)
}
// NOTE: No state change here since QueryService is already NOT_SERVING.
@ -571,7 +567,7 @@ func TestTabletControl(t *testing.T) {
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 19); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, false, "", 19); err != nil {
t.Fatal(err)
}
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
@ -597,7 +593,7 @@ func TestTabletControl(t *testing.T) {
// QueryService changed back from SERVING to NOT_SERVING since refreshTablet()
// re-read the topology and saw that REPLICA is still not allowed to serve.
if _, err := expectBroadcastData(agent.QueryServiceControl, 19); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 19); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, targetTabletType); err != nil {
@ -627,13 +623,26 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
/// Consume the first health broadcast triggered by ActionAgent.Start():
// (REPLICA, NOT_SERVING) goes to (REPLICA, SERVING). And we
// should be serving.
if _, err := expectBroadcastData(agent.QueryServiceControl, 0); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "healthcheck not run yet", 0); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, topodatapb.TabletType_REPLICA); err != nil {
t.Fatal(err)
}
// Run health check to turn into a healthy replica
agent.HealthReporter.(*fakeHealthCheck).reportReplicationDelay = 12 * time.Second
agent.runHealthCheck()
if !agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should be running")
}
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != topodatapb.TabletType_REPLICA {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, topodatapb.TabletType_REPLICA)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 12); err != nil {
t.Fatal(err)
}
// Run TER to turn us into a proper master, wait for it to finish.
agent.HealthReporter.(*fakeHealthCheck).reportReplicationDelay = 19 * time.Second
if err := agent.RPCWrapLock(ctx, actionnode.TabletActionExternallyReparented, "", "", false, func() error {
@ -659,7 +668,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
}
// Consume the health broadcast (no replication delay as we are master)
if _, err := expectBroadcastData(agent.QueryServiceControl, 0); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 0); err != nil {
t.Fatal(err)
}
if err := expectStateChange(agent.QueryServiceControl, true, targetTabletType); err != nil {
@ -682,7 +691,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, targetTabletType)
}
if _, err := expectBroadcastData(agent.QueryServiceControl, 20); err != nil {
if _, err := expectBroadcastData(agent.QueryServiceControl, true, "", 20); err != nil {
t.Fatal(err)
}
@ -735,7 +744,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
// (MASTER, SERVING) to (MASTER, NOT_SERVING).
// Since we didn't run healthcheck again yet, the broadcast data contains the
// cached replication lag of 20 instead of 21.
if bd, err := expectBroadcastData(agent.QueryServiceControl, 20); err == nil {
if bd, err := expectBroadcastData(agent.QueryServiceControl, false, "", 20); err == nil {
if bd.RealtimeStats.BinlogPlayersCount != 1 {
t.Fatalf("filtered replication must be enabled: %v", bd)
}
@ -762,7 +771,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
if got := agent.QueryServiceControl.(*tabletservermock.Controller).CurrentTarget.TabletType; got != targetTabletType {
t.Errorf("invalid tabletserver target: got = %v, want = %v", got, targetTabletType)
}
if bd, err := expectBroadcastData(agent.QueryServiceControl, 22); err == nil {
if bd, err := expectBroadcastData(agent.QueryServiceControl, false, "", 22); err == nil {
if bd.RealtimeStats.BinlogPlayersCount != 1 {
t.Fatalf("filtered replication must be still running: %v", bd)
}
@ -794,7 +803,7 @@ func TestStateChangeImmediateHealthBroadcast(t *testing.T) {
}
// Since we didn't run healthcheck again yet, the broadcast data contains the
// cached replication lag of 22 instead of 23.
if bd, err := expectBroadcastData(agent.QueryServiceControl, 22); err == nil {
if bd, err := expectBroadcastData(agent.QueryServiceControl, true, "", 22); err == nil {
if bd.RealtimeStats.BinlogPlayersCount != 0 {
t.Fatalf("filtered replication must be disabled now: %v", bd)
}
@ -842,13 +851,16 @@ func TestOldHealthCheck(t *testing.T) {
// expectBroadcastData checks that runHealthCheck() broadcasted the expected
// stats (given the value for secondsBehindMaster).
// Note that it may be necessary to call this function twice when
// runHealthCheck() also calls freshTablet() which might trigger another
// broadcast e.g. because we went from REPLICA to SPARE and into lameduck.
func expectBroadcastData(qsc tabletserver.Controller, secondsBehindMaster uint32) (*tabletservermock.BroadcastData, error) {
func expectBroadcastData(qsc tabletserver.Controller, serving bool, healthError string, secondsBehindMaster uint32) (*tabletservermock.BroadcastData, error) {
bd := <-qsc.(*tabletservermock.Controller).BroadcastData
if got := bd.Serving; got != serving {
return nil, fmt.Errorf("unexpected BroadcastData.Serving, got: %v want: %v with bd: %+v", got, serving, bd)
}
if got := bd.RealtimeStats.HealthError; got != healthError {
return nil, fmt.Errorf("unexpected BroadcastData.HealthError, got: %v want: %v with bd: %+v", got, healthError, bd)
}
if got := bd.RealtimeStats.SecondsBehindMaster; got != secondsBehindMaster {
return nil, fmt.Errorf("unexpected BroadcastData. got: %v want: %v got bd: %+v", got, secondsBehindMaster, bd)
return nil, fmt.Errorf("unexpected BroadcastData.SecondsBehindMaster, got: %v want: %v with bd: %+v", got, secondsBehindMaster, bd)
}
return bd, nil
}

Просмотреть файл

@ -22,6 +22,9 @@ type BroadcastData struct {
// RealtimeStats stores the last broadcast stats.
RealtimeStats querypb.RealtimeStats
// Serving contains the QueryServiceEnabled flag
Serving bool
}
// StateChange stores the state the controller changed to.
@ -146,6 +149,7 @@ func (tqsc *Controller) BroadcastHealth(terTimestamp int64, stats *querypb.Realt
tqsc.BroadcastData <- &BroadcastData{
TERTimestamp: terTimestamp,
RealtimeStats: *stats,
Serving: tqsc.QueryServiceEnabled,
}
}

Просмотреть файл

@ -44,15 +44,44 @@ message TabletAlias {
// TabletType represents the type of a given tablet.
enum TabletType {
option allow_alias = true; // so we can have RDONLY and BATCH co-exist
UNKNOWN = 0; // not a valid value
// UNKNOWN is not a valid value.
UNKNOWN = 0;
// MASTER is the master server for the shard. Only MASTER allows DMLs.
MASTER = 1;
// REPLICA is a slave type. It is used to serve live traffic.
// A REPLICA can be promoted to MASTER. A demoted MASTER will go to REPLICA.
REPLICA = 2;
// RDONLY (old name) / BATCH (new name) is used to serve traffic for
// long-running jobs. It is a separate type from REPLICA so
// long-running queries don't affect web-like traffic.
RDONLY = 3;
BATCH = 3;
// SPARE is a type of server that cannot serve queries, but is available
// in case an extra server is needed.
SPARE = 4;
// EXPERIMENTAL is like SPARE, except it can serve queries. This
// type can be used for usages not planned by Vitess, like online
// export to another storage engine.
EXPERIMENTAL = 5;
// BACKUP is the type a server goes to when taking a backup. No queries
// can be served in BACKUP mode.
BACKUP = 6;
// RESTORE is the type a server uses when restoring a backup, at
// startup time. No queries can be served in RESTORE mode.
RESTORE = 7;
// WORKER is the type a server goes into when used by a vtworker
// process to perform an offline action. It is a serving type (as
// the vtworker processes may need queries to run). In this state,
// this tablet is dedicated to the vtworker process that uses it.
WORKER = 8;
}