diff --git a/go/vt/vtctld/api.go b/go/vt/vtctld/api.go index ced85bd397..968aa9b37c 100644 --- a/go/vt/vtctld/api.go +++ b/go/vt/vtctld/api.go @@ -281,11 +281,12 @@ func initAPI(ctx context.Context, ts topo.Server, actions *ActionRepository, rea // Healthcheck real time status per (cell, keyspace, shard, tablet type). handleCollection("tablet_statuses", func(r *http.Request) (interface{}, error) { targetPath := getItemPath(r.URL.Path) + + // Get the heatmap data based on query parameters. if targetPath == "" { if err := r.ParseForm(); err != nil { return nil, err } - keyspace := r.FormValue("keyspace") cell := r.FormValue("cell") tabletType := r.FormValue("type") diff --git a/go/vt/vtctld/api_test.go b/go/vt/vtctld/api_test.go index 98ad51c63c..5f0136fbb3 100644 --- a/go/vt/vtctld/api_test.go +++ b/go/vt/vtctld/api_test.go @@ -11,11 +11,9 @@ import ( "golang.org/x/net/context" - "github.com/youtube/vitess/go/vt/discovery" "github.com/youtube/vitess/go/vt/wrangler" "github.com/youtube/vitess/go/vt/zktopo/zktestserver" - querypb "github.com/youtube/vitess/go/vt/proto/query" topodatapb "github.com/youtube/vitess/go/vt/proto/topodata" ) @@ -81,29 +79,14 @@ func TestAPI(t *testing.T) { realtimeStats := newRealtimeStatsForTesting() initAPI(ctx, ts, actionRepo, realtimeStats) - target := &querypb.Target{ - Keyspace: "ks1", - Shard: "-80", - TabletType: topodatapb.TabletType_REPLICA, - } - stats := &querypb.RealtimeStats{ - HealthError: "", - SecondsBehindMaster: 2, - BinlogPlayersCount: 0, - CpuUsage: 12.1, - Qps: 5.6, - } - tabletStats := &discovery.TabletStats{ - Key: "key1", - Tablet: &tablet1, - Target: target, - Up: true, - Serving: true, - TabletExternallyReparentedTimestamp: 5, - Stats: stats, - LastError: nil, - } - realtimeStats.tabletStats.StatsUpdate(tabletStats) + ts1 := tabletStats("cell1", "ks1", "-80", topodatapb.TabletType_REPLICA, 100) + ts2 := tabletStats("cell1", "ks1", "80-", topodatapb.TabletType_RDONLY, 200) + ts3 := tabletStats("cell2", "ks1", 
"80-", topodatapb.TabletType_REPLICA, 300) + ts4 := tabletStats("cell2", "ks1", "-80", topodatapb.TabletType_RDONLY, 400) + realtimeStats.StatsUpdate(ts1) + realtimeStats.StatsUpdate(ts2) + realtimeStats.StatsUpdate(ts3) + realtimeStats.StatsUpdate(ts4) // Test cases. table := []struct { @@ -164,9 +147,14 @@ func TestAPI(t *testing.T) { }`}, //Tablet Updates - {"GET", "tablet_statuses/cell1/ks1/-80/REPLICA", `{"100":{"Key":"key1","Tablet":{"alias":{"cell":"cell1","uid":100},"port_map":{"vt":100},"keyspace":"ks1","shard":"-80","key_range":{"end":"gA=="},"type":2},"Name":"","Target":{"keyspace":"ks1","shard":"-80","tablet_type":2},"Up":true,"Serving":true,"TabletExternallyReparentedTimestamp":5,"Stats":{"seconds_behind_master":2,"cpu_usage":12.1,"qps":5.6},"LastError":null}}`}, - {"GET", "tablet_statuses/cell1/ks1/replica", "can't get tablet_statuses: invalid target path: \"cell1/ks1/replica\" expected path: ///"}, - {"GET", "tablet_statuses/cell1/ks1/-80/hello", "can't get tablet_statuses: invalid tablet type: hello"}, + {"GET", "tablet_statuses/?metric=lag&keyspace=ks1&cell=cell1&type=REPLICA", `{ + "Labels":[{"Label":{"Name":"cell1","Rowspan":2},"NestedLabels":[{"Name":"REPLICA","Rowspan":1},{"Name":"RDONLY","Rowspan":1}]}, + {"Label":{"Name":"cell2","Rowspan":2},"NestedLabels":[{"Name":"REPLICA","Rowspan":1},{"Name":"RDONLY","Rowspan":1}]}], + "Data":[[100,-1],[-1,200],[-1,300],[400,-1]], + "Aliases":[[{"cell":"cell1","uid":100},null],[null,{"cell":"cell1","uid":200}],[null,{"cell":"cell2","uid":300}],[{"cell":"cell2","uid":400},null]]}`, + }, + {"GET", "tablet_statuses/lag/cell1/REPLICA", "can't get tablet_statuses: invalid target path: \"lag/cell1/REPLICA\" expected path: ?metric=&keyspace=&cell=&type="}, + {"GET", "tablet_statuses/?metric=lag&keyspace=ks1&cell=cell1&type=hello", "can't get tablet_statuses: invalid tablet type: hello"}, } for _, in := range table { diff --git a/go/vt/vtctld/realtime_status.go b/go/vt/vtctld/realtime_status.go index 
3e55771508..178b805d24 100644 --- a/go/vt/vtctld/realtime_status.go +++ b/go/vt/vtctld/realtime_status.go @@ -24,7 +24,8 @@ func newRealtimeStats(ts topo.Server) (*realtimeStats, error) { // Up=False events for a tablet. hc.SetListener(tabletStatsCache, true) r := &realtimeStats{ - healthCheck: hc, + healthCheck: hc, + tabletStatsCache: tabletStatsCache, } // Get the list of all tablets from all cells and monitor the topology for added or removed tablets with a CellTabletsWatcher. diff --git a/go/vt/vtctld/realtime_status_test.go b/go/vt/vtctld/realtime_status_test.go index 01435faf32..32aeb67426 100644 --- a/go/vt/vtctld/realtime_status_test.go +++ b/go/vt/vtctld/realtime_status_test.go @@ -2,7 +2,6 @@ package vtctld import ( "fmt" - "strconv" "testing" "time" @@ -14,6 +13,7 @@ import ( "github.com/youtube/vitess/go/vt/tabletmanager/tmclient" "github.com/youtube/vitess/go/vt/tabletserver/grpcqueryservice" "github.com/youtube/vitess/go/vt/tabletserver/queryservice/fakes" + "github.com/youtube/vitess/go/vt/topo/topoproto" "github.com/youtube/vitess/go/vt/vttest/fakesqldb" "github.com/youtube/vitess/go/vt/wrangler" "github.com/youtube/vitess/go/vt/wrangler/testlib" @@ -23,125 +23,12 @@ import ( topodatapb "github.com/youtube/vitess/go/vt/proto/topodata" ) -// TestRealtimeStats tests the functionality of the realtimeStats object without using the HealthCheck object. -func TestRealtimeStats(t *testing.T) { - tabletType := topodatapb.TabletType_REPLICA.String() - ctx := context.Background() - cells := []string{"cell1", "cell2"} - ts := zktestserver.New(t, cells) - - // Populate topo. 
- ts.CreateKeyspace(ctx, "ks1", &topodatapb.Keyspace{ShardingColumnName: "shardcol"}) - ts.Impl.CreateShard(ctx, "ks1", "-80", &topodatapb.Shard{ - Cells: cells, - KeyRange: &topodatapb.KeyRange{Start: nil, End: []byte{0x80}}, - }) - ts.Impl.CreateShard(ctx, "ks1", "80-", &topodatapb.Shard{ - Cells: cells, - KeyRange: &topodatapb.KeyRange{Start: []byte{0x80}, End: nil}, - }) - - tablet1 := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, - Keyspace: "ks1", - Shard: "-80", - Type: topodatapb.TabletType_REPLICA, - KeyRange: &topodatapb.KeyRange{Start: nil, End: []byte{0x80}}, - PortMap: map[string]int32{"vt": 100}, - } - ts.CreateTablet(ctx, tablet1) - - tablet2 := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{Cell: "cell2", Uid: 200}, - Keyspace: "ks1", - Shard: "-80", - Type: topodatapb.TabletType_REPLICA, - KeyRange: &topodatapb.KeyRange{Start: nil, End: []byte{0x80}}, - PortMap: map[string]int32{"vt": 200}, - } - ts.CreateTablet(ctx, tablet2) - - realtimeStats := newRealtimeStatsForTesting() - - target := &querypb.Target{ - Keyspace: "ks1", - Shard: "-80", - TabletType: topodatapb.TabletType_REPLICA, - } - - stats1 := &querypb.RealtimeStats{ - HealthError: "", - SecondsBehindMaster: 2, - BinlogPlayersCount: 0, - CpuUsage: 12.1, - Qps: 5.6, - } - - // Test 1: tablet1's stats should be updated with the one received by the HealthCheck object. - want1 := &discovery.TabletStats{ - Tablet: tablet1, - Target: target, - Up: true, - Serving: true, - TabletExternallyReparentedTimestamp: 5, - Stats: stats1, - LastError: nil, - } - realtimeStats.tabletStats.StatsUpdate(want1) - result := realtimeStats.tabletStatuses("cell1", "ks1", "-80", tabletType) - checkResult(t, tablet1.Alias.Uid, result, want1) - - // Test 2: tablet1's stats should be updated with the new one received by the HealthCheck object. 
- stats2 := &querypb.RealtimeStats{ - HealthError: "Unhealthy tablet", - SecondsBehindMaster: 15, - BinlogPlayersCount: 0, - CpuUsage: 56.5, - Qps: 7.9, - } - want2 := &discovery.TabletStats{ - Tablet: tablet1, - Target: target, - Up: true, - Serving: true, - TabletExternallyReparentedTimestamp: 5, - Stats: stats2, - LastError: nil, - } - realtimeStats.tabletStats.StatsUpdate(want2) - result = realtimeStats.tabletStatuses("cell1", "ks1", "-80", tabletType) - checkResult(t, tablet1.Alias.Uid, result, want2) - - // Test 3: tablet2's stats should be updated with the one received by the HealthCheck object, - // leaving tablet1's stats unchanged. - stats3 := &querypb.RealtimeStats{ - HealthError: "Unhealthy tablet", - SecondsBehindMaster: 15, - BinlogPlayersCount: 0, - CpuUsage: 56.5, - Qps: 7.9, - } - want3 := &discovery.TabletStats{ - Tablet: tablet2, - Target: target, - Up: true, - Serving: true, - TabletExternallyReparentedTimestamp: 5, - Stats: stats3, - LastError: nil, - } - realtimeStats.tabletStats.StatsUpdate(want3) - result = realtimeStats.tabletStatuses("cell1", "ks1", "-80", tabletType) - checkResult(t, tablet1.Alias.Uid, result, want2) -} - // TestRealtimeStatsWithQueryService uses fakeTablets and the fakeQueryService to // copy the environment needed for the HealthCheck object. func TestRealtimeStatsWithQueryService(t *testing.T) { // Set up testing keyspace with 2 tablets within 2 cells. keyspace := "ks" shard := "-80" - tabletType := topodatapb.TabletType_REPLICA.String() db := fakesqldb.Register() ts := zktestserver.New(t, []string{"cell1", "cell2"}) wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient()) @@ -179,63 +66,67 @@ func TestRealtimeStatsWithQueryService(t *testing.T) { t.Fatalf("newRealtimeStats error: %v", err) } - // Test 1: tablet1's stats should be updated with the one received by the HealthCheck object. - // Note this also takes into account initialization of the health check module. + // Insert tablet1. 
want := &querypb.RealtimeStats{ SecondsBehindMaster: 1, } - if err := checkStats(realtimeStats, "0", "cell1", keyspace, shard, tabletType, want); err != nil { + if err := checkStats(realtimeStats, t1, want, 1); err != nil { t.Errorf("%v", err) } - // Test 2: tablet1's stats should be updated with the new one received by the HealthCheck object. + // Update tablet1. fqs1.AddHealthResponseWithQPS(2.0) want2 := &querypb.RealtimeStats{ SecondsBehindMaster: 1, Qps: 2.0, } - if err := checkStats(realtimeStats, "0", "cell1", keyspace, shard, tabletType, want2); err != nil { + if err := checkStats(realtimeStats, t1, want2, 1); err != nil { t.Errorf("%v", err) } - // Test 3: tablet2's stats should be updated with the one received by the HealthCheck object, - // leaving tablet1's stats unchanged. + // Insert tablet2. fqs2.AddHealthResponseWithQPS(3.0) want3 := &querypb.RealtimeStats{ SecondsBehindMaster: 1, Qps: 3.0, } - if err := checkStats(realtimeStats, "1", "cell2", keyspace, shard, tabletType, want3); err != nil { + if err := checkStats(realtimeStats, t2, want3, 1); err != nil { t.Errorf("%v", err) } - if err := checkStats(realtimeStats, "0", "cell1", keyspace, shard, tabletType, want2); err != nil { + if err := checkStats(realtimeStats, t1, want2, 1); err != nil { t.Errorf("%v", err) } } -// checkResult checks to see that the TabletStats received are as expected. -func checkResult(t *testing.T, wantedUID uint32, resultMap map[string]*discovery.TabletStats, original *discovery.TabletStats) { - result, ok := resultMap[strconv.FormatUint(uint64(wantedUID), 10)] - if !ok { - t.Errorf("No such tablet in tabletStatsCache") - } - if got, want := result.String(), original.String(); got != want { - t.Errorf("got: %#v, want: %#v", got, want) - } -} - // checkStats ensures that the HealthCheck object received an update and passed // that information to the correct tablet. 
-func checkStats(realtimeStats *realtimeStats, tabletUid, cell, keyspace, shard, tabletType string, want *querypb.RealtimeStats) error { +func checkStats(realtimeStats *realtimeStats, tablet *testlib.FakeTablet, want *querypb.RealtimeStats, wantCellCount int) error { + keyspace := tablet.Tablet.Keyspace + shard := tablet.Tablet.Shard + cell := tablet.Tablet.Alias.Cell + tabletType := tablet.Tablet.Type + tabletAlias := tablet.Tablet.Alias + deadline := time.Now().Add(time.Second * 5) for time.Now().Before(deadline) { - result, ok := (realtimeStats.tabletStatuses(cell, keyspace, shard, tabletType))[tabletUid] + realtimeStats.mu.Lock() + result, ok := realtimeStats.statuses[keyspace][shard][cell][tabletType] if !ok { + realtimeStats.mu.Unlock() continue } - got := result.Stats - if proto.Equal(got, want) { + var got *querypb.RealtimeStats + var gotCellCount int + for _, tabletStat := range result { + if topoproto.TabletAliasEqual(tabletAlias, tabletStat.Tablet.Alias) { + got = tabletStat.Stats + gotCellCount = realtimeStats.tabletCountsByCell[cell] + } + } + realtimeStats.mu.Unlock() + + if proto.Equal(got, want) && gotCellCount == wantCellCount { return nil } time.Sleep(1 * time.Millisecond) @@ -246,9 +137,11 @@ func checkStats(realtimeStats *realtimeStats, tabletUid, cell, keyspace, shard, // newRealtimeStatsForTesting creates a new realtimeStats object without creating a HealthCheck object. 
func newRealtimeStatsForTesting() *realtimeStats { tabletStatsCache := &tabletStatsCache{ - statuses: make(map[string]map[string]*discovery.TabletStats), + statuses: make(map[string]map[string]map[string]map[topodatapb.TabletType][]*discovery.TabletStats), + statusesByAlias: make(map[string]*discovery.TabletStats), + tabletCountsByCell: make(map[string]int), } return &realtimeStats{ - tabletStats: tabletStatsCache, + tabletStatsCache: tabletStatsCache, } } diff --git a/go/vt/vtctld/tablet_stats_cache_test.go b/go/vt/vtctld/tablet_stats_cache_test.go index eff7690c2a..543e56f686 100644 --- a/go/vt/vtctld/tablet_stats_cache_test.go +++ b/go/vt/vtctld/tablet_stats_cache_test.go @@ -19,22 +19,21 @@ func TestStatsUpdate(t *testing.T) { tablet1Stats2 := tabletStats("cell1", "ks1", "-80", topodatapb.TabletType_REPLICA, 200) tablet2Stats1 := tabletStats("cell1", "ks1", "-80", topodatapb.TabletType_REPLICA, 100) - // Test 1: tablet1's stats should be updated with the one received by the HealthCheck object. + // Insert tablet1. tabletStatsCache.StatsUpdate(tablet1Stats1) results1 := tabletStatsCache.statuses["ks1"]["-80"]["cell1"][topodatapb.TabletType_REPLICA] if got, want := results1[0], tablet1Stats1; !reflect.DeepEqual(got, want) { t.Errorf("got: %v, want: %v", got, want) } - // Test 2: tablet1's stats should be updated with the new one received by the HealthCheck object. + // Update tablet1. tabletStatsCache.StatsUpdate(tablet1Stats2) results2 := tabletStatsCache.statuses["ks1"]["-80"]["cell1"][topodatapb.TabletType_REPLICA] if got, want := results2[0], tablet1Stats2; !reflect.DeepEqual(got, want) { t.Errorf("got: %v, want: %v", got, want) } - // Test 3: tablet2's stats should be updated with the one received by the HealthCheck object - // leaving tablet1's stats unchanged. + // Insert tablet2. List of tablets will be re-sorted.
tabletStatsCache.StatsUpdate(tablet2Stats1) results3 := tabletStatsCache.statuses["ks1"]["-80"]["cell1"][topodatapb.TabletType_REPLICA] if got, want := results3[0], tablet2Stats1; !reflect.DeepEqual(got, want) { @@ -46,33 +45,32 @@ func TestStatsUpdate(t *testing.T) { t.Errorf("got: %v, want: %v", got, want) } - // Test 5: Check the list of cells to ensure it has the right count. + // Check tablet count in cell1. if got, want := tabletStatsCache.tabletCountsByCell["cell1"], 2; got != want { t.Errorf("got: %v, want: %v", got, want) } - // Test 5: tablet2 should be removed from all lists upon receiving an update that - // serving status has changed. + // Delete tablet2. tablet2Stats1.Up = false tabletStatsCache.StatsUpdate(tablet2Stats1) results5 := tabletStatsCache.statuses["ks1"]["-80"]["cell1"][topodatapb.TabletType_REPLICA] for _, stat := range results5 { if reflect.DeepEqual(stat, tablet2Stats1) { - t.Errorf("not deleleted from statuses") + t.Errorf("not deleleted from statusesByAliases") } } _, ok := tabletStatsCache.statusesByAlias[tablet2Stats1.Tablet.Alias.String()] if ok { - t.Errorf("not deleted from statuses") + t.Errorf("not deleted from statusesByAliases") } - // Test 6: Check to see that the tablet was removed from cell count. + // Check tablet count in cell1. if got, want := tabletStatsCache.tabletCountsByCell["cell1"], 1; got != want { t.Errorf("got: %v, want: %v", got, want) } - // Test 7: The cell entry was deleted when there are no more entries. + // Delete tablet1. List of known cells should be empty now. tablet1Stats2.Up = false tabletStatsCache.StatsUpdate(tablet1Stats2) _, ok = tabletStatsCache.tabletCountsByCell["cell1"]