From 615dc52d77a6e9a051245ce7840f147ac3b3ebf3 Mon Sep 17 00:00:00 2001 From: Lee Yi Jie Joel Date: Thu, 2 Jan 2020 14:10:30 -0800 Subject: [PATCH] Add are you alive example Signed-off-by: Lee Yi Jie Joel --- examples/are-you-alive/Gopkg.lock | 123 ++++++ examples/are-you-alive/Gopkg.toml | 29 ++ examples/are-you-alive/Makefile | 15 + examples/are-you-alive/README.md | 84 +++++ .../are-you-alive/cmd/are-you-alive/main.go | 352 ++++++++++++++++++ examples/are-you-alive/deploy/README.md | 5 + examples/are-you-alive/docker-compose.yml | 29 ++ examples/are-you-alive/pkg/client/client.go | 347 +++++++++++++++++ examples/are-you-alive/prometheus.yml | 34 ++ examples/are-you-alive/schemas/README.md | 9 + .../schemas/create_test_table.sql | 6 + examples/are-you-alive/schemas/vschema.json | 18 + 12 files changed, 1051 insertions(+) create mode 100644 examples/are-you-alive/Gopkg.lock create mode 100644 examples/are-you-alive/Gopkg.toml create mode 100644 examples/are-you-alive/Makefile create mode 100644 examples/are-you-alive/README.md create mode 100644 examples/are-you-alive/cmd/are-you-alive/main.go create mode 100644 examples/are-you-alive/deploy/README.md create mode 100644 examples/are-you-alive/docker-compose.yml create mode 100644 examples/are-you-alive/pkg/client/client.go create mode 100644 examples/are-you-alive/prometheus.yml create mode 100644 examples/are-you-alive/schemas/README.md create mode 100644 examples/are-you-alive/schemas/create_test_table.sql create mode 100644 examples/are-you-alive/schemas/vschema.json diff --git a/examples/are-you-alive/Gopkg.lock b/examples/are-you-alive/Gopkg.lock new file mode 100644 index 0000000000..06d7652679 --- /dev/null +++ b/examples/are-you-alive/Gopkg.lock @@ -0,0 +1,123 @@ +# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 
+ + +[[projects]] + digest = "1:d6afaeed1502aa28e80a4ed0981d570ad91b2579193404256ce672ed0a609e0d" + name = "github.com/beorn7/perks" + packages = ["quantile"] + pruneopts = "UT" + revision = "4b2b341e8d7715fae06375aa633dbb6e91b3fb46" + version = "v1.0.0" + +[[projects]] + digest = "1:ec6f9bf5e274c833c911923c9193867f3f18788c461f76f05f62bb1510e0ae65" + name = "github.com/go-sql-driver/mysql" + packages = ["."] + pruneopts = "UT" + revision = "72cd26f257d44c1114970e19afddcd812016007e" + version = "v1.4.1" + +[[projects]] + digest = "1:318f1c959a8a740366fce4b1e1eb2fd914036b4af58fbd0a003349b305f118ad" + name = "github.com/golang/protobuf" + packages = ["proto"] + pruneopts = "UT" + revision = "b5d812f8a3706043e23a9cd5babf2e5423744d30" + version = "v1.3.1" + +[[projects]] + digest = "1:31e761d97c76151dde79e9d28964a812c46efc5baee4085b86f68f0c654450de" + name = "github.com/konsorten/go-windows-terminal-sequences" + packages = ["."] + pruneopts = "UT" + revision = "f55edac94c9bbba5d6182a4be46d86a2c9b5b50e" + version = "v1.0.2" + +[[projects]] + digest = "1:ff5ebae34cfbf047d505ee150de27e60570e8c394b3b8fdbb720ff6ac71985fc" + name = "github.com/matttproud/golang_protobuf_extensions" + packages = ["pbutil"] + pruneopts = "UT" + revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" + version = "v1.0.1" + +[[projects]] + digest = "1:1db19cfa69213ff994b19451023750122862f39643a5f37728126e1b5180eaac" + name = "github.com/prometheus/client_golang" + packages = [ + "prometheus", + "prometheus/internal", + "prometheus/promauto", + "prometheus/promhttp", + ] + pruneopts = "UT" + revision = "50c4339db732beb2165735d2cde0bff78eb3c5a5" + version = "v0.9.3" + +[[projects]] + branch = "master" + digest = "1:2d5cd61daa5565187e1d96bae64dbbc6080dacf741448e9629c64fd93203b0d4" + name = "github.com/prometheus/client_model" + packages = ["go"] + pruneopts = "UT" + revision = "fd36f4220a901265f90734c3183c5f0c91daa0b8" + +[[projects]] + digest = 
"1:8dcedf2e8f06c7f94e48267dea0bc0be261fa97b377f3ae3e87843a92a549481" + name = "github.com/prometheus/common" + packages = [ + "expfmt", + "internal/bitbucket.org/ww/goautoneg", + "model", + ] + pruneopts = "UT" + revision = "17f5ca1748182ddf24fc33a5a7caaaf790a52fcc" + version = "v0.4.1" + +[[projects]] + branch = "master" + digest = "1:77b841555ca2ce5a45d7ba8b3eea5bd6e2e50c139d979b9b10f57a30fbd1132c" + name = "github.com/prometheus/procfs" + packages = [ + ".", + "internal/fs", + ] + pruneopts = "UT" + revision = "a7aeb8df3389edaf53efcc319ad0063cb27c3960" + +[[projects]] + digest = "1:04457f9f6f3ffc5fea48e71d62f2ca256637dee0a04d710288e27e05c8b41976" + name = "github.com/sirupsen/logrus" + packages = ["."] + pruneopts = "UT" + revision = "839c75faf7f98a33d445d181f3018b5c3409a45e" + version = "v1.4.2" + +[[projects]] + branch = "master" + digest = "1:9ca267f99487ef450fff0c2a49eaf0788d9ff3562bbaeff19f564d380f84bc6d" + name = "golang.org/x/sys" + packages = ["unix"] + pruneopts = "UT" + revision = "dbbf3f1254d491605cf4a0034ce25d0dc71b0c58" + +[[projects]] + digest = "1:c25289f43ac4a68d88b02245742347c94f1e108c534dda442188015ff80669b3" + name = "google.golang.org/appengine" + packages = ["cloudsql"] + pruneopts = "UT" + revision = "4c25cacc810c02874000e4f7071286a8e96b2515" + version = "v1.6.0" + +[solve-meta] + analyzer-name = "dep" + analyzer-version = 1 + input-imports = [ + "github.com/go-sql-driver/mysql", + "github.com/prometheus/client_golang/prometheus", + "github.com/prometheus/client_golang/prometheus/promauto", + "github.com/prometheus/client_golang/prometheus/promhttp", + "github.com/sirupsen/logrus", + ] + solver-name = "gps-cdcl" + solver-version = 1 diff --git a/examples/are-you-alive/Gopkg.toml b/examples/are-you-alive/Gopkg.toml new file mode 100644 index 0000000000..2765dc95c4 --- /dev/null +++ b/examples/are-you-alive/Gopkg.toml @@ -0,0 +1,29 @@ +# Gopkg.toml example +# +# Refer to https://golang.github.io/dep/docs/Gopkg.toml.html +# for detailed 
Gopkg.toml documentation. +# +# required = ["github.com/user/thing/cmd/thing"] +# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] +# +# [[constraint]] +# name = "github.com/user/project" +# version = "1.0.0" +# +# [[constraint]] +# name = "github.com/user/project2" +# branch = "dev" +# source = "github.com/myfork/project2" +# +# [[override]] +# name = "github.com/x/y" +# version = "2.4.0" +# +# [prune] +# non-go = false +# go-tests = true +# unused-packages = true + +[prune] + go-tests = true + unused-packages = true diff --git a/examples/are-you-alive/Makefile b/examples/are-you-alive/Makefile new file mode 100644 index 0000000000..5461370576 --- /dev/null +++ b/examples/are-you-alive/Makefile @@ -0,0 +1,15 @@ +# We are using the "dev" project in our registry for this +NAME := "us.gcr.io/planetscale-dev/are-you-alive" +TAG := $$(git log -1 --pretty=%H) +IMG := ${NAME}:${TAG} +LATEST := ${NAME}:latest + +.PHONY: build push + +build: + @docker build -f build/release/Dockerfile -t ${IMG} . + @docker tag ${IMG} ${LATEST} + +push: + @docker push ${IMG} + @docker push ${LATEST} diff --git a/examples/are-you-alive/README.md b/examples/are-you-alive/README.md new file mode 100644 index 0000000000..889ba4f0e0 --- /dev/null +++ b/examples/are-you-alive/README.md @@ -0,0 +1,84 @@ +# Are You Alive? + +What does it mean to be alive? + +Well we don't know what it means for you, but we know what it means for our +Cloud Database! + +This project contains a simulated client application that can be used to measure +the health of a Vitess cluster over time. + +## Design + +For now, there is a specific database schema and vschema that you must apply to +the database that you are using for this test. + +This client application: + +1. Hammers the database with random data (not a load test though). +1. 
Measures all the important things: + - Client connection errors + - Write latency + - Read latency from masters + - Read latency from replicas + - Write errors + - Read errors on masters + - Read errors on replicas + - Errors in other operations on masters and replicas (e.g. COUNT) + - Latency on other operations on masters and replicas (e.g. COUNT) + - Data loss (by writing predictable data and testing for that) +1. Reports all these metrics to Prometheus. + +That's it! Keep it as simple and generic as possible, and someday our customers +can use this to test their clusters too! + +## Usage + +First, [initialize your database with the correct schemas](schemas/README.md). + +Run `are-you-alive --help` for usage. You can use the command line flags to +control the dataset size, whether to target reads at masters and replicas, your +mysql connection string, and the rate at which to send requests. + +Example: + +``` +./are-you-alive --mysql_connection_string <your_connection_string> +``` + +Where `<your_connection_string>` points to the database you are trying to test, +and everything else will be set to defaults. + +## Building + +``` +dep ensure -v +go build github.com/planetscale/are-you-alive/cmd/are-you-alive +``` + +## Testing + +First, [install docker compose](https://docs.docker.com/compose/install/) and +make sure it's working. Then run: + +``` +dep ensure -v +docker-compose build +docker-compose up +``` + +This will create a local mysqld and a local prometheus to scrape the app. It +will also start the app with the `--initialize` flag which tells it to +automatically create the test database. You might have to run this twice to +give mysql a chance to do its first initialization. + +After you run docker compose, navigate to `http://localhost:9090` to see +Prometheus and `http://localhost:8080/metrics` to see the raw metrics being +exported. 
+ +## Push to Registry + +``` +make build +make push +``` diff --git a/examples/are-you-alive/cmd/are-you-alive/main.go b/examples/are-you-alive/cmd/are-you-alive/main.go new file mode 100644 index 0000000000..5433f4d9bf --- /dev/null +++ b/examples/are-you-alive/cmd/are-you-alive/main.go @@ -0,0 +1,352 @@ +package main + +import ( + "database/sql" + "flag" + "fmt" + "github.com/go-sql-driver/mysql" + "github.com/planetscale/are-you-alive/pkg/client" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" + "math/rand" + "net/http" + "os" + "os/signal" + "sync" + "time" +) + +/* +* To measure data loss, we need predictable writes. We do this with "minPage" +* and "maxPage". Once the difference between them is our desired dataset size, +* we can start deleting old records, but we expect to find one record for every +* "page" number between "minPage" and "maxPage". +* +* We don't measure "update loss" with this client right now. + */ + +var ( + maxPage = 0 + minPage = 0 + dataLossEvents = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "are_you_alive_data_loss_events", + Help: "Data loss events", + }, + []string{"database_name"}, + ) +) + +func writeNextRecord(environmentName string, connectionString string) error { + + // 1. Call monitored client + err := client.Write(environmentName, connectionString, maxPage) + if err != nil { + // Check to see if this is a duplicate key error. We've seen this + // sometimes happen, and when it does this client app gets stuck in an + // infinite loop of failure to write a duplicate key. It's possible + // that happens because a write is succesful but something goes wrong + // before the client recieves a response, so the client thinks the write + // failed and does not increment the count. 
+ // + // So when we specifically see a duplicate key error, assume that's what + // happened, bump the count, and move on. + // + // See https://github.com/planetscale/planetscale-operator/issues/1776 + if me, ok := err.(*mysql.MySQLError); ok && me.Number == 1062 { + logrus.WithError(err).Warnf( + "Key '%d' already found, incrementing count", maxPage) + maxPage = maxPage + 1 + return nil + } + + logrus.WithError(err).Error("Error writing record") + return err + } + + // 2. Increment "maxPage" + maxPage = maxPage + 1 + return nil +} + +func readRandomRecord(environmentName string, connectionString string) error { + + // 1. Pick Random Number Between "minPage" and "maxPage" + if minPage == maxPage { + logrus.Warn("Nothing has been inserted yet!") + return nil + } + page := (rand.Int() % (maxPage - minPage)) + minPage + + // 2. Read Record + readID, readMsg, err := client.Read(environmentName, connectionString, page) + if err != nil { + if err == sql.ErrNoRows { + // This races with deletion, but if our page is greater than minPage + // we know that it should be in there. If it's less than minPage + // assume we are just racing the deletion goroutine and ignore the + // error. + if page <= minPage { + return nil + } + // For replicas, there is a chance we are suffering from replication + // lag, so ignore the missing row if we are a replica. + // TODO: Should we attempt to roughly figure out replication lag in + // this client, at least to catch major failures? We could probably + // multiply delay by the difference betwen maxCount and the page we + // are trying to read to figure out how long ago the row we were + // trying to write was written. 
+ if client.ParseTabletType(connectionString) == "replica" || + client.ParseTabletType(connectionString) == "rdonly" { + return nil + } + logrus.WithError(err).WithFields(logrus.Fields{ + "page": page, + "minPage": minPage, + "maxPage": maxPage, + }).Error("Query succeeded but record not found, may mean data loss") + dataLossEvents.With( + prometheus.Labels{"database_name": client.ParseDBName(connectionString)}).Inc() + return err + } + logrus.WithError(err).Error("Error reading record") + return err + } + // Add zero here just so the metric exists for this database, even if it's + // zero. + dataLossEvents.With( + prometheus.Labels{"database_name": client.ParseDBName(connectionString)}).Add(0) + logrus.WithFields(logrus.Fields{ + "readID": readID, + "readMsg": readMsg, + }).Debug("Read row!") + + return nil +} + +func runCount(environmentName string, connectionString string) error { + + // 1. Run Count + count, err := client.Count(environmentName, connectionString) + if err != nil { + logrus.WithError(err).Error("Error counting records") + return err + } + logrus.WithFields(logrus.Fields{ + "count": count, + }).Debug("Counted rows!") + + // 2. Log if COUNT != "minPage" - "maxPage" + return nil +} + +func deleteLastRecordIfNecessary(environmentName string, connectionString string) error { + + // 1. Compare "maxPage" - "minPage" to Desired Dataset Size + if (maxPage - minPage) < *datasetSize { + return nil + } + logrus.WithFields(logrus.Fields{ + "current": maxPage - minPage, + "desired": *datasetSize, + }).Debug("Deleting last record") + + // 2. Delete Record If We Are Above Desired Size + err := client.Delete(environmentName, connectionString, minPage) + if err != nil { + logrus.WithError(err).Error("Error deleting record") + return err + } + + // 3. 
Increment "minPage" + minPage = minPage + 1 + return nil +} + +var ( + mysqlConnectionString = flag.String( + "mysql_connection_string", "", "Connection string for db to test") + prometheusMetricsAddress = flag.String( + "prometheus_metrics_address", ":8080", "Address on which to serve prometheus metrics") + debug = flag.Bool("debug", false, "Enable debug logging") + useVtgate = flag.Bool("vtgate", false, "Using vtgate (for @master and @replica)") + readFromReplica = flag.Bool("replica", false, "Read from replica") + readFromReadOnly = flag.Bool("rdonly", false, "Read from rdonly") + initialize = flag.Bool("initialize", false, "Initialize database (for testing)") + sleepTime = flag.Int("delay", 1*1000*1000*1000, "Delay in nanoseconds between ops") + datasetSize = flag.Int("dataset_size", 10, "Number of total records in database") + environmentName = flag.String("environment_name", "prod", + "Environment the database is deployed in that this client is pointing at") +) + +type runner struct { + connString string + envName string + fn func(string, string) error + errMessage string + sleepTime time.Duration +} + +func (r *runner) run() { + for { + time.Sleep(r.sleepTime) + err := r.fn(r.envName, r.connString) + if err != nil { + logrus.WithError(err).Error(r.errMessage) + } + } +} + +func waitForCtrlC() { + var endWaiter sync.WaitGroup + endWaiter.Add(1) + var signalChannel chan os.Signal + signalChannel = make(chan os.Signal, 1) + signal.Notify(signalChannel, os.Interrupt) + go func() { + <-signalChannel + endWaiter.Done() + }() + endWaiter.Wait() +} + +func runPrometheus() { + http.Handle("/metrics", promhttp.Handler()) + logrus.Fatal(http.ListenAndServe(*prometheusMetricsAddress, nil)) +} + +func main() { + + // 0. 
Handle Arguments + flag.Parse() + if *debug { + logrus.SetLevel(logrus.DebugLevel) + } + logrus.WithFields(logrus.Fields{ + "mysqlConnectionString": *mysqlConnectionString, + "prometheusMetricsAddress": *prometheusMetricsAddress, + "debug": *debug, + }).Debug("Command line arguments") + + connectionString := "" + if *mysqlConnectionString != "" { + connectionString = *mysqlConnectionString + } else if os.Getenv("MYSQL_CONN_STRING") != "" { + connectionString = os.Getenv("MYSQL_CONN_STRING") + } + masterConnectionString := connectionString + replicaConnectionString := connectionString + rdonlyConnectionString := connectionString + // When using vtgate, we want to append @master and @replica to the DSN, but + // this will fail against normal mysql which we're using for testing. See: + // https://vitess.io/docs/user-guides/faq/#how-do-i-choose-between-master-vs-replica-for-queries + if *useVtgate { + // We need to pass interpolateParams when using a vtgate because + // prepare is not supported. + // + // See: + // - https://github.com/go-sql-driver/mysql/blob/master/README.md#interpolateparams + // - https://github.com/src-d/go-mysql-server/issues/428 + // - https://github.com/vitessio/vitess/pull/3862 + masterConnectionString = fmt.Sprintf("%s@master?interpolateParams=true", connectionString) + replicaConnectionString = fmt.Sprintf("%s@replica?interpolateParams=true", connectionString) + rdonlyConnectionString = fmt.Sprintf("%s@rdonly?interpolateParams=true", connectionString) + } + fmt.Println("masterConnectionString:", masterConnectionString) + fmt.Println("replicaConnectionString:", replicaConnectionString) + fmt.Println("rdonlyConnectionString:", rdonlyConnectionString) + + // 1. Set Up Prometheus Metrics + logrus.Info("Prometheus Go") + go runPrometheus() + + // 2. 
Initialize Database + logrus.Info("Initializing database") + // For local testing, does not initialize vschema + if *initialize { + client.InitializeDatabase(*environmentName, masterConnectionString, "are_you_alive_messages") + } + client.WipeTestTable(*environmentName, masterConnectionString, "are_you_alive_messages") + + // 3. Start goroutines to do various things + logrus.Info("Starting client goroutines") + deleter := runner{ + connString: masterConnectionString, + envName: *environmentName, + fn: deleteLastRecordIfNecessary, + errMessage: "Recieved error deleting last record", + sleepTime: time.Duration(*sleepTime), + } + go deleter.run() + writer := runner{ + connString: masterConnectionString, + envName: *environmentName, + fn: writeNextRecord, + errMessage: "Recieved error writing next record", + sleepTime: time.Duration(*sleepTime), + } + go writer.run() + reader := runner{ + connString: masterConnectionString, + envName: *environmentName, + fn: readRandomRecord, + errMessage: "Recieved error reading record", + sleepTime: time.Duration(*sleepTime), + } + go reader.run() + counter := runner{ + connString: masterConnectionString, + envName: *environmentName, + fn: runCount, + errMessage: "Recieved error running count", + sleepTime: time.Duration(*sleepTime), + } + go counter.run() + + // Only bother starting a replica reader/counter if we are using a vtgate + // and actually are asking to do replica reads + if *useVtgate && *readFromReplica { + replicaReader := runner{ + connString: replicaConnectionString, + envName: *environmentName, + fn: readRandomRecord, + errMessage: "Recieved error reading record from replica", + sleepTime: time.Duration(*sleepTime), + } + go replicaReader.run() + replicaRowCounter := runner{ + connString: replicaConnectionString, + envName: *environmentName, + fn: runCount, + errMessage: "Recieved error running count on replica", + sleepTime: time.Duration(*sleepTime), + } + go replicaRowCounter.run() + } + + // Only bother starting 
a rdonly reader/counter if we are using a vtgate and + // actually are asking to do rdonly reads + if *useVtgate && *readFromReadOnly { + replicaReader := runner{ + connString: rdonlyConnectionString, + envName: *environmentName, + fn: readRandomRecord, + errMessage: "Recieved error reading record from rdonly", + sleepTime: time.Duration(*sleepTime), + } + go replicaReader.run() + replicaRowCounter := runner{ + connString: rdonlyConnectionString, + envName: *environmentName, + fn: runCount, + errMessage: "Recieved error running count on rdonly", + sleepTime: time.Duration(*sleepTime), + } + go replicaRowCounter.run() + } + + logrus.Info("Press Ctrl+C to end\n") + waitForCtrlC() + logrus.Info("\n") +} diff --git a/examples/are-you-alive/deploy/README.md b/examples/are-you-alive/deploy/README.md new file mode 100644 index 0000000000..57592ab666 --- /dev/null +++ b/examples/are-you-alive/deploy/README.md @@ -0,0 +1,5 @@ +# Kubernetes Deployment + +Configuration for deploying `are-you-alive` on Kubernetes. + +Also deploys Prometheus and Alertmanager. 
diff --git a/examples/are-you-alive/docker-compose.yml b/examples/are-you-alive/docker-compose.yml new file mode 100644 index 0000000000..a8203b09f5 --- /dev/null +++ b/examples/are-you-alive/docker-compose.yml @@ -0,0 +1,29 @@ +# https://www.firehydrant.io/blog/developer-a-go-app-with-docker-compose/ +version: '3' +services: + app: + build: build/dev + image: are-you-alive-dev + volumes: + - .:/go/src/github.com/planetscale/are-you-alive + working_dir: /go/src/github.com/planetscale/are-you-alive + environment: + MYSQL_CONN_STRING: root:mysql@tcp(mysql)/testfixture + depends_on: + - mysql + ports: + - 8080:8080 + prom: + image: quay.io/prometheus/prometheus:v2.0.0 + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: "--config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus" + ports: + - 9090:9090 + depends_on: + - app + mysql: + image: mysql:5.7 + environment: + MYSQL_DATABASE: testfixture + MYSQL_ROOT_PASSWORD: mysql diff --git a/examples/are-you-alive/pkg/client/client.go b/examples/are-you-alive/pkg/client/client.go new file mode 100644 index 0000000000..e4f35d3e88 --- /dev/null +++ b/examples/are-you-alive/pkg/client/client.go @@ -0,0 +1,347 @@ +package client + +import ( + "database/sql" + "fmt" + mysql "github.com/go-sql-driver/mysql" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/sirupsen/logrus" + "strings" +) + +/* + * This package is meant to provide a client that includes prometheus metrics + * for common database issues. 
+ */ + +var ( + defaultBuckets = []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10} + countErrorLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_count_error_latency_seconds", + Help: "Latency to recieve a count error", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) + readErrorLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_read_error_latency_seconds", + Help: "Latency to recieve a read error", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) + deleteErrorLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_delete_error_latency_seconds", + Help: "Latency to recieve a delete error", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name"}, + ) + writeErrorLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_write_error_latency_seconds", + Help: "Latency to recieve a write error", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name"}, + ) + connectErrorLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_connect_error_latency_seconds", + Help: "Latency to recieve a connect error", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) + countLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_count_latency_seconds", + Help: "Time it takes to count to the database", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) + readLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_read_latency_seconds", + Help: "Time it takes to read to the database", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) + deleteLatency = 
promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_delete_latency_seconds", + Help: "Time it takes to delete to the database", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name"}, + ) + writeLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_write_latency_seconds", + Help: "Time it takes to write to the database", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name"}, + ) + connectLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "are_you_alive_connect_latency_seconds", + Help: "Time it takes to connect to the database", + Buckets: defaultBuckets, + }, + []string{"environment_name", "database_name", "tablet_type"}, + ) +) + +// ParseDBName extracts the database name from a mysql connection string. +func ParseDBName(connectionString string) string { + mysqlConfig, err := mysql.ParseDSN(connectionString) + if err != nil { + logrus.WithError(err).Fatal("Error parsing DSN!") + } + return mysqlConfig.DBName +} + +// ParseTabletType extracts the tablet type from a vitess specific mysql +// connection string. +// +// See https://vitess.io/docs/faq/queries/ for where these come from. +func ParseTabletType(connectionString string) string { + databaseName := ParseDBName(connectionString) + if strings.HasSuffix(databaseName, "@master") { + return "master" + } else if strings.HasSuffix(databaseName, "@replica") { + return "replica" + } else if strings.HasSuffix(databaseName, "@rdonly") { + return "rdonly" + } else { + return "default" + } +} + +func openDatabase(environmentName string, connectionString string) (*sql.DB, error) { + databaseName := ParseDBName(connectionString) + tabletType := ParseTabletType(connectionString) + // NOTE: This is probably not measuring open connections. I think they + // connections are created/fetched from the pool when an operation is + // actually performed. 
We could force this with a ping probably, but for + // now this is here just as a sanity check that this is actually all + // happening locally. We should just see everything complete within + // milliseconds. + labels := prometheus.Labels{ + "environment_name": environmentName, + "database_name": databaseName, + "tablet_type": tabletType} + connectTimer := prometheus.NewTimer(connectLatency.With(labels)) + connectErrorTimer := prometheus.NewTimer(connectErrorLatency.With(labels)) + db, err := sql.Open("mysql", connectionString) + if err != nil { + logrus.WithError(err).Error("Error connecting to database") + connectErrorTimer.ObserveDuration() + return nil, err + } + connectTimer.ObserveDuration() + return db, nil +} + +// InitializeDatabase will connect to the given connectionString, drop the +// given tableName, and recreate it with the schema that the rest of the client +// expects. This is not something any normal client would do but is convenient +// here because we are just using this client for monitoring. +func InitializeDatabase(environmentName string, connectionString string, tableName string) error { + + // 0. Create logger + log := logrus.WithField("connection_string", connectionString) + + // 1. Open client to database + db, err := openDatabase(environmentName, connectionString) + if err != nil { + log.WithError(err).Error("Error opening database") + return err + } + defer db.Close() + + // 2. Delete test table, but continue if it's not there + if _, err := db.Exec(fmt.Sprintf("DROP TABLE %s", tableName)); err != nil { + log.WithError(err).Warn("Error deleting database") + } + + // 3. 
Create table + createSQL := fmt.Sprintf( + "CREATE TABLE IF NOT EXISTS %s(page INT, message VARCHAR(255) NOT NULL, PRIMARY KEY (page))", tableName) + if _, err := db.Exec(createSQL); err != nil { + log.WithError(err).Error("Error creating database") + return err + } + return nil +} + +// WipeTestTable connects to the database given by connectionString and deletes +// everything in the table given by tableName because this client expects the +// table to be empty. No client would normally do this, but it's convenient for +// testing. +func WipeTestTable(environmentName string, connectionString string, tableName string) error { + + // 0. Create logger + log := logrus.WithField("connection_string", connectionString) + + // 1. Open client to database + db, err := openDatabase(environmentName, connectionString) + if err != nil { + log.WithError(err).Error("Error opening database") + return err + } + defer db.Close() + + // 2. Clear database + if _, err := db.Exec(fmt.Sprintf("DELETE FROM %s", tableName)); err != nil { + log.WithError(err).Warn("Error clearing table") + } + return nil +} + +// Write will write the record given by page to the test table in the database +// referenced by connectionString. +func Write(environmentName string, connectionString string, page int) error { + + // 0. Create logger + log := logrus.WithField("connection_string", connectionString) + + // 1. Open client to database + databaseName := ParseDBName(connectionString) + db, err := openDatabase(environmentName, connectionString) + if err != nil { + log.WithError(err).Error("Error opening database") + return err + } + defer db.Close() + + // 2. 
Write record + labels := prometheus.Labels{ + "environment_name": environmentName, + "database_name": databaseName} + writeTimer := prometheus.NewTimer(writeLatency.With(labels)) + writeErrorTimer := prometheus.NewTimer(writeErrorLatency.With(labels)) + if _, err := db.Exec("INSERT INTO are_you_alive_messages (page, message) VALUES (?, ?)", page, "foo"); err != nil { + log.WithError(err).Error("Error inserting into database") + writeErrorTimer.ObserveDuration() + return err + } + writeTimer.ObserveDuration() + return nil +} + +// Read will read the record given by page from the test table in the database +// referenced by connectionString. +func Read(environmentName string, connectionString string, page int) (int, string, error) { + + // 0. Create logger + log := logrus.WithField("connection_string", connectionString) + + // 1. Open client to database + databaseName := ParseDBName(connectionString) + db, err := openDatabase(environmentName, connectionString) + if err != nil { + log.WithError(err).Error("Error opening database") + return 0, "", err + } + defer db.Close() + + // 2. Read record + tabletType := ParseTabletType(connectionString) + labels := prometheus.Labels{ + "environment_name": environmentName, + "database_name": databaseName, + "tablet_type": tabletType} + readTimer := prometheus.NewTimer(readLatency.With(labels)) + readErrorTimer := prometheus.NewTimer(readErrorLatency.With(labels)) + row := db.QueryRow("SELECT * FROM are_you_alive_messages WHERE page=?", page) + var readID int + var readMsg string + if err := row.Scan(&readID, &readMsg); err != nil { + if err == sql.ErrNoRows { + // If our error is just that we didn't find anything, don't treat + // this as an error or a success. Just return and let the caller + // deal with it so we don't mess up our metrics. 
			// Neither timer is observed on this path (by design, see above).
			return 0, "", err
		}
		// NOTE(review): this log message is misleading — the connection
		// already succeeded; this is a query/scan failure.
		log.WithError(err).Error("Error connecting to database")
		readErrorTimer.ObserveDuration()
		return 0, "", err
	}
	logrus.WithFields(logrus.Fields{
		"readId":  readID,
		"readMsg": readMsg,
	}).Debug("Successfully read row")
	readTimer.ObserveDuration()
	return readID, readMsg, nil
}

// Count will count all the documents in the test table in the database
// referenced by connectionString.
func Count(environmentName string, connectionString string) (int, error) {

	// 0. Create logger carrying the connection string for every message below.
	log := logrus.WithField("connection_string", connectionString)

	// 1. Open client to database
	databaseName := ParseDBName(connectionString)
	db, err := openDatabase(environmentName, connectionString)
	if err != nil {
		log.WithError(err).Error("Error opening database")
		return 0, err
	}
	defer db.Close()

	// 2. Run Count, labeled by tablet type like Read, with separate
	// success/error timers so only the matching outcome is observed.
	tabletType := ParseTabletType(connectionString)
	labels := prometheus.Labels{
		"environment_name": environmentName,
		"database_name":    databaseName,
		"tablet_type":      tabletType}
	countTimer := prometheus.NewTimer(countLatency.With(labels))
	countErrorTimer := prometheus.NewTimer(countErrorLatency.With(labels))
	row := db.QueryRow("SELECT COUNT(*) FROM are_you_alive_messages")
	var count int
	if err := row.Scan(&count); err != nil {
		log.WithError(err).Error("Error running count")
		countErrorTimer.ObserveDuration()
		return 0, err
	}
	logrus.WithFields(logrus.Fields{
		"count": count,
	}).Debug("Successfully ran count")
	countTimer.ObserveDuration()
	return count, nil
}

// Delete will delete the record given by page from the test table in the
// database referenced by connectionString.
func Delete(environmentName string, connectionString string, page int) error {

	// 0. Create logger carrying both the connection string and the page.
	log := logrus.WithFields(logrus.Fields{
		"connection_string": connectionString,
		"page":              page,
	})

	// 1. Open client to database
	databaseName := ParseDBName(connectionString)
	// NOTE(review): unlike Write/Read/Count, the timers here start BEFORE the
	// database is opened, so delete latency includes connection time — confirm
	// whether that asymmetry is intentional.
	labels := prometheus.Labels{
		"environment_name": environmentName,
		"database_name":    databaseName}
	deleteTimer := prometheus.NewTimer(deleteLatency.With(labels))
	deleteErrorTimer := prometheus.NewTimer(deleteErrorLatency.With(labels))
	db, err := openDatabase(environmentName, connectionString)
	if err != nil {
		log.WithError(err).Error("Error opening database")
		deleteErrorTimer.ObserveDuration()
		return err
	}
	defer db.Close()

	// 2. Delete record; only the timer matching the outcome is observed.
	if _, err := db.Exec("DELETE FROM are_you_alive_messages WHERE page=?", page); err != nil {
		log.WithError(err).Error("Error deleting record")
		deleteErrorTimer.ObserveDuration()
		return err
	}
	deleteTimer.ObserveDuration()
	return nil
}
diff --git a/examples/are-you-alive/prometheus.yml b/examples/are-you-alive/prometheus.yml
new file mode 100644
index 0000000000..d968a28136
--- /dev/null
+++ b/examples/are-you-alive/prometheus.yml
@@ -0,0 +1,34 @@
# my global config
global:
  scrape_interval: 5s # By default, scrape targets every 5 seconds.
  evaluation_interval: 5s # By default, evaluate rules every 5 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'local-integration-test'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first.rules"
  # - "second.rules"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
+ + static_configs: + - targets: ['localhost:9090'] + - job_name: "app" + scrape_interval: "5s" + static_configs: + - targets: ['app:8080'] diff --git a/examples/are-you-alive/schemas/README.md b/examples/are-you-alive/schemas/README.md new file mode 100644 index 0000000000..e119ab3350 --- /dev/null +++ b/examples/are-you-alive/schemas/README.md @@ -0,0 +1,9 @@ +# Test Schemas + +MySQL schemas and Vitess schemas that must be applied to your cluster to use +this test helper. They should be applied using `vtctlclient` like this: + +``` +vtctlclient -server "" ApplySchema -sql "$(cat create_test_table.sql)" +vtctlclient -server "" ApplyVSchema -vschema "$(cat vschema.json)" +``` diff --git a/examples/are-you-alive/schemas/create_test_table.sql b/examples/are-you-alive/schemas/create_test_table.sql new file mode 100644 index 0000000000..c382d34352 --- /dev/null +++ b/examples/are-you-alive/schemas/create_test_table.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS are_you_alive_messages ( + page INT, + message VARCHAR(255) NOT NULL, + PRIMARY KEY (page) +) + diff --git a/examples/are-you-alive/schemas/vschema.json b/examples/are-you-alive/schemas/vschema.json new file mode 100644 index 0000000000..bf8df1b6e6 --- /dev/null +++ b/examples/are-you-alive/schemas/vschema.json @@ -0,0 +1,18 @@ +{ + "sharded": true, + "vindexes": { + "hash": { + "type": "hash" + } + }, + "tables": { + "are_you_alive_messages": { + "column_vindexes": [ + { + "column": "page", + "name": "hash" + } + ] + } + } +}